diff --git a/src/gallium/drivers/panfrost/pan_blend_cso.h b/src/gallium/drivers/panfrost/pan_blend_cso.h index dd6c4a58c12..bf3c53d1afe 100644 --- a/src/gallium/drivers/panfrost/pan_blend_cso.h +++ b/src/gallium/drivers/panfrost/pan_blend_cso.h @@ -36,26 +36,26 @@ struct panfrost_bo; struct pan_blend_info { - unsigned constant_mask : 4; - bool fixed_function : 1; - bool enabled : 1; - bool load_dest : 1; - bool opaque : 1; - bool alpha_zero_nop : 1; - bool alpha_one_store : 1; + unsigned constant_mask : 4; + bool fixed_function : 1; + bool enabled : 1; + bool load_dest : 1; + bool opaque : 1; + bool alpha_zero_nop : 1; + bool alpha_one_store : 1; }; struct panfrost_blend_state { - struct pipe_blend_state base; - struct pan_blend_state pan; - struct pan_blend_info info[PIPE_MAX_COLOR_BUFS]; - uint32_t equation[PIPE_MAX_COLOR_BUFS]; + struct pipe_blend_state base; + struct pan_blend_state pan; + struct pan_blend_info info[PIPE_MAX_COLOR_BUFS]; + uint32_t equation[PIPE_MAX_COLOR_BUFS]; - /* info.load presented as a bitfield for draw call hot paths */ - unsigned load_dest_mask : PIPE_MAX_COLOR_BUFS; + /* info.load presented as a bitfield for draw call hot paths */ + unsigned load_dest_mask : PIPE_MAX_COLOR_BUFS; }; -mali_ptr -panfrost_get_blend(struct panfrost_batch *batch, unsigned rt, struct panfrost_bo **bo, unsigned *shader_offset); +mali_ptr panfrost_get_blend(struct panfrost_batch *batch, unsigned rt, + struct panfrost_bo **bo, unsigned *shader_offset); #endif diff --git a/src/gallium/drivers/panfrost/pan_blit.c b/src/gallium/drivers/panfrost/pan_blit.c index 7f059bd4aa4..190bab39574 100644 --- a/src/gallium/drivers/panfrost/pan_blit.c +++ b/src/gallium/drivers/panfrost/pan_blit.c @@ -27,59 +27,58 @@ * */ +#include "util/format/u_format.h" #include "pan_context.h" #include "pan_util.h" -#include "util/format/u_format.h" void panfrost_blitter_save(struct panfrost_context *ctx, bool render_cond) { - struct blitter_context *blitter = ctx->blitter; + struct blitter_context *blitter = ctx->blitter; - util_blitter_save_vertex_buffer_slot(blitter, ctx->vertex_buffers); - util_blitter_save_vertex_elements(blitter, ctx->vertex); - util_blitter_save_vertex_shader(blitter, ctx->uncompiled[PIPE_SHADER_VERTEX]); - util_blitter_save_rasterizer(blitter, ctx->rasterizer); - util_blitter_save_viewport(blitter, &ctx->pipe_viewport); - util_blitter_save_scissor(blitter, &ctx->scissor); - util_blitter_save_fragment_shader(blitter, ctx->uncompiled[PIPE_SHADER_FRAGMENT]); - util_blitter_save_blend(blitter, ctx->blend); - util_blitter_save_depth_stencil_alpha(blitter, ctx->depth_stencil); - util_blitter_save_stencil_ref(blitter, &ctx->stencil_ref); - util_blitter_save_so_targets(blitter, 0, NULL); - util_blitter_save_sample_mask(blitter, ctx->sample_mask, ctx->min_samples); + util_blitter_save_vertex_buffer_slot(blitter, ctx->vertex_buffers); + util_blitter_save_vertex_elements(blitter, ctx->vertex); + util_blitter_save_vertex_shader(blitter, + ctx->uncompiled[PIPE_SHADER_VERTEX]); + util_blitter_save_rasterizer(blitter, ctx->rasterizer); + util_blitter_save_viewport(blitter, &ctx->pipe_viewport); + util_blitter_save_scissor(blitter, &ctx->scissor); + util_blitter_save_fragment_shader(blitter, + ctx->uncompiled[PIPE_SHADER_FRAGMENT]); + util_blitter_save_blend(blitter, ctx->blend); + util_blitter_save_depth_stencil_alpha(blitter, ctx->depth_stencil); + util_blitter_save_stencil_ref(blitter, &ctx->stencil_ref); + util_blitter_save_so_targets(blitter, 0, NULL); + util_blitter_save_sample_mask(blitter, 
ctx->sample_mask, ctx->min_samples); - util_blitter_save_framebuffer(blitter, &ctx->pipe_framebuffer); - util_blitter_save_fragment_sampler_states(blitter, - ctx->sampler_count[PIPE_SHADER_FRAGMENT], - (void **)(&ctx->samplers[PIPE_SHADER_FRAGMENT])); - util_blitter_save_fragment_sampler_views(blitter, - ctx->sampler_view_count[PIPE_SHADER_FRAGMENT], - (struct pipe_sampler_view **)&ctx->sampler_views[PIPE_SHADER_FRAGMENT]); - util_blitter_save_fragment_constant_buffer_slot(blitter, - ctx->constant_buffer[PIPE_SHADER_FRAGMENT].cb); - - if (!render_cond) { - util_blitter_save_render_condition(blitter, - (struct pipe_query *) ctx->cond_query, - ctx->cond_cond, ctx->cond_mode); - } + util_blitter_save_framebuffer(blitter, &ctx->pipe_framebuffer); + util_blitter_save_fragment_sampler_states( + blitter, ctx->sampler_count[PIPE_SHADER_FRAGMENT], + (void **)(&ctx->samplers[PIPE_SHADER_FRAGMENT])); + util_blitter_save_fragment_sampler_views( + blitter, ctx->sampler_view_count[PIPE_SHADER_FRAGMENT], + (struct pipe_sampler_view **)&ctx->sampler_views[PIPE_SHADER_FRAGMENT]); + util_blitter_save_fragment_constant_buffer_slot( + blitter, ctx->constant_buffer[PIPE_SHADER_FRAGMENT].cb); + if (!render_cond) { + util_blitter_save_render_condition(blitter, + (struct pipe_query *)ctx->cond_query, + ctx->cond_cond, ctx->cond_mode); + } } void -panfrost_blit(struct pipe_context *pipe, - const struct pipe_blit_info *info) +panfrost_blit(struct pipe_context *pipe, const struct pipe_blit_info *info) { - struct panfrost_context *ctx = pan_context(pipe); + struct panfrost_context *ctx = pan_context(pipe); - if (info->render_condition_enable && - !panfrost_render_condition_check(ctx)) - return; + if (info->render_condition_enable && !panfrost_render_condition_check(ctx)) + return; - if (!util_blitter_is_blit_supported(ctx->blitter, info)) - unreachable("Unsupported blit\n"); + if (!util_blitter_is_blit_supported(ctx->blitter, info)) + unreachable("Unsupported blit\n"); - panfrost_blitter_save(ctx, info->render_condition_enable); - util_blitter_blit(ctx->blitter, info); + panfrost_blitter_save(ctx, info->render_condition_enable); + util_blitter_blit(ctx->blitter, info); } diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index c082687e969..53534de041b 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -23,158 +23,170 @@ * SOFTWARE. 
*/ -#include "util/macros.h" -#include "util/u_prim.h" -#include "util/u_vbuf.h" -#include "util/u_helpers.h" -#include "util/u_draw.h" -#include "util/u_memory.h" -#include "util/u_viewport.h" +#include "gallium/auxiliary/util/u_blend.h" #include "pipe/p_defines.h" #include "pipe/p_state.h" -#include "gallium/auxiliary/util/u_blend.h" +#include "util/macros.h" +#include "util/u_draw.h" +#include "util/u_helpers.h" +#include "util/u_memory.h" +#include "util/u_prim.h" +#include "util/u_vbuf.h" +#include "util/u_viewport.h" #include "genxml/gen_macros.h" -#include "pan_pool.h" -#include "pan_bo.h" #include "pan_blend.h" +#include "pan_blitter.h" +#include "pan_bo.h" #include "pan_context.h" +#include "pan_indirect_dispatch.h" #include "pan_job.h" +#include "pan_pool.h" #include "pan_shader.h" #include "pan_texture.h" #include "pan_util.h" -#include "pan_indirect_dispatch.h" -#include "pan_blitter.h" #define PAN_GPU_INDIRECTS (PAN_ARCH == 7) struct panfrost_rasterizer { - struct pipe_rasterizer_state base; + struct pipe_rasterizer_state base; #if PAN_ARCH <= 7 - /* Partially packed RSD words */ - struct mali_multisample_misc_packed multisample; - struct mali_stencil_mask_misc_packed stencil_misc; + /* Partially packed RSD words */ + struct mali_multisample_misc_packed multisample; + struct mali_stencil_mask_misc_packed stencil_misc; #endif }; struct panfrost_zsa_state { - struct pipe_depth_stencil_alpha_state base; + struct pipe_depth_stencil_alpha_state base; - /* Is any depth, stencil, or alpha testing enabled? */ - bool enabled; + /* Is any depth, stencil, or alpha testing enabled? */ + bool enabled; - /* Does the depth and stencil tests always pass? This ignores write - * masks, we are only interested in whether pixels may be killed. - */ - bool zs_always_passes; + /* Does the depth and stencil tests always pass? This ignores write + * masks, we are only interested in whether pixels may be killed. + */ + bool zs_always_passes; - /* Are depth or stencil writes possible? */ - bool writes_zs; + /* Are depth or stencil writes possible? */ + bool writes_zs; #if PAN_ARCH <= 7 - /* Prepacked words from the RSD */ - struct mali_multisample_misc_packed rsd_depth; - struct mali_stencil_mask_misc_packed rsd_stencil; - struct mali_stencil_packed stencil_front, stencil_back; + /* Prepacked words from the RSD */ + struct mali_multisample_misc_packed rsd_depth; + struct mali_stencil_mask_misc_packed rsd_stencil; + struct mali_stencil_packed stencil_front, stencil_back; #else - /* Depth/stencil descriptor template */ - struct mali_depth_stencil_packed desc; + /* Depth/stencil descriptor template */ + struct mali_depth_stencil_packed desc; #endif }; struct panfrost_sampler_state { - struct pipe_sampler_state base; - struct mali_sampler_packed hw; + struct pipe_sampler_state base; + struct mali_sampler_packed hw; }; /* Misnomer: Sampler view corresponds to textures, not samplers */ struct panfrost_sampler_view { - struct pipe_sampler_view base; - struct panfrost_pool_ref state; - struct mali_texture_packed bifrost_descriptor; - mali_ptr texture_bo; - uint64_t modifier; + struct pipe_sampler_view base; + struct panfrost_pool_ref state; + struct mali_texture_packed bifrost_descriptor; + mali_ptr texture_bo; + uint64_t modifier; - /* Pool used to allocate the descriptor. If NULL, defaults to the global - * descriptor pool. Can be set for short lived descriptors, useful for - * shader images on Valhall. - */ - struct panfrost_pool *pool; + /* Pool used to allocate the descriptor. 
If NULL, defaults to the global + * descriptor pool. Can be set for short lived descriptors, useful for + * shader images on Valhall. + */ + struct panfrost_pool *pool; }; struct panfrost_vertex_state { - unsigned num_elements; - struct pipe_vertex_element pipe[PIPE_MAX_ATTRIBS]; + unsigned num_elements; + struct pipe_vertex_element pipe[PIPE_MAX_ATTRIBS]; #if PAN_ARCH >= 9 - /* Packed attribute descriptor. All fields are set at CSO create time - * except for stride, which must be ORed in at draw time - */ - struct mali_attribute_packed attributes[PIPE_MAX_ATTRIBS]; + /* Packed attribute descriptor. All fields are set at CSO create time + * except for stride, which must be ORed in at draw time + */ + struct mali_attribute_packed attributes[PIPE_MAX_ATTRIBS]; #else - /* buffers corresponds to attribute buffer, element_buffers corresponds - * to an index in buffers for each vertex element */ - struct pan_vertex_buffer buffers[PIPE_MAX_ATTRIBS]; - unsigned element_buffer[PIPE_MAX_ATTRIBS]; - unsigned nr_bufs; + /* buffers corresponds to attribute buffer, element_buffers corresponds + * to an index in buffers for each vertex element */ + struct pan_vertex_buffer buffers[PIPE_MAX_ATTRIBS]; + unsigned element_buffer[PIPE_MAX_ATTRIBS]; + unsigned nr_bufs; - unsigned formats[PIPE_MAX_ATTRIBS]; + unsigned formats[PIPE_MAX_ATTRIBS]; #endif }; /* Statically assert that PIPE_* enums match the hardware enums. * (As long as they match, we don't need to translate them.) */ -static_assert((int)PIPE_FUNC_NEVER == MALI_FUNC_NEVER, "must match"); -static_assert((int)PIPE_FUNC_LESS == MALI_FUNC_LESS, "must match"); -static_assert((int)PIPE_FUNC_EQUAL == MALI_FUNC_EQUAL, "must match"); -static_assert((int)PIPE_FUNC_LEQUAL == MALI_FUNC_LEQUAL, "must match"); -static_assert((int)PIPE_FUNC_GREATER == MALI_FUNC_GREATER, "must match"); +static_assert((int)PIPE_FUNC_NEVER == MALI_FUNC_NEVER, "must match"); +static_assert((int)PIPE_FUNC_LESS == MALI_FUNC_LESS, "must match"); +static_assert((int)PIPE_FUNC_EQUAL == MALI_FUNC_EQUAL, "must match"); +static_assert((int)PIPE_FUNC_LEQUAL == MALI_FUNC_LEQUAL, "must match"); +static_assert((int)PIPE_FUNC_GREATER == MALI_FUNC_GREATER, "must match"); static_assert((int)PIPE_FUNC_NOTEQUAL == MALI_FUNC_NOT_EQUAL, "must match"); -static_assert((int)PIPE_FUNC_GEQUAL == MALI_FUNC_GEQUAL, "must match"); -static_assert((int)PIPE_FUNC_ALWAYS == MALI_FUNC_ALWAYS, "must match"); +static_assert((int)PIPE_FUNC_GEQUAL == MALI_FUNC_GEQUAL, "must match"); +static_assert((int)PIPE_FUNC_ALWAYS == MALI_FUNC_ALWAYS, "must match"); static inline enum mali_sample_pattern panfrost_sample_pattern(unsigned samples) { - switch (samples) { - case 1: return MALI_SAMPLE_PATTERN_SINGLE_SAMPLED; - case 4: return MALI_SAMPLE_PATTERN_ROTATED_4X_GRID; - case 8: return MALI_SAMPLE_PATTERN_D3D_8X_GRID; - case 16: return MALI_SAMPLE_PATTERN_D3D_16X_GRID; - default: unreachable("Unsupported sample count"); - } + switch (samples) { + case 1: + return MALI_SAMPLE_PATTERN_SINGLE_SAMPLED; + case 4: + return MALI_SAMPLE_PATTERN_ROTATED_4X_GRID; + case 8: + return MALI_SAMPLE_PATTERN_D3D_8X_GRID; + case 16: + return MALI_SAMPLE_PATTERN_D3D_16X_GRID; + default: + unreachable("Unsupported sample count"); + } } static unsigned translate_tex_wrap(enum pipe_tex_wrap w, bool using_nearest) { - /* CLAMP is only supported on Midgard, where it is broken for nearest - * filtering. Use CLAMP_TO_EDGE in that case. - */ + /* CLAMP is only supported on Midgard, where it is broken for nearest + * filtering. 
Use CLAMP_TO_EDGE in that case. + */ - switch (w) { - case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT; - case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE; - case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER; - case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT; - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE; - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER; + switch (w) { + case PIPE_TEX_WRAP_REPEAT: + return MALI_WRAP_MODE_REPEAT; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + return MALI_WRAP_MODE_CLAMP_TO_EDGE; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + return MALI_WRAP_MODE_CLAMP_TO_BORDER; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + return MALI_WRAP_MODE_MIRRORED_REPEAT; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER; #if PAN_ARCH <= 5 - case PIPE_TEX_WRAP_CLAMP: - return using_nearest ? MALI_WRAP_MODE_CLAMP_TO_EDGE : - MALI_WRAP_MODE_CLAMP; - case PIPE_TEX_WRAP_MIRROR_CLAMP: - return using_nearest ? MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE : - MALI_WRAP_MODE_MIRRORED_CLAMP; + case PIPE_TEX_WRAP_CLAMP: + return using_nearest ? MALI_WRAP_MODE_CLAMP_TO_EDGE + : MALI_WRAP_MODE_CLAMP; + case PIPE_TEX_WRAP_MIRROR_CLAMP: + return using_nearest ? MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE + : MALI_WRAP_MODE_MIRRORED_CLAMP; #endif - default: unreachable("Invalid wrap"); - } + default: + unreachable("Invalid wrap"); + } } /* The hardware compares in the wrong order order, so we have to flip before @@ -183,121 +195,123 @@ translate_tex_wrap(enum pipe_tex_wrap w, bool using_nearest) static enum mali_func panfrost_sampler_compare_func(const struct pipe_sampler_state *cso) { - return !cso->compare_mode ? MALI_FUNC_NEVER : - panfrost_flip_compare_func((enum mali_func) cso->compare_func); + return !cso->compare_mode + ? MALI_FUNC_NEVER + : panfrost_flip_compare_func((enum mali_func)cso->compare_func); } static enum mali_mipmap_mode pan_pipe_to_mipmode(enum pipe_tex_mipfilter f) { - switch (f) { - case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST; - case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR; + switch (f) { + case PIPE_TEX_MIPFILTER_NEAREST: + return MALI_MIPMAP_MODE_NEAREST; + case PIPE_TEX_MIPFILTER_LINEAR: + return MALI_MIPMAP_MODE_TRILINEAR; #if PAN_ARCH >= 6 - case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE; + case PIPE_TEX_MIPFILTER_NONE: + return MALI_MIPMAP_MODE_NONE; #else - case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NEAREST; + case PIPE_TEX_MIPFILTER_NONE: + return MALI_MIPMAP_MODE_NEAREST; #endif - default: unreachable("Invalid"); - } + default: + unreachable("Invalid"); + } } - static void * -panfrost_create_sampler_state( - struct pipe_context *pctx, - const struct pipe_sampler_state *cso) +panfrost_create_sampler_state(struct pipe_context *pctx, + const struct pipe_sampler_state *cso) { - struct panfrost_sampler_state *so = CALLOC_STRUCT(panfrost_sampler_state); - so->base = *cso; + struct panfrost_sampler_state *so = CALLOC_STRUCT(panfrost_sampler_state); + so->base = *cso; #if PAN_ARCH == 7 - /* On v7, pan_texture.c composes the API swizzle with a bijective - * swizzle derived from the format, to allow more formats than the - * hardware otherwise supports. When packing border colours, we need to - * undo this bijection, by swizzling with its inverse. 
- */ - unsigned mali_format = panfrost_pipe_format_v7[cso->border_color_format].hw; - enum mali_rgb_component_order order = mali_format & BITFIELD_MASK(12); + /* On v7, pan_texture.c composes the API swizzle with a bijective + * swizzle derived from the format, to allow more formats than the + * hardware otherwise supports. When packing border colours, we need to + * undo this bijection, by swizzling with its inverse. + */ + unsigned mali_format = panfrost_pipe_format_v7[cso->border_color_format].hw; + enum mali_rgb_component_order order = mali_format & BITFIELD_MASK(12); - unsigned char inverted_swizzle[4]; - panfrost_invert_swizzle(GENX(pan_decompose_swizzle)(order).post, - inverted_swizzle); + unsigned char inverted_swizzle[4]; + panfrost_invert_swizzle(GENX(pan_decompose_swizzle)(order).post, + inverted_swizzle); - util_format_apply_color_swizzle(&so->base.border_color, - &cso->border_color, - inverted_swizzle, - false /* is_integer (irrelevant) */); + util_format_apply_color_swizzle(&so->base.border_color, &cso->border_color, + inverted_swizzle, + false /* is_integer (irrelevant) */); #endif - bool using_nearest = cso->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST; + bool using_nearest = cso->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST; - pan_pack(&so->hw, SAMPLER, cfg) { - cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST; - cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST; + pan_pack(&so->hw, SAMPLER, cfg) { + cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST; + cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST; - cfg.normalized_coordinates = !cso->unnormalized_coords; - cfg.lod_bias = FIXED_16(cso->lod_bias, true); - cfg.minimum_lod = FIXED_16(cso->min_lod, false); - cfg.maximum_lod = FIXED_16(cso->max_lod, false); + cfg.normalized_coordinates = !cso->unnormalized_coords; + cfg.lod_bias = FIXED_16(cso->lod_bias, true); + cfg.minimum_lod = FIXED_16(cso->min_lod, false); + cfg.maximum_lod = FIXED_16(cso->max_lod, false); - cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s, using_nearest); - cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t, using_nearest); - cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r, using_nearest); + cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s, using_nearest); + cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t, using_nearest); + cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r, using_nearest); - cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter); - cfg.compare_function = panfrost_sampler_compare_func(cso); - cfg.seamless_cube_map = cso->seamless_cube_map; + cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter); + cfg.compare_function = panfrost_sampler_compare_func(cso); + cfg.seamless_cube_map = cso->seamless_cube_map; - cfg.border_color_r = so->base.border_color.ui[0]; - cfg.border_color_g = so->base.border_color.ui[1]; - cfg.border_color_b = so->base.border_color.ui[2]; - cfg.border_color_a = so->base.border_color.ui[3]; + cfg.border_color_r = so->base.border_color.ui[0]; + cfg.border_color_g = so->base.border_color.ui[1]; + cfg.border_color_b = so->base.border_color.ui[2]; + cfg.border_color_a = so->base.border_color.ui[3]; #if PAN_ARCH >= 6 - if (cso->max_anisotropy > 1) { - cfg.maximum_anisotropy = cso->max_anisotropy; - cfg.lod_algorithm = MALI_LOD_ALGORITHM_ANISOTROPIC; - } + if (cso->max_anisotropy > 1) { + cfg.maximum_anisotropy = cso->max_anisotropy; + cfg.lod_algorithm = MALI_LOD_ALGORITHM_ANISOTROPIC; + } #else - /* Emulate disabled mipmapping by clamping 
the LOD as tight as - * possible (from 0 to epsilon = 1/256) */ - if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) - cfg.maximum_lod = cfg.minimum_lod + 1; + /* Emulate disabled mipmapping by clamping the LOD as tight as + * possible (from 0 to epsilon = 1/256) */ + if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) + cfg.maximum_lod = cfg.minimum_lod + 1; #endif - } + } - return so; + return so; } static bool -panfrost_fs_required( - struct panfrost_compiled_shader *fs, - struct panfrost_blend_state *blend, - struct pipe_framebuffer_state *state, - const struct panfrost_zsa_state *zsa) +panfrost_fs_required(struct panfrost_compiled_shader *fs, + struct panfrost_blend_state *blend, + struct pipe_framebuffer_state *state, + const struct panfrost_zsa_state *zsa) { - /* If we generally have side effects. This inclues use of discard, - * which can affect the results of an occlusion query. */ - if (fs->info.fs.sidefx) - return true; + /* If we generally have side effects. This inclues use of discard, + * which can affect the results of an occlusion query. */ + if (fs->info.fs.sidefx) + return true; - /* Using an empty FS requires early-z to be enabled, but alpha test - * needs it disabled. Alpha test is only native on Midgard, so only - * check there. - */ - if (PAN_ARCH <= 5 && zsa->base.alpha_func != PIPE_FUNC_ALWAYS) - return true; + /* Using an empty FS requires early-z to be enabled, but alpha test + * needs it disabled. Alpha test is only native on Midgard, so only + * check there. + */ + if (PAN_ARCH <= 5 && zsa->base.alpha_func != PIPE_FUNC_ALWAYS) + return true; - /* If colour is written we need to execute */ - for (unsigned i = 0; i < state->nr_cbufs; ++i) { - if (state->cbufs[i] && blend->info[i].enabled) - return true; - } + /* If colour is written we need to execute */ + for (unsigned i = 0; i < state->nr_cbufs; ++i) { + if (state->cbufs[i] && blend->info[i].enabled) + return true; + } - /* If depth is written and not implied we need to execute. - * TODO: Predicate on Z/S writes being enabled */ - return (fs->info.fs.writes_depth || fs->info.fs.writes_stencil); + /* If depth is written and not implied we need to execute. + * TODO: Predicate on Z/S writes being enabled */ + return (fs->info.fs.writes_depth || fs->info.fs.writes_stencil); } /* Get pointers to the blend shaders bound to each active render target. 
Used @@ -308,34 +322,34 @@ static void panfrost_get_blend_shaders(struct panfrost_batch *batch, mali_ptr *blend_shaders) { - unsigned shader_offset = 0; - struct panfrost_bo *shader_bo = NULL; + unsigned shader_offset = 0; + struct panfrost_bo *shader_bo = NULL; - for (unsigned c = 0; c < batch->key.nr_cbufs; ++c) { - if (batch->key.cbufs[c]) { - blend_shaders[c] = panfrost_get_blend(batch, - c, &shader_bo, &shader_offset); - } - } + for (unsigned c = 0; c < batch->key.nr_cbufs; ++c) { + if (batch->key.cbufs[c]) { + blend_shaders[c] = + panfrost_get_blend(batch, c, &shader_bo, &shader_offset); + } + } - if (shader_bo) - perf_debug_ctx(batch->ctx, "Blend shader use"); + if (shader_bo) + perf_debug_ctx(batch->ctx, "Blend shader use"); } #if PAN_ARCH >= 5 UNUSED static uint16_t pack_blend_constant(enum pipe_format format, float cons) { - const struct util_format_description *format_desc = - util_format_description(format); + const struct util_format_description *format_desc = + util_format_description(format); - unsigned chan_size = 0; + unsigned chan_size = 0; - for (unsigned i = 0; i < format_desc->nr_channels; i++) - chan_size = MAX2(format_desc->channel[0].size, chan_size); + for (unsigned i = 0; i < format_desc->nr_channels; i++) + chan_size = MAX2(format_desc->channel[0].size, chan_size); - uint16_t unorm = (cons * ((1 << chan_size) - 1)); - return unorm << (16 - chan_size); + uint16_t unorm = (cons * ((1 << chan_size) - 1)); + return unorm << (16 - chan_size); } /* @@ -349,163 +363,160 @@ pack_blend_constant(enum pipe_format format, float cons) static bool panfrost_overdraw_alpha(const struct panfrost_context *ctx, bool zero) { - const struct panfrost_blend_state *so = ctx->blend; + const struct panfrost_blend_state *so = ctx->blend; - for (unsigned i = 0; i < ctx->pipe_framebuffer.nr_cbufs; ++i) { - const struct pan_blend_info info = so->info[i]; + for (unsigned i = 0; i < ctx->pipe_framebuffer.nr_cbufs; ++i) { + const struct pan_blend_info info = so->info[i]; - bool enabled = ctx->pipe_framebuffer.cbufs[i] && !info.enabled; - bool flag = zero ? info.alpha_zero_nop : info.alpha_one_store; + bool enabled = ctx->pipe_framebuffer.cbufs[i] && !info.enabled; + bool flag = zero ? 
info.alpha_zero_nop : info.alpha_one_store; - if (enabled && !flag) - return false; - } + if (enabled && !flag) + return false; + } - return true; + return true; } static void -panfrost_emit_blend(struct panfrost_batch *batch, void *rts, mali_ptr *blend_shaders) +panfrost_emit_blend(struct panfrost_batch *batch, void *rts, + mali_ptr *blend_shaders) { - unsigned rt_count = batch->key.nr_cbufs; - struct panfrost_context *ctx = batch->ctx; - const struct panfrost_blend_state *so = ctx->blend; - bool dithered = so->base.dither; + unsigned rt_count = batch->key.nr_cbufs; + struct panfrost_context *ctx = batch->ctx; + const struct panfrost_blend_state *so = ctx->blend; + bool dithered = so->base.dither; - /* Always have at least one render target for depth-only passes */ - for (unsigned i = 0; i < MAX2(rt_count, 1); ++i) { - struct mali_blend_packed *packed = rts + (i * pan_size(BLEND)); + /* Always have at least one render target for depth-only passes */ + for (unsigned i = 0; i < MAX2(rt_count, 1); ++i) { + struct mali_blend_packed *packed = rts + (i * pan_size(BLEND)); - /* Disable blending for unbacked render targets */ - if (rt_count == 0 || !batch->key.cbufs[i] || !so->info[i].enabled) { - pan_pack(rts + i * pan_size(BLEND), BLEND, cfg) { - cfg.enable = false; + /* Disable blending for unbacked render targets */ + if (rt_count == 0 || !batch->key.cbufs[i] || !so->info[i].enabled) { + pan_pack(rts + i * pan_size(BLEND), BLEND, cfg) { + cfg.enable = false; #if PAN_ARCH >= 6 - cfg.internal.mode = MALI_BLEND_MODE_OFF; + cfg.internal.mode = MALI_BLEND_MODE_OFF; #endif - } + } - continue; - } + continue; + } - struct pan_blend_info info = so->info[i]; - enum pipe_format format = batch->key.cbufs[i]->format; - float cons = pan_blend_get_constant(info.constant_mask, - ctx->blend_color.color); + struct pan_blend_info info = so->info[i]; + enum pipe_format format = batch->key.cbufs[i]->format; + float cons = + pan_blend_get_constant(info.constant_mask, ctx->blend_color.color); - /* Word 0: Flags and constant */ - pan_pack(packed, BLEND, cfg) { - cfg.srgb = util_format_is_srgb(format); - cfg.load_destination = info.load_dest; - cfg.round_to_fb_precision = !dithered; - cfg.alpha_to_one = ctx->blend->base.alpha_to_one; + /* Word 0: Flags and constant */ + pan_pack(packed, BLEND, cfg) { + cfg.srgb = util_format_is_srgb(format); + cfg.load_destination = info.load_dest; + cfg.round_to_fb_precision = !dithered; + cfg.alpha_to_one = ctx->blend->base.alpha_to_one; #if PAN_ARCH >= 6 - if (!blend_shaders[i]) - cfg.constant = pack_blend_constant(format, cons); + if (!blend_shaders[i]) + cfg.constant = pack_blend_constant(format, cons); #else - cfg.blend_shader = (blend_shaders[i] != 0); + cfg.blend_shader = (blend_shaders[i] != 0); - if (blend_shaders[i]) - cfg.shader_pc = blend_shaders[i]; - else - cfg.constant = cons; + if (blend_shaders[i]) + cfg.shader_pc = blend_shaders[i]; + else + cfg.constant = cons; #endif - } + } - if (!blend_shaders[i]) { - /* Word 1: Blend Equation */ - STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4); - packed->opaque[PAN_ARCH >= 6 ? 1 : 2] = so->equation[i]; - } + if (!blend_shaders[i]) { + /* Word 1: Blend Equation */ + STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4); + packed->opaque[PAN_ARCH >= 6 ? 
1 : 2] = so->equation[i]; + } #if PAN_ARCH >= 6 - const struct panfrost_device *dev = pan_device(ctx->base.screen); - struct panfrost_compiled_shader *fs = - ctx->prog[PIPE_SHADER_FRAGMENT]; + const struct panfrost_device *dev = pan_device(ctx->base.screen); + struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT]; - /* Words 2 and 3: Internal blend */ - if (blend_shaders[i]) { - /* The blend shader's address needs to be at - * the same top 32 bit as the fragment shader. - * TODO: Ensure that's always the case. - */ - assert(!fs->bin.bo || - (blend_shaders[i] & (0xffffffffull << 32)) == - (fs->bin.gpu & (0xffffffffull << 32))); + /* Words 2 and 3: Internal blend */ + if (blend_shaders[i]) { + /* The blend shader's address needs to be at + * the same top 32 bit as the fragment shader. + * TODO: Ensure that's always the case. + */ + assert(!fs->bin.bo || (blend_shaders[i] & (0xffffffffull << 32)) == + (fs->bin.gpu & (0xffffffffull << 32))); - pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) { - cfg.mode = MALI_BLEND_MODE_SHADER; - cfg.shader.pc = (u32) blend_shaders[i]; + pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) { + cfg.mode = MALI_BLEND_MODE_SHADER; + cfg.shader.pc = (u32)blend_shaders[i]; #if PAN_ARCH <= 7 - unsigned ret_offset = fs->info.bifrost.blend[i].return_offset; - assert(!(ret_offset & 0x7)); + unsigned ret_offset = fs->info.bifrost.blend[i].return_offset; + assert(!(ret_offset & 0x7)); - cfg.shader.return_value = ret_offset ? - fs->bin.gpu + ret_offset : 0; + cfg.shader.return_value = ret_offset ? fs->bin.gpu + ret_offset : 0; #endif - } - } else { - pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) { - cfg.mode = info.opaque ? - MALI_BLEND_MODE_OPAQUE : - MALI_BLEND_MODE_FIXED_FUNCTION; + } + } else { + pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) { + cfg.mode = info.opaque ? 
MALI_BLEND_MODE_OPAQUE + : MALI_BLEND_MODE_FIXED_FUNCTION; - /* If we want the conversion to work properly, - * num_comps must be set to 4 - */ - cfg.fixed_function.num_comps = 4; - cfg.fixed_function.conversion.memory_format = - panfrost_format_to_bifrost_blend(dev, format, dithered); - cfg.fixed_function.rt = i; + /* If we want the conversion to work properly, + * num_comps must be set to 4 + */ + cfg.fixed_function.num_comps = 4; + cfg.fixed_function.conversion.memory_format = + panfrost_format_to_bifrost_blend(dev, format, dithered); + cfg.fixed_function.rt = i; #if PAN_ARCH <= 7 - if (!info.opaque) { - cfg.fixed_function.alpha_zero_nop = info.alpha_zero_nop; - cfg.fixed_function.alpha_one_store = info.alpha_one_store; - } + if (!info.opaque) { + cfg.fixed_function.alpha_zero_nop = info.alpha_zero_nop; + cfg.fixed_function.alpha_one_store = info.alpha_one_store; + } - if (fs->info.fs.untyped_color_outputs) { - cfg.fixed_function.conversion.register_format = - GENX(pan_fixup_blend_type)(fs->info.bifrost.blend[i].type, format); - } else { - cfg.fixed_function.conversion.register_format = - fs->info.bifrost.blend[i].format; - } + if (fs->info.fs.untyped_color_outputs) { + cfg.fixed_function.conversion.register_format = GENX( + pan_fixup_blend_type)(fs->info.bifrost.blend[i].type, format); + } else { + cfg.fixed_function.conversion.register_format = + fs->info.bifrost.blend[i].format; + } #endif - } - } + } + } #endif - } + } } #endif static inline bool -pan_allow_forward_pixel_to_kill(struct panfrost_context *ctx, struct panfrost_compiled_shader *fs) +pan_allow_forward_pixel_to_kill(struct panfrost_context *ctx, + struct panfrost_compiled_shader *fs) { - /* Track if any colour buffer is reused across draws, either - * from reading it directly, or from failing to write it - */ - unsigned rt_mask = ctx->fb_rt_mask; - uint64_t rt_written = (fs->info.outputs_written >> FRAG_RESULT_DATA0); - bool blend_reads_dest = (ctx->blend->load_dest_mask & rt_mask); - bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage; + /* Track if any colour buffer is reused across draws, either + * from reading it directly, or from failing to write it + */ + unsigned rt_mask = ctx->fb_rt_mask; + uint64_t rt_written = (fs->info.outputs_written >> FRAG_RESULT_DATA0); + bool blend_reads_dest = (ctx->blend->load_dest_mask & rt_mask); + bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage; - return fs->info.fs.can_fpk && - !(rt_mask & ~rt_written) && - !alpha_to_coverage && - !blend_reads_dest; + return fs->info.fs.can_fpk && !(rt_mask & ~rt_written) && + !alpha_to_coverage && !blend_reads_dest; } static mali_ptr -panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader_type stage) +panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, + enum pipe_shader_type stage) { - struct panfrost_compiled_shader *ss = batch->ctx->prog[stage]; + struct panfrost_compiled_shader *ss = batch->ctx->prog[stage]; - panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_VERTEX); - panfrost_batch_add_bo(batch, ss->state.bo, PIPE_SHADER_VERTEX); + panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_VERTEX); + panfrost_batch_add_bo(batch, ss->state.bo, PIPE_SHADER_VERTEX); - return ss->state.gpu; + return ss->state.gpu; } #if PAN_ARCH <= 7 @@ -515,161 +526,159 @@ panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader static void pan_merge_empty_fs(struct mali_renderer_state_packed *rsd) { - struct mali_renderer_state_packed empty_rsd; + struct mali_renderer_state_packed 
empty_rsd; - pan_pack(&empty_rsd, RENDERER_STATE, cfg) { + pan_pack(&empty_rsd, RENDERER_STATE, cfg) { #if PAN_ARCH >= 6 - cfg.properties.shader_modifies_coverage = true; - cfg.properties.allow_forward_pixel_to_kill = true; - cfg.properties.allow_forward_pixel_to_be_killed = true; - cfg.properties.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY; + cfg.properties.shader_modifies_coverage = true; + cfg.properties.allow_forward_pixel_to_kill = true; + cfg.properties.allow_forward_pixel_to_be_killed = true; + cfg.properties.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY; - /* Alpha isn't written so these are vacuous */ - cfg.multisample_misc.overdraw_alpha0 = true; - cfg.multisample_misc.overdraw_alpha1 = true; + /* Alpha isn't written so these are vacuous */ + cfg.multisample_misc.overdraw_alpha0 = true; + cfg.multisample_misc.overdraw_alpha1 = true; #else - cfg.shader.shader = 0x1; - cfg.properties.work_register_count = 1; - cfg.properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION; - cfg.properties.force_early_z = true; + cfg.shader.shader = 0x1; + cfg.properties.work_register_count = 1; + cfg.properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION; + cfg.properties.force_early_z = true; #endif - } + } - pan_merge((*rsd), empty_rsd, RENDERER_STATE); + pan_merge((*rsd), empty_rsd, RENDERER_STATE); } static void -panfrost_prepare_fs_state(struct panfrost_context *ctx, - mali_ptr *blend_shaders, +panfrost_prepare_fs_state(struct panfrost_context *ctx, mali_ptr *blend_shaders, struct mali_renderer_state_packed *rsd) { - struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; - const struct panfrost_zsa_state *zsa = ctx->depth_stencil; - struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT]; - struct panfrost_blend_state *so = ctx->blend; - bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage; - bool msaa = rast->multisample; + struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; + const struct panfrost_zsa_state *zsa = ctx->depth_stencil; + struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT]; + struct panfrost_blend_state *so = ctx->blend; + bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage; + bool msaa = rast->multisample; - unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs; + unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs; - bool has_blend_shader = false; + bool has_blend_shader = false; - for (unsigned c = 0; c < rt_count; ++c) - has_blend_shader |= (blend_shaders[c] != 0); + for (unsigned c = 0; c < rt_count; ++c) + has_blend_shader |= (blend_shaders[c] != 0); - bool has_oq = ctx->occlusion_query && ctx->active_queries; + bool has_oq = ctx->occlusion_query && ctx->active_queries; - pan_pack(rsd, RENDERER_STATE, cfg) { - if (panfrost_fs_required(fs, so, &ctx->pipe_framebuffer, zsa)) { + pan_pack(rsd, RENDERER_STATE, cfg) { + if (panfrost_fs_required(fs, so, &ctx->pipe_framebuffer, zsa)) { #if PAN_ARCH >= 6 - struct pan_earlyzs_state earlyzs = - pan_earlyzs_get(fs->earlyzs, - ctx->depth_stencil->writes_zs || - has_oq, - ctx->blend->base.alpha_to_coverage, - ctx->depth_stencil->zs_always_passes); + struct pan_earlyzs_state earlyzs = pan_earlyzs_get( + fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq, + ctx->blend->base.alpha_to_coverage, + ctx->depth_stencil->zs_always_passes); - cfg.properties.pixel_kill_operation = earlyzs.kill; - cfg.properties.zs_update_operation = earlyzs.update; + cfg.properties.pixel_kill_operation = earlyzs.kill; + cfg.properties.zs_update_operation = earlyzs.update; - 
cfg.properties.allow_forward_pixel_to_kill = - pan_allow_forward_pixel_to_kill(ctx, fs); + cfg.properties.allow_forward_pixel_to_kill = + pan_allow_forward_pixel_to_kill(ctx, fs); #else - cfg.properties.force_early_z = - fs->info.fs.can_early_z && !alpha_to_coverage && - ((enum mali_func) zsa->base.alpha_func == MALI_FUNC_ALWAYS); + cfg.properties.force_early_z = + fs->info.fs.can_early_z && !alpha_to_coverage && + ((enum mali_func)zsa->base.alpha_func == MALI_FUNC_ALWAYS); - /* TODO: Reduce this limit? */ - if (has_blend_shader) - cfg.properties.work_register_count = MAX2(fs->info.work_reg_count, 8); - else - cfg.properties.work_register_count = fs->info.work_reg_count; + /* TODO: Reduce this limit? */ + if (has_blend_shader) + cfg.properties.work_register_count = + MAX2(fs->info.work_reg_count, 8); + else + cfg.properties.work_register_count = fs->info.work_reg_count; - /* Hardware quirks around early-zs forcing without a - * depth buffer. Note this breaks occlusion queries. */ - bool force_ez_with_discard = !zsa->enabled && !has_oq; + /* Hardware quirks around early-zs forcing without a + * depth buffer. Note this breaks occlusion queries. */ + bool force_ez_with_discard = !zsa->enabled && !has_oq; - cfg.properties.shader_reads_tilebuffer = - force_ez_with_discard && fs->info.fs.can_discard; - cfg.properties.shader_contains_discard = - !force_ez_with_discard && fs->info.fs.can_discard; + cfg.properties.shader_reads_tilebuffer = + force_ez_with_discard && fs->info.fs.can_discard; + cfg.properties.shader_contains_discard = + !force_ez_with_discard && fs->info.fs.can_discard; #endif - } + } #if PAN_ARCH == 4 - if (rt_count > 0) { - cfg.multisample_misc.load_destination = so->info[0].load_dest; - cfg.multisample_misc.blend_shader = (blend_shaders[0] != 0); - cfg.stencil_mask_misc.write_enable = so->info[0].enabled; - cfg.stencil_mask_misc.srgb = util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format); - cfg.stencil_mask_misc.dither_disable = !so->base.dither; - cfg.stencil_mask_misc.alpha_to_one = so->base.alpha_to_one; + if (rt_count > 0) { + cfg.multisample_misc.load_destination = so->info[0].load_dest; + cfg.multisample_misc.blend_shader = (blend_shaders[0] != 0); + cfg.stencil_mask_misc.write_enable = so->info[0].enabled; + cfg.stencil_mask_misc.srgb = + util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format); + cfg.stencil_mask_misc.dither_disable = !so->base.dither; + cfg.stencil_mask_misc.alpha_to_one = so->base.alpha_to_one; - if (blend_shaders[0]) { - cfg.blend_shader = blend_shaders[0]; - } else { - cfg.blend_constant = pan_blend_get_constant( - so->info[0].constant_mask, - ctx->blend_color.color); - } - } else { - /* If there is no colour buffer, leaving fields default is - * fine, except for blending which is nonnullable */ - cfg.blend_equation.color_mask = 0xf; - cfg.blend_equation.rgb.a = MALI_BLEND_OPERAND_A_SRC; - cfg.blend_equation.rgb.b = MALI_BLEND_OPERAND_B_SRC; - cfg.blend_equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO; - cfg.blend_equation.alpha.a = MALI_BLEND_OPERAND_A_SRC; - cfg.blend_equation.alpha.b = MALI_BLEND_OPERAND_B_SRC; - cfg.blend_equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO; - } + if (blend_shaders[0]) { + cfg.blend_shader = blend_shaders[0]; + } else { + cfg.blend_constant = pan_blend_get_constant( + so->info[0].constant_mask, ctx->blend_color.color); + } + } else { + /* If there is no colour buffer, leaving fields default is + * fine, except for blending which is nonnullable */ + cfg.blend_equation.color_mask = 0xf; + cfg.blend_equation.rgb.a = 
MALI_BLEND_OPERAND_A_SRC; + cfg.blend_equation.rgb.b = MALI_BLEND_OPERAND_B_SRC; + cfg.blend_equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO; + cfg.blend_equation.alpha.a = MALI_BLEND_OPERAND_A_SRC; + cfg.blend_equation.alpha.b = MALI_BLEND_OPERAND_B_SRC; + cfg.blend_equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO; + } #elif PAN_ARCH == 5 - /* Workaround */ - cfg.legacy_blend_shader = panfrost_last_nonnull(blend_shaders, rt_count); + /* Workaround */ + cfg.legacy_blend_shader = panfrost_last_nonnull(blend_shaders, rt_count); #endif - cfg.multisample_misc.sample_mask = msaa ? ctx->sample_mask : 0xFFFF; + cfg.multisample_misc.sample_mask = msaa ? ctx->sample_mask : 0xFFFF; - cfg.multisample_misc.evaluate_per_sample = - msaa && (ctx->min_samples > 1); + cfg.multisample_misc.evaluate_per_sample = msaa && (ctx->min_samples > 1); #if PAN_ARCH >= 6 - /* MSAA blend shaders need to pass their sample ID to - * LD_TILE/ST_TILE, so we must preload it. Additionally, we - * need per-sample shading for the blend shader, accomplished - * by forcing per-sample shading for the whole program. */ + /* MSAA blend shaders need to pass their sample ID to + * LD_TILE/ST_TILE, so we must preload it. Additionally, we + * need per-sample shading for the blend shader, accomplished + * by forcing per-sample shading for the whole program. */ - if (msaa && has_blend_shader) { - cfg.multisample_misc.evaluate_per_sample = true; - cfg.preload.fragment.sample_mask_id = true; - } + if (msaa && has_blend_shader) { + cfg.multisample_misc.evaluate_per_sample = true; + cfg.preload.fragment.sample_mask_id = true; + } - /* Bifrost does not have native point sprites. Point sprites are - * lowered in the driver to gl_PointCoord reads. This field - * actually controls the orientation of gl_PointCoord. Both - * orientations are controlled with sprite_coord_mode in - * Gallium. - */ - cfg.properties.point_sprite_coord_origin_max_y = - (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT); + /* Bifrost does not have native point sprites. Point sprites are + * lowered in the driver to gl_PointCoord reads. This field + * actually controls the orientation of gl_PointCoord. Both + * orientations are controlled with sprite_coord_mode in + * Gallium. + */ + cfg.properties.point_sprite_coord_origin_max_y = + (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT); - cfg.multisample_misc.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0); - cfg.multisample_misc.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1); + cfg.multisample_misc.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0); + cfg.multisample_misc.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1); #endif - cfg.stencil_mask_misc.alpha_to_coverage = alpha_to_coverage; - cfg.depth_units = rast->offset_units * 2.0f; - cfg.depth_factor = rast->offset_scale; + cfg.stencil_mask_misc.alpha_to_coverage = alpha_to_coverage; + cfg.depth_units = rast->offset_units * 2.0f; + cfg.depth_factor = rast->offset_scale; - bool back_enab = zsa->base.stencil[1].enabled; - cfg.stencil_front.reference_value = ctx->stencil_ref.ref_value[0]; - cfg.stencil_back.reference_value = ctx->stencil_ref.ref_value[back_enab ? 1 : 0]; + bool back_enab = zsa->base.stencil[1].enabled; + cfg.stencil_front.reference_value = ctx->stencil_ref.ref_value[0]; + cfg.stencil_back.reference_value = + ctx->stencil_ref.ref_value[back_enab ? 
1 : 0]; #if PAN_ARCH <= 5 - /* v6+ fits register preload here, no alpha testing */ - cfg.alpha_reference = zsa->base.alpha_ref_value; + /* v6+ fits register preload here, no alpha testing */ + cfg.alpha_reference = zsa->base.alpha_ref_value; #endif - } + } } static void @@ -677,153 +686,152 @@ panfrost_emit_frag_shader(struct panfrost_context *ctx, struct mali_renderer_state_packed *fragmeta, mali_ptr *blend_shaders) { - const struct panfrost_zsa_state *zsa = ctx->depth_stencil; - const struct panfrost_rasterizer *rast = ctx->rasterizer; - struct panfrost_compiled_shader *fs = - ctx->prog[PIPE_SHADER_FRAGMENT]; + const struct panfrost_zsa_state *zsa = ctx->depth_stencil; + const struct panfrost_rasterizer *rast = ctx->rasterizer; + struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT]; - /* We need to merge several several partial renderer state descriptors, - * so stage to temporary storage rather than reading back write-combine - * memory, which will trash performance. */ - struct mali_renderer_state_packed rsd; - panfrost_prepare_fs_state(ctx, blend_shaders, &rsd); + /* We need to merge several several partial renderer state descriptors, + * so stage to temporary storage rather than reading back write-combine + * memory, which will trash performance. */ + struct mali_renderer_state_packed rsd; + panfrost_prepare_fs_state(ctx, blend_shaders, &rsd); #if PAN_ARCH == 4 - if (ctx->pipe_framebuffer.nr_cbufs > 0 && !blend_shaders[0]) { - /* Word 14: SFBD Blend Equation */ - STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4); - rsd.opaque[14] = ctx->blend->equation[0]; - } + if (ctx->pipe_framebuffer.nr_cbufs > 0 && !blend_shaders[0]) { + /* Word 14: SFBD Blend Equation */ + STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4); + rsd.opaque[14] = ctx->blend->equation[0]; + } #endif - /* Merge with CSO state and upload */ - if (panfrost_fs_required(fs, ctx->blend, &ctx->pipe_framebuffer, zsa)) { - struct mali_renderer_state_packed *partial_rsd = - (struct mali_renderer_state_packed *)&fs->partial_rsd; - STATIC_ASSERT(sizeof(fs->partial_rsd) == sizeof(*partial_rsd)); - pan_merge(rsd, *partial_rsd, RENDERER_STATE); - } else { - pan_merge_empty_fs(&rsd); - } + /* Merge with CSO state and upload */ + if (panfrost_fs_required(fs, ctx->blend, &ctx->pipe_framebuffer, zsa)) { + struct mali_renderer_state_packed *partial_rsd = + (struct mali_renderer_state_packed *)&fs->partial_rsd; + STATIC_ASSERT(sizeof(fs->partial_rsd) == sizeof(*partial_rsd)); + pan_merge(rsd, *partial_rsd, RENDERER_STATE); + } else { + pan_merge_empty_fs(&rsd); + } - /* Word 8, 9 Misc state */ - rsd.opaque[8] |= zsa->rsd_depth.opaque[0] - | rast->multisample.opaque[0]; + /* Word 8, 9 Misc state */ + rsd.opaque[8] |= zsa->rsd_depth.opaque[0] | rast->multisample.opaque[0]; - rsd.opaque[9] |= zsa->rsd_stencil.opaque[0] - | rast->stencil_misc.opaque[0]; + rsd.opaque[9] |= zsa->rsd_stencil.opaque[0] | rast->stencil_misc.opaque[0]; - /* Word 10, 11 Stencil Front and Back */ - rsd.opaque[10] |= zsa->stencil_front.opaque[0]; - rsd.opaque[11] |= zsa->stencil_back.opaque[0]; + /* Word 10, 11 Stencil Front and Back */ + rsd.opaque[10] |= zsa->stencil_front.opaque[0]; + rsd.opaque[11] |= zsa->stencil_back.opaque[0]; - memcpy(fragmeta, &rsd, sizeof(rsd)); + memcpy(fragmeta, &rsd, sizeof(rsd)); } static mali_ptr panfrost_emit_frag_shader_meta(struct panfrost_batch *batch) { - struct panfrost_context *ctx = batch->ctx; - struct panfrost_compiled_shader *ss = ctx->prog[PIPE_SHADER_FRAGMENT]; + struct panfrost_context *ctx = batch->ctx; + struct 
panfrost_compiled_shader *ss = ctx->prog[PIPE_SHADER_FRAGMENT]; - panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_FRAGMENT); + panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_FRAGMENT); - struct panfrost_ptr xfer; + struct panfrost_ptr xfer; #if PAN_ARCH == 4 - xfer = pan_pool_alloc_desc(&batch->pool.base, RENDERER_STATE); + xfer = pan_pool_alloc_desc(&batch->pool.base, RENDERER_STATE); #else - unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1); + unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1); - xfer = pan_pool_alloc_desc_aggregate(&batch->pool.base, - PAN_DESC(RENDERER_STATE), - PAN_DESC_ARRAY(rt_count, BLEND)); + xfer = + pan_pool_alloc_desc_aggregate(&batch->pool.base, PAN_DESC(RENDERER_STATE), + PAN_DESC_ARRAY(rt_count, BLEND)); #endif - mali_ptr blend_shaders[PIPE_MAX_COLOR_BUFS] = { 0 }; - panfrost_get_blend_shaders(batch, blend_shaders); + mali_ptr blend_shaders[PIPE_MAX_COLOR_BUFS] = {0}; + panfrost_get_blend_shaders(batch, blend_shaders); - panfrost_emit_frag_shader(ctx, (struct mali_renderer_state_packed *) xfer.cpu, blend_shaders); + panfrost_emit_frag_shader(ctx, (struct mali_renderer_state_packed *)xfer.cpu, + blend_shaders); #if PAN_ARCH >= 5 - panfrost_emit_blend(batch, xfer.cpu + pan_size(RENDERER_STATE), blend_shaders); + panfrost_emit_blend(batch, xfer.cpu + pan_size(RENDERER_STATE), + blend_shaders); #endif - return xfer.gpu; + return xfer.gpu; } #endif static mali_ptr panfrost_emit_viewport(struct panfrost_batch *batch) { - struct panfrost_context *ctx = batch->ctx; - const struct pipe_viewport_state *vp = &ctx->pipe_viewport; - const struct pipe_scissor_state *ss = &ctx->scissor; - const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; + struct panfrost_context *ctx = batch->ctx; + const struct pipe_viewport_state *vp = &ctx->pipe_viewport; + const struct pipe_scissor_state *ss = &ctx->scissor; + const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; - /* Derive min/max from translate/scale. Note since |x| >= 0 by - * definition, we have that -|x| <= |x| hence translate - |scale| <= - * translate + |scale|, so the ordering is correct here. */ - float vp_minx = vp->translate[0] - fabsf(vp->scale[0]); - float vp_maxx = vp->translate[0] + fabsf(vp->scale[0]); - float vp_miny = vp->translate[1] - fabsf(vp->scale[1]); - float vp_maxy = vp->translate[1] + fabsf(vp->scale[1]); + /* Derive min/max from translate/scale. Note since |x| >= 0 by + * definition, we have that -|x| <= |x| hence translate - |scale| <= + * translate + |scale|, so the ordering is correct here. 
*/ + float vp_minx = vp->translate[0] - fabsf(vp->scale[0]); + float vp_maxx = vp->translate[0] + fabsf(vp->scale[0]); + float vp_miny = vp->translate[1] - fabsf(vp->scale[1]); + float vp_maxy = vp->translate[1] + fabsf(vp->scale[1]); - float minz, maxz; - util_viewport_zmin_zmax(vp, rast->clip_halfz, &minz, &maxz); + float minz, maxz; + util_viewport_zmin_zmax(vp, rast->clip_halfz, &minz, &maxz); - /* Scissor to the intersection of viewport and to the scissor, clamped - * to the framebuffer */ + /* Scissor to the intersection of viewport and to the scissor, clamped + * to the framebuffer */ - unsigned minx = MIN2(batch->key.width, MAX2((int) vp_minx, 0)); - unsigned maxx = MIN2(batch->key.width, MAX2((int) vp_maxx, 0)); - unsigned miny = MIN2(batch->key.height, MAX2((int) vp_miny, 0)); - unsigned maxy = MIN2(batch->key.height, MAX2((int) vp_maxy, 0)); + unsigned minx = MIN2(batch->key.width, MAX2((int)vp_minx, 0)); + unsigned maxx = MIN2(batch->key.width, MAX2((int)vp_maxx, 0)); + unsigned miny = MIN2(batch->key.height, MAX2((int)vp_miny, 0)); + unsigned maxy = MIN2(batch->key.height, MAX2((int)vp_maxy, 0)); - if (ss && rast->scissor) { - minx = MAX2(ss->minx, minx); - miny = MAX2(ss->miny, miny); - maxx = MIN2(ss->maxx, maxx); - maxy = MIN2(ss->maxy, maxy); - } + if (ss && rast->scissor) { + minx = MAX2(ss->minx, minx); + miny = MAX2(ss->miny, miny); + maxx = MIN2(ss->maxx, maxx); + maxy = MIN2(ss->maxy, maxy); + } - /* Set the range to [1, 1) so max values don't wrap round */ - if (maxx == 0 || maxy == 0) - maxx = maxy = minx = miny = 1; + /* Set the range to [1, 1) so max values don't wrap round */ + if (maxx == 0 || maxy == 0) + maxx = maxy = minx = miny = 1; - panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy); - batch->scissor_culls_everything = (minx >= maxx || miny >= maxy); + panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy); + batch->scissor_culls_everything = (minx >= maxx || miny >= maxy); - /* [minx, maxx) and [miny, maxy) are exclusive ranges in the hardware */ - maxx--; - maxy--; + /* [minx, maxx) and [miny, maxy) are exclusive ranges in the hardware */ + maxx--; + maxy--; - batch->minimum_z = rast->depth_clip_near ? minz : -INFINITY; - batch->maximum_z = rast->depth_clip_far ? maxz : +INFINITY; + batch->minimum_z = rast->depth_clip_near ? minz : -INFINITY; + batch->maximum_z = rast->depth_clip_far ? 
maxz : +INFINITY; #if PAN_ARCH <= 7 - struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, VIEWPORT); + struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, VIEWPORT); - pan_pack(T.cpu, VIEWPORT, cfg) { - cfg.scissor_minimum_x = minx; - cfg.scissor_minimum_y = miny; - cfg.scissor_maximum_x = maxx; - cfg.scissor_maximum_y = maxy; + pan_pack(T.cpu, VIEWPORT, cfg) { + cfg.scissor_minimum_x = minx; + cfg.scissor_minimum_y = miny; + cfg.scissor_maximum_x = maxx; + cfg.scissor_maximum_y = maxy; - cfg.minimum_z = batch->minimum_z; - cfg.maximum_z = batch->maximum_z; - } + cfg.minimum_z = batch->minimum_z; + cfg.maximum_z = batch->maximum_z; + } - return T.gpu; + return T.gpu; #else - pan_pack(&batch->scissor, SCISSOR, cfg) { - cfg.scissor_minimum_x = minx; - cfg.scissor_minimum_y = miny; - cfg.scissor_maximum_x = maxx; - cfg.scissor_maximum_y = maxy; - } + pan_pack(&batch->scissor, SCISSOR, cfg) { + cfg.scissor_minimum_x = minx; + cfg.scissor_minimum_y = miny; + cfg.scissor_maximum_x = maxx; + cfg.scissor_maximum_y = maxy; + } - return 0; + return 0; #endif } @@ -838,32 +846,33 @@ panfrost_emit_viewport(struct panfrost_batch *batch) static mali_ptr panfrost_emit_depth_stencil(struct panfrost_batch *batch) { - struct panfrost_context *ctx = batch->ctx; - const struct panfrost_zsa_state *zsa = ctx->depth_stencil; - struct panfrost_rasterizer *rast = ctx->rasterizer; - struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT]; - bool back_enab = zsa->base.stencil[1].enabled; + struct panfrost_context *ctx = batch->ctx; + const struct panfrost_zsa_state *zsa = ctx->depth_stencil; + struct panfrost_rasterizer *rast = ctx->rasterizer; + struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT]; + bool back_enab = zsa->base.stencil[1].enabled; - struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, DEPTH_STENCIL); - struct mali_depth_stencil_packed dynamic; + struct panfrost_ptr T = + pan_pool_alloc_desc(&batch->pool.base, DEPTH_STENCIL); + struct mali_depth_stencil_packed dynamic; - pan_pack(&dynamic, DEPTH_STENCIL, cfg) { - cfg.front_reference_value = ctx->stencil_ref.ref_value[0]; - cfg.back_reference_value = ctx->stencil_ref.ref_value[back_enab ? 1 : 0]; + pan_pack(&dynamic, DEPTH_STENCIL, cfg) { + cfg.front_reference_value = ctx->stencil_ref.ref_value[0]; + cfg.back_reference_value = ctx->stencil_ref.ref_value[back_enab ? 
1 : 0]; - cfg.stencil_from_shader = fs->info.fs.writes_stencil; - cfg.depth_source = pan_depth_source(&fs->info); + cfg.stencil_from_shader = fs->info.fs.writes_stencil; + cfg.depth_source = pan_depth_source(&fs->info); - cfg.depth_bias_enable = rast->base.offset_tri; - cfg.depth_units = rast->base.offset_units * 2.0f; - cfg.depth_factor = rast->base.offset_scale; - cfg.depth_bias_clamp = rast->base.offset_clamp; - } + cfg.depth_bias_enable = rast->base.offset_tri; + cfg.depth_units = rast->base.offset_units * 2.0f; + cfg.depth_factor = rast->base.offset_scale; + cfg.depth_bias_clamp = rast->base.offset_clamp; + } - pan_merge(dynamic, zsa->desc, DEPTH_STENCIL); - memcpy(T.cpu, &dynamic, pan_size(DEPTH_STENCIL)); + pan_merge(dynamic, zsa->desc, DEPTH_STENCIL); + memcpy(T.cpu, &dynamic, pan_size(DEPTH_STENCIL)); - return T.gpu; + return T.gpu; } /** @@ -873,24 +882,25 @@ panfrost_emit_depth_stencil(struct panfrost_batch *batch) static mali_ptr panfrost_emit_blend_valhall(struct panfrost_batch *batch) { - unsigned rt_count = MAX2(batch->key.nr_cbufs, 1); + unsigned rt_count = MAX2(batch->key.nr_cbufs, 1); - struct panfrost_ptr T = pan_pool_alloc_desc_array(&batch->pool.base, rt_count, BLEND); + struct panfrost_ptr T = + pan_pool_alloc_desc_array(&batch->pool.base, rt_count, BLEND); - mali_ptr blend_shaders[PIPE_MAX_COLOR_BUFS] = { 0 }; - panfrost_get_blend_shaders(batch, blend_shaders); + mali_ptr blend_shaders[PIPE_MAX_COLOR_BUFS] = {0}; + panfrost_get_blend_shaders(batch, blend_shaders); - panfrost_emit_blend(batch, T.cpu, blend_shaders); + panfrost_emit_blend(batch, T.cpu, blend_shaders); - /* Precalculate for the per-draw path */ - bool has_blend_shader = false; + /* Precalculate for the per-draw path */ + bool has_blend_shader = false; - for (unsigned i = 0; i < rt_count; ++i) - has_blend_shader |= !!blend_shaders[i]; + for (unsigned i = 0; i < rt_count; ++i) + has_blend_shader |= !!blend_shaders[i]; - batch->ctx->valhall_has_blend_shader = has_blend_shader; + batch->ctx->valhall_has_blend_shader = has_blend_shader; - return T.gpu; + return T.gpu; } /** @@ -899,29 +909,28 @@ panfrost_emit_blend_valhall(struct panfrost_batch *batch) static mali_ptr panfrost_emit_vertex_buffers(struct panfrost_batch *batch) { - struct panfrost_context *ctx = batch->ctx; - unsigned buffer_count = util_last_bit(ctx->vb_mask); - struct panfrost_ptr T = pan_pool_alloc_desc_array(&batch->pool.base, - buffer_count, BUFFER); - struct mali_buffer_packed *buffers = T.cpu; + struct panfrost_context *ctx = batch->ctx; + unsigned buffer_count = util_last_bit(ctx->vb_mask); + struct panfrost_ptr T = + pan_pool_alloc_desc_array(&batch->pool.base, buffer_count, BUFFER); + struct mali_buffer_packed *buffers = T.cpu; - u_foreach_bit(i, ctx->vb_mask) { - struct pipe_vertex_buffer vb = ctx->vertex_buffers[i]; - struct pipe_resource *prsrc = vb.buffer.resource; - struct panfrost_resource *rsrc = pan_resource(prsrc); - assert(!vb.is_user_buffer); + u_foreach_bit(i, ctx->vb_mask) { + struct pipe_vertex_buffer vb = ctx->vertex_buffers[i]; + struct pipe_resource *prsrc = vb.buffer.resource; + struct panfrost_resource *rsrc = pan_resource(prsrc); + assert(!vb.is_user_buffer); - panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX); + panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX); - pan_pack(buffers + i, BUFFER, cfg) { - cfg.address = rsrc->image.data.bo->ptr.gpu + - vb.buffer_offset; + pan_pack(buffers + i, BUFFER, cfg) { + cfg.address = rsrc->image.data.bo->ptr.gpu + vb.buffer_offset; - cfg.size = prsrc->width0 - 
vb.buffer_offset; - } - } + cfg.size = prsrc->width0 - vb.buffer_offset; + } + } - return T.gpu; + return T.gpu; } /** @@ -933,26 +942,25 @@ panfrost_emit_vertex_buffers(struct panfrost_batch *batch) static mali_ptr panfrost_emit_vertex_data(struct panfrost_batch *batch) { - struct panfrost_context *ctx = batch->ctx; - struct panfrost_vertex_state *vtx = ctx->vertex; - struct panfrost_ptr T = pan_pool_alloc_desc_array(&batch->pool.base, - vtx->num_elements, - ATTRIBUTE); - struct mali_attribute_packed *attributes = T.cpu; + struct panfrost_context *ctx = batch->ctx; + struct panfrost_vertex_state *vtx = ctx->vertex; + struct panfrost_ptr T = pan_pool_alloc_desc_array( + &batch->pool.base, vtx->num_elements, ATTRIBUTE); + struct mali_attribute_packed *attributes = T.cpu; - for (unsigned i = 0; i < vtx->num_elements; ++i) { - struct mali_attribute_packed packed; - unsigned vbi = vtx->pipe[i].vertex_buffer_index; + for (unsigned i = 0; i < vtx->num_elements; ++i) { + struct mali_attribute_packed packed; + unsigned vbi = vtx->pipe[i].vertex_buffer_index; - pan_pack(&packed, ATTRIBUTE, cfg) { - cfg.stride = ctx->vertex_buffers[vbi].stride; - } + pan_pack(&packed, ATTRIBUTE, cfg) { + cfg.stride = ctx->vertex_buffers[vbi].stride; + } - pan_merge(packed, vtx->attributes[i], ATTRIBUTE); - attributes[i] = packed; - } + pan_merge(packed, vtx->attributes[i], ATTRIBUTE); + attributes[i] = packed; + } - return T.gpu; + return T.gpu; } /* @@ -964,83 +972,79 @@ panfrost_emit_vertex_data(struct panfrost_batch *batch) static struct pipe_sampler_view panfrost_pipe_image_to_sampler_view(struct pipe_image_view *v) { - struct pipe_sampler_view out = { - .format = v->format, - .texture = v->resource, - .target = v->resource->target, - .swizzle_r = PIPE_SWIZZLE_X, - .swizzle_g = PIPE_SWIZZLE_Y, - .swizzle_b = PIPE_SWIZZLE_Z, - .swizzle_a = PIPE_SWIZZLE_W - }; + struct pipe_sampler_view out = {.format = v->format, + .texture = v->resource, + .target = v->resource->target, + .swizzle_r = PIPE_SWIZZLE_X, + .swizzle_g = PIPE_SWIZZLE_Y, + .swizzle_b = PIPE_SWIZZLE_Z, + .swizzle_a = PIPE_SWIZZLE_W}; - if (out.target == PIPE_BUFFER) { - out.u.buf.offset = v->u.buf.offset; - out.u.buf.size = v->u.buf.size; - } else { - out.u.tex.first_layer = v->u.tex.first_layer; - out.u.tex.last_layer = v->u.tex.last_layer; + if (out.target == PIPE_BUFFER) { + out.u.buf.offset = v->u.buf.offset; + out.u.buf.size = v->u.buf.size; + } else { + out.u.tex.first_layer = v->u.tex.first_layer; + out.u.tex.last_layer = v->u.tex.last_layer; - /* Single level only */ - out.u.tex.first_level = v->u.tex.level; - out.u.tex.last_level = v->u.tex.level; - } + /* Single level only */ + out.u.tex.first_level = v->u.tex.level; + out.u.tex.last_level = v->u.tex.level; + } - return out; + return out; } -static void -panfrost_update_sampler_view(struct panfrost_sampler_view *view, - struct pipe_context *pctx); +static void panfrost_update_sampler_view(struct panfrost_sampler_view *view, + struct pipe_context *pctx); static mali_ptr panfrost_emit_images(struct panfrost_batch *batch, enum pipe_shader_type stage) { - struct panfrost_context *ctx = batch->ctx; - unsigned last_bit = util_last_bit(ctx->image_mask[stage]); + struct panfrost_context *ctx = batch->ctx; + unsigned last_bit = util_last_bit(ctx->image_mask[stage]); - struct panfrost_ptr T = - pan_pool_alloc_desc_array(&batch->pool.base, last_bit, TEXTURE); + struct panfrost_ptr T = + pan_pool_alloc_desc_array(&batch->pool.base, last_bit, TEXTURE); - struct mali_texture_packed *out = (struct 
mali_texture_packed *) T.cpu; + struct mali_texture_packed *out = (struct mali_texture_packed *)T.cpu; - for (int i = 0; i < last_bit; ++i) { - struct pipe_image_view *image = &ctx->images[stage][i]; + for (int i = 0; i < last_bit; ++i) { + struct pipe_image_view *image = &ctx->images[stage][i]; - if (!(ctx->image_mask[stage] & BITFIELD_BIT(i))) { - memset(&out[i], 0, sizeof(out[i])); - continue; - } + if (!(ctx->image_mask[stage] & BITFIELD_BIT(i))) { + memset(&out[i], 0, sizeof(out[i])); + continue; + } - /* Construct a synthetic sampler view so we can use our usual - * sampler view code for the actual descriptor packing. - * - * Use the batch pool for a transient allocation, rather than - * allocating a long-lived descriptor. - */ - struct panfrost_sampler_view view = { - .base = panfrost_pipe_image_to_sampler_view(image), - .pool = &batch->pool - }; + /* Construct a synthetic sampler view so we can use our usual + * sampler view code for the actual descriptor packing. + * + * Use the batch pool for a transient allocation, rather than + * allocating a long-lived descriptor. + */ + struct panfrost_sampler_view view = { + .base = panfrost_pipe_image_to_sampler_view(image), + .pool = &batch->pool}; - /* If we specify a cube map, the hardware internally treat it as - * a 2D array. Since cube maps as images can confuse our common - * texturing code, explicitly use a 2D array. - * - * Similar concerns apply to 3D textures. - */ - if (view.base.target == PIPE_BUFFER) - view.base.target = PIPE_BUFFER; - else - view.base.target = PIPE_TEXTURE_2D_ARRAY; + /* If we specify a cube map, the hardware internally treat it as + * a 2D array. Since cube maps as images can confuse our common + * texturing code, explicitly use a 2D array. + * + * Similar concerns apply to 3D textures. 
+ */ + if (view.base.target == PIPE_BUFFER) + view.base.target = PIPE_BUFFER; + else + view.base.target = PIPE_TEXTURE_2D_ARRAY; - panfrost_update_sampler_view(&view, &ctx->base); - out[i] = view.bifrost_descriptor; + panfrost_update_sampler_view(&view, &ctx->base); + out[i] = view.bifrost_descriptor; - panfrost_track_image_access(batch, stage, image); - } + panfrost_track_image_access(batch, stage, image); + } - return T.gpu; + return T.gpu; } #endif @@ -1050,213 +1054,205 @@ panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch, struct panfrost_constant_buffer *buf, unsigned index) { - struct pipe_constant_buffer *cb = &buf->cb[index]; - struct panfrost_resource *rsrc = pan_resource(cb->buffer); + struct pipe_constant_buffer *cb = &buf->cb[index]; + struct panfrost_resource *rsrc = pan_resource(cb->buffer); - if (rsrc) { - panfrost_batch_read_rsrc(batch, rsrc, st); + if (rsrc) { + panfrost_batch_read_rsrc(batch, rsrc, st); - /* Alignment gauranteed by - * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */ - return rsrc->image.data.bo->ptr.gpu + cb->buffer_offset; - } else if (cb->user_buffer) { - return pan_pool_upload_aligned(&batch->pool.base, - cb->user_buffer + - cb->buffer_offset, - cb->buffer_size, 16); - } else { - unreachable("No constant buffer"); - } + /* Alignment gauranteed by + * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */ + return rsrc->image.data.bo->ptr.gpu + cb->buffer_offset; + } else if (cb->user_buffer) { + return pan_pool_upload_aligned(&batch->pool.base, + cb->user_buffer + cb->buffer_offset, + cb->buffer_size, 16); + } else { + unreachable("No constant buffer"); + } } struct sysval_uniform { - union { - float f[4]; - int32_t i[4]; - uint32_t u[4]; - uint64_t du[2]; - }; + union { + float f[4]; + int32_t i[4]; + uint32_t u[4]; + uint64_t du[2]; + }; }; static void panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch, struct sysval_uniform *uniform) { - struct panfrost_context *ctx = batch->ctx; - const struct pipe_viewport_state *vp = &ctx->pipe_viewport; + struct panfrost_context *ctx = batch->ctx; + const struct pipe_viewport_state *vp = &ctx->pipe_viewport; - uniform->f[0] = vp->scale[0]; - uniform->f[1] = vp->scale[1]; - uniform->f[2] = vp->scale[2]; + uniform->f[0] = vp->scale[0]; + uniform->f[1] = vp->scale[1]; + uniform->f[2] = vp->scale[2]; } static void panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch, struct sysval_uniform *uniform) { - struct panfrost_context *ctx = batch->ctx; - const struct pipe_viewport_state *vp = &ctx->pipe_viewport; + struct panfrost_context *ctx = batch->ctx; + const struct pipe_viewport_state *vp = &ctx->pipe_viewport; - uniform->f[0] = vp->translate[0]; - uniform->f[1] = vp->translate[1]; - uniform->f[2] = vp->translate[2]; + uniform->f[0] = vp->translate[0]; + uniform->f[1] = vp->translate[1]; + uniform->f[2] = vp->translate[2]; } -static void panfrost_upload_txs_sysval(struct panfrost_batch *batch, - enum pipe_shader_type st, - unsigned int sysvalid, - struct sysval_uniform *uniform) +static void +panfrost_upload_txs_sysval(struct panfrost_batch *batch, + enum pipe_shader_type st, unsigned int sysvalid, + struct sysval_uniform *uniform) { - struct panfrost_context *ctx = batch->ctx; - unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid); - unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid); - bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid); - struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base; + struct panfrost_context *ctx = batch->ctx; + unsigned texidx = 
PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid); + unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid); + bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid); + struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base; - assert(dim); + assert(dim); - if (tex->target == PIPE_BUFFER) { - assert(dim == 1); - uniform->i[0] = - tex->u.buf.size / util_format_get_blocksize(tex->format); - return; - } + if (tex->target == PIPE_BUFFER) { + assert(dim == 1); + uniform->i[0] = tex->u.buf.size / util_format_get_blocksize(tex->format); + return; + } - uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level); + uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level); - if (dim > 1) - uniform->i[1] = u_minify(tex->texture->height0, - tex->u.tex.first_level); + if (dim > 1) + uniform->i[1] = u_minify(tex->texture->height0, tex->u.tex.first_level); - if (dim > 2) - uniform->i[2] = u_minify(tex->texture->depth0, - tex->u.tex.first_level); + if (dim > 2) + uniform->i[2] = u_minify(tex->texture->depth0, tex->u.tex.first_level); - if (is_array) { - unsigned size = tex->texture->array_size; + if (is_array) { + unsigned size = tex->texture->array_size; - /* Internally, we store the number of 2D images (faces * array - * size). Externally, we report the array size in terms of - * complete cubes. So divide by the # of faces per cube. - */ - if (tex->target == PIPE_TEXTURE_CUBE_ARRAY) - size /= 6; + /* Internally, we store the number of 2D images (faces * array + * size). Externally, we report the array size in terms of + * complete cubes. So divide by the # of faces per cube. + */ + if (tex->target == PIPE_TEXTURE_CUBE_ARRAY) + size /= 6; - uniform->i[dim] = size; - } + uniform->i[dim] = size; + } } -static void panfrost_upload_image_size_sysval(struct panfrost_batch *batch, - enum pipe_shader_type st, - unsigned int sysvalid, - struct sysval_uniform *uniform) +static void +panfrost_upload_image_size_sysval(struct panfrost_batch *batch, + enum pipe_shader_type st, + unsigned int sysvalid, + struct sysval_uniform *uniform) { - struct panfrost_context *ctx = batch->ctx; - unsigned idx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid); - unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid); - unsigned is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid); + struct panfrost_context *ctx = batch->ctx; + unsigned idx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid); + unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid); + unsigned is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid); - assert(dim && dim < 4); + assert(dim && dim < 4); - struct pipe_image_view *image = &ctx->images[st][idx]; + struct pipe_image_view *image = &ctx->images[st][idx]; - if (image->resource->target == PIPE_BUFFER) { - unsigned blocksize = util_format_get_blocksize(image->format); - uniform->i[0] = image->resource->width0 / blocksize; - return; - } + if (image->resource->target == PIPE_BUFFER) { + unsigned blocksize = util_format_get_blocksize(image->format); + uniform->i[0] = image->resource->width0 / blocksize; + return; + } - uniform->i[0] = u_minify(image->resource->width0, - image->u.tex.level); + uniform->i[0] = u_minify(image->resource->width0, image->u.tex.level); - if (dim > 1) - uniform->i[1] = u_minify(image->resource->height0, - image->u.tex.level); + if (dim > 1) + uniform->i[1] = u_minify(image->resource->height0, image->u.tex.level); - if (dim > 2) - uniform->i[2] = u_minify(image->resource->depth0, - image->u.tex.level); + if (dim > 2) + uniform->i[2] = u_minify(image->resource->depth0, image->u.tex.level); - if 
(is_array) - uniform->i[dim] = image->resource->array_size; + if (is_array) + uniform->i[dim] = image->resource->array_size; } static void panfrost_upload_ssbo_sysval(struct panfrost_batch *batch, - enum pipe_shader_type st, - unsigned ssbo_id, + enum pipe_shader_type st, unsigned ssbo_id, struct sysval_uniform *uniform) { - struct panfrost_context *ctx = batch->ctx; + struct panfrost_context *ctx = batch->ctx; - assert(ctx->ssbo_mask[st] & (1 << ssbo_id)); - struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id]; + assert(ctx->ssbo_mask[st] & (1 << ssbo_id)); + struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id]; - /* Compute address */ - struct panfrost_resource *rsrc = pan_resource(sb.buffer); - struct panfrost_bo *bo = rsrc->image.data.bo; + /* Compute address */ + struct panfrost_resource *rsrc = pan_resource(sb.buffer); + struct panfrost_bo *bo = rsrc->image.data.bo; - panfrost_batch_write_rsrc(batch, rsrc, st); + panfrost_batch_write_rsrc(batch, rsrc, st); - util_range_add(&rsrc->base, &rsrc->valid_buffer_range, - sb.buffer_offset, sb.buffer_size); + util_range_add(&rsrc->base, &rsrc->valid_buffer_range, sb.buffer_offset, + sb.buffer_size); - /* Upload address and size as sysval */ - uniform->du[0] = bo->ptr.gpu + sb.buffer_offset; - uniform->u[2] = sb.buffer_size; + /* Upload address and size as sysval */ + uniform->du[0] = bo->ptr.gpu + sb.buffer_offset; + uniform->u[2] = sb.buffer_size; } static void panfrost_upload_sampler_sysval(struct panfrost_batch *batch, - enum pipe_shader_type st, - unsigned samp_idx, + enum pipe_shader_type st, unsigned samp_idx, struct sysval_uniform *uniform) { - struct panfrost_context *ctx = batch->ctx; - struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base; + struct panfrost_context *ctx = batch->ctx; + struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base; - uniform->f[0] = sampl->min_lod; - uniform->f[1] = sampl->max_lod; - uniform->f[2] = sampl->lod_bias; + uniform->f[0] = sampl->min_lod; + uniform->f[1] = sampl->max_lod; + uniform->f[2] = sampl->lod_bias; - /* Even without any errata, Midgard represents "no mipmapping" as - * fixing the LOD with the clamps; keep behaviour consistent. c.f. - * panfrost_create_sampler_state which also explains our choice of - * epsilon value (again to keep behaviour consistent) */ + /* Even without any errata, Midgard represents "no mipmapping" as + * fixing the LOD with the clamps; keep behaviour consistent. c.f. 
+ * panfrost_create_sampler_state which also explains our choice of + * epsilon value (again to keep behaviour consistent) */ - if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) - uniform->f[1] = uniform->f[0] + (1.0/256.0); + if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) + uniform->f[1] = uniform->f[0] + (1.0 / 256.0); } static void panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch, struct sysval_uniform *uniform) { - struct panfrost_context *ctx = batch->ctx; + struct panfrost_context *ctx = batch->ctx; - uniform->u[0] = ctx->compute_grid->grid[0]; - uniform->u[1] = ctx->compute_grid->grid[1]; - uniform->u[2] = ctx->compute_grid->grid[2]; + uniform->u[0] = ctx->compute_grid->grid[0]; + uniform->u[1] = ctx->compute_grid->grid[1]; + uniform->u[2] = ctx->compute_grid->grid[2]; } static void panfrost_upload_local_group_size_sysval(struct panfrost_batch *batch, struct sysval_uniform *uniform) { - struct panfrost_context *ctx = batch->ctx; + struct panfrost_context *ctx = batch->ctx; - uniform->u[0] = ctx->compute_grid->block[0]; - uniform->u[1] = ctx->compute_grid->block[1]; - uniform->u[2] = ctx->compute_grid->block[2]; + uniform->u[0] = ctx->compute_grid->block[0]; + uniform->u[1] = ctx->compute_grid->block[1]; + uniform->u[2] = ctx->compute_grid->block[2]; } static void panfrost_upload_work_dim_sysval(struct panfrost_batch *batch, struct sysval_uniform *uniform) { - struct panfrost_context *ctx = batch->ctx; + struct panfrost_context *ctx = batch->ctx; - uniform->u[0] = ctx->compute_grid->work_dim; + uniform->u[0] = ctx->compute_grid->work_dim; } /* Sample positions are pushed in a Bifrost specific format on Bifrost. On @@ -1265,168 +1261,156 @@ panfrost_upload_work_dim_sysval(struct panfrost_batch *batch, static void panfrost_upload_sample_positions_sysval(struct panfrost_batch *batch, - struct sysval_uniform *uniform) + struct sysval_uniform *uniform) { - struct panfrost_context *ctx = batch->ctx; - struct panfrost_device *dev = pan_device(ctx->base.screen); + struct panfrost_context *ctx = batch->ctx; + struct panfrost_device *dev = pan_device(ctx->base.screen); - unsigned samples = util_framebuffer_get_num_samples(&batch->key); - uniform->du[0] = panfrost_sample_positions(dev, panfrost_sample_pattern(samples)); + unsigned samples = util_framebuffer_get_num_samples(&batch->key); + uniform->du[0] = + panfrost_sample_positions(dev, panfrost_sample_pattern(samples)); } static void panfrost_upload_multisampled_sysval(struct panfrost_batch *batch, - struct sysval_uniform *uniform) + struct sysval_uniform *uniform) { - unsigned samples = util_framebuffer_get_num_samples(&batch->key); - uniform->u[0] = samples > 1; + unsigned samples = util_framebuffer_get_num_samples(&batch->key); + uniform->u[0] = samples > 1; } #if PAN_ARCH >= 6 static void panfrost_upload_rt_conversion_sysval(struct panfrost_batch *batch, - unsigned size_and_rt, struct sysval_uniform *uniform) + unsigned size_and_rt, + struct sysval_uniform *uniform) { - struct panfrost_context *ctx = batch->ctx; - struct panfrost_device *dev = pan_device(ctx->base.screen); - unsigned rt = size_and_rt & 0xF; - unsigned size = size_and_rt >> 4; + struct panfrost_context *ctx = batch->ctx; + struct panfrost_device *dev = pan_device(ctx->base.screen); + unsigned rt = size_and_rt & 0xF; + unsigned size = size_and_rt >> 4; - if (rt < batch->key.nr_cbufs && batch->key.cbufs[rt]) { - enum pipe_format format = batch->key.cbufs[rt]->format; - uniform->u[0] = - GENX(pan_blend_get_internal_desc)(dev, format, rt, size, 
false) >> 32; - } else { - pan_pack(&uniform->u[0], INTERNAL_CONVERSION, cfg) - cfg.memory_format = dev->formats[PIPE_FORMAT_NONE].hw; - } + if (rt < batch->key.nr_cbufs && batch->key.cbufs[rt]) { + enum pipe_format format = batch->key.cbufs[rt]->format; + uniform->u[0] = + GENX(pan_blend_get_internal_desc)(dev, format, rt, size, false) >> 32; + } else { + pan_pack(&uniform->u[0], INTERNAL_CONVERSION, cfg) + cfg.memory_format = dev->formats[PIPE_FORMAT_NONE].hw; + } } #endif static unsigned panfrost_xfb_offset(unsigned stride, struct pipe_stream_output_target *target) { - return target->buffer_offset + (pan_so_target(target)->offset * stride); + return target->buffer_offset + (pan_so_target(target)->offset * stride); } static void -panfrost_upload_sysvals(struct panfrost_batch *batch, - void *ptr_cpu, - mali_ptr ptr_gpu, - struct panfrost_compiled_shader *ss, +panfrost_upload_sysvals(struct panfrost_batch *batch, void *ptr_cpu, + mali_ptr ptr_gpu, struct panfrost_compiled_shader *ss, enum pipe_shader_type st) { - struct sysval_uniform *uniforms = ptr_cpu; + struct sysval_uniform *uniforms = ptr_cpu; - for (unsigned i = 0; i < ss->info.sysvals.sysval_count; ++i) { - int sysval = ss->info.sysvals.sysvals[i]; + for (unsigned i = 0; i < ss->info.sysvals.sysval_count; ++i) { + int sysval = ss->info.sysvals.sysvals[i]; - switch (PAN_SYSVAL_TYPE(sysval)) { - case PAN_SYSVAL_VIEWPORT_SCALE: - panfrost_upload_viewport_scale_sysval(batch, - &uniforms[i]); - break; - case PAN_SYSVAL_VIEWPORT_OFFSET: - panfrost_upload_viewport_offset_sysval(batch, - &uniforms[i]); - break; - case PAN_SYSVAL_TEXTURE_SIZE: - panfrost_upload_txs_sysval(batch, st, - PAN_SYSVAL_ID(sysval), - &uniforms[i]); - break; - case PAN_SYSVAL_SSBO: - panfrost_upload_ssbo_sysval(batch, st, - PAN_SYSVAL_ID(sysval), - &uniforms[i]); - break; + switch (PAN_SYSVAL_TYPE(sysval)) { + case PAN_SYSVAL_VIEWPORT_SCALE: + panfrost_upload_viewport_scale_sysval(batch, &uniforms[i]); + break; + case PAN_SYSVAL_VIEWPORT_OFFSET: + panfrost_upload_viewport_offset_sysval(batch, &uniforms[i]); + break; + case PAN_SYSVAL_TEXTURE_SIZE: + panfrost_upload_txs_sysval(batch, st, PAN_SYSVAL_ID(sysval), + &uniforms[i]); + break; + case PAN_SYSVAL_SSBO: + panfrost_upload_ssbo_sysval(batch, st, PAN_SYSVAL_ID(sysval), + &uniforms[i]); + break; - case PAN_SYSVAL_XFB: - { - unsigned buf = PAN_SYSVAL_ID(sysval); - struct panfrost_compiled_shader *vs = - batch->ctx->prog[PIPE_SHADER_VERTEX]; - struct pipe_stream_output_info *so = &vs->stream_output; - unsigned stride = so->stride[buf] * 4; + case PAN_SYSVAL_XFB: { + unsigned buf = PAN_SYSVAL_ID(sysval); + struct panfrost_compiled_shader *vs = + batch->ctx->prog[PIPE_SHADER_VERTEX]; + struct pipe_stream_output_info *so = &vs->stream_output; + unsigned stride = so->stride[buf] * 4; - struct pipe_stream_output_target *target = NULL; - if (buf < batch->ctx->streamout.num_targets) - target = batch->ctx->streamout.targets[buf]; + struct pipe_stream_output_target *target = NULL; + if (buf < batch->ctx->streamout.num_targets) + target = batch->ctx->streamout.targets[buf]; - if (!target) { - /* Memory sink */ - uniforms[i].du[0] = 0x8ull << 60; - break; - } + if (!target) { + /* Memory sink */ + uniforms[i].du[0] = 0x8ull << 60; + break; + } - struct panfrost_resource *rsrc = pan_resource(target->buffer); - unsigned offset = panfrost_xfb_offset(stride, target); + struct panfrost_resource *rsrc = pan_resource(target->buffer); + unsigned offset = panfrost_xfb_offset(stride, target); - util_range_add(&rsrc->base, 
&rsrc->valid_buffer_range, - offset, target->buffer_size - offset); + util_range_add(&rsrc->base, &rsrc->valid_buffer_range, offset, + target->buffer_size - offset); - panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_VERTEX); + panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_VERTEX); - uniforms[i].du[0] = rsrc->image.data.bo->ptr.gpu + offset; - break; - } + uniforms[i].du[0] = rsrc->image.data.bo->ptr.gpu + offset; + break; + } - case PAN_SYSVAL_NUM_VERTICES: - uniforms[i].u[0] = batch->ctx->vertex_count; - break; + case PAN_SYSVAL_NUM_VERTICES: + uniforms[i].u[0] = batch->ctx->vertex_count; + break; - case PAN_SYSVAL_NUM_WORK_GROUPS: - for (unsigned j = 0; j < 3; j++) { - batch->num_wg_sysval[j] = - ptr_gpu + (i * sizeof(*uniforms)) + (j * 4); - } - panfrost_upload_num_work_groups_sysval(batch, - &uniforms[i]); - break; - case PAN_SYSVAL_LOCAL_GROUP_SIZE: - panfrost_upload_local_group_size_sysval(batch, - &uniforms[i]); - break; - case PAN_SYSVAL_WORK_DIM: - panfrost_upload_work_dim_sysval(batch, - &uniforms[i]); - break; - case PAN_SYSVAL_SAMPLER: - panfrost_upload_sampler_sysval(batch, st, - PAN_SYSVAL_ID(sysval), - &uniforms[i]); - break; - case PAN_SYSVAL_IMAGE_SIZE: - panfrost_upload_image_size_sysval(batch, st, - PAN_SYSVAL_ID(sysval), - &uniforms[i]); - break; - case PAN_SYSVAL_SAMPLE_POSITIONS: - panfrost_upload_sample_positions_sysval(batch, - &uniforms[i]); - break; - case PAN_SYSVAL_MULTISAMPLED: - panfrost_upload_multisampled_sysval(batch, - &uniforms[i]); - break; + case PAN_SYSVAL_NUM_WORK_GROUPS: + for (unsigned j = 0; j < 3; j++) { + batch->num_wg_sysval[j] = + ptr_gpu + (i * sizeof(*uniforms)) + (j * 4); + } + panfrost_upload_num_work_groups_sysval(batch, &uniforms[i]); + break; + case PAN_SYSVAL_LOCAL_GROUP_SIZE: + panfrost_upload_local_group_size_sysval(batch, &uniforms[i]); + break; + case PAN_SYSVAL_WORK_DIM: + panfrost_upload_work_dim_sysval(batch, &uniforms[i]); + break; + case PAN_SYSVAL_SAMPLER: + panfrost_upload_sampler_sysval(batch, st, PAN_SYSVAL_ID(sysval), + &uniforms[i]); + break; + case PAN_SYSVAL_IMAGE_SIZE: + panfrost_upload_image_size_sysval(batch, st, PAN_SYSVAL_ID(sysval), + &uniforms[i]); + break; + case PAN_SYSVAL_SAMPLE_POSITIONS: + panfrost_upload_sample_positions_sysval(batch, &uniforms[i]); + break; + case PAN_SYSVAL_MULTISAMPLED: + panfrost_upload_multisampled_sysval(batch, &uniforms[i]); + break; #if PAN_ARCH >= 6 - case PAN_SYSVAL_RT_CONVERSION: - panfrost_upload_rt_conversion_sysval(batch, - PAN_SYSVAL_ID(sysval), &uniforms[i]); - break; + case PAN_SYSVAL_RT_CONVERSION: + panfrost_upload_rt_conversion_sysval(batch, PAN_SYSVAL_ID(sysval), + &uniforms[i]); + break; #endif - case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS: - uniforms[i].u[0] = batch->ctx->offset_start; - uniforms[i].u[1] = batch->ctx->base_vertex; - uniforms[i].u[2] = batch->ctx->base_instance; - break; - case PAN_SYSVAL_DRAWID: - uniforms[i].u[0] = batch->ctx->drawid; - break; - default: - assert(0); - } - } + case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS: + uniforms[i].u[0] = batch->ctx->offset_start; + uniforms[i].u[1] = batch->ctx->base_vertex; + uniforms[i].u[2] = batch->ctx->base_instance; + break; + case PAN_SYSVAL_DRAWID: + uniforms[i].u[0] = batch->ctx->drawid; + break; + default: + assert(0); + } + } } static const void * @@ -1434,19 +1418,19 @@ panfrost_map_constant_buffer_cpu(struct panfrost_context *ctx, struct panfrost_constant_buffer *buf, unsigned index) { - struct pipe_constant_buffer *cb = &buf->cb[index]; - struct panfrost_resource *rsrc = pan_resource(cb->buffer); + 
struct pipe_constant_buffer *cb = &buf->cb[index]; + struct panfrost_resource *rsrc = pan_resource(cb->buffer); - if (rsrc) { - panfrost_bo_mmap(rsrc->image.data.bo); - panfrost_flush_writer(ctx, rsrc, "CPU constant buffer mapping"); - panfrost_bo_wait(rsrc->image.data.bo, INT64_MAX, false); + if (rsrc) { + panfrost_bo_mmap(rsrc->image.data.bo); + panfrost_flush_writer(ctx, rsrc, "CPU constant buffer mapping"); + panfrost_bo_wait(rsrc->image.data.bo, INT64_MAX, false); - return rsrc->image.data.bo->ptr.cpu + cb->buffer_offset; - } else if (cb->user_buffer) { - return cb->user_buffer + cb->buffer_offset; - } else - unreachable("No constant buffer"); + return rsrc->image.data.bo->ptr.cpu + cb->buffer_offset; + } else if (cb->user_buffer) { + return cb->user_buffer + cb->buffer_offset; + } else + unreachable("No constant buffer"); } /* Emit a single UBO record. On Valhall, UBOs are dumb buffers and are @@ -1458,125 +1442,121 @@ static void panfrost_emit_ubo(void *base, unsigned index, mali_ptr address, size_t size) { #if PAN_ARCH >= 9 - struct mali_buffer_packed *out = base; + struct mali_buffer_packed *out = base; - pan_pack(out + index, BUFFER, cfg) { - cfg.size = size; - cfg.address = address; - } + pan_pack(out + index, BUFFER, cfg) { + cfg.size = size; + cfg.address = address; + } #else - struct mali_uniform_buffer_packed *out = base; + struct mali_uniform_buffer_packed *out = base; - /* Issue (57) for the ARB_uniform_buffer_object spec says that - * the buffer can be larger than the uniform data inside it, - * so clamp ubo size to what hardware supports. */ + /* Issue (57) for the ARB_uniform_buffer_object spec says that + * the buffer can be larger than the uniform data inside it, + * so clamp ubo size to what hardware supports. */ - pan_pack(out + index, UNIFORM_BUFFER, cfg) { - cfg.entries = MIN2(DIV_ROUND_UP(size, 16), 1 << 12); - cfg.pointer = address; - } + pan_pack(out + index, UNIFORM_BUFFER, cfg) { + cfg.entries = MIN2(DIV_ROUND_UP(size, 16), 1 << 12); + cfg.pointer = address; + } #endif } static mali_ptr panfrost_emit_const_buf(struct panfrost_batch *batch, - enum pipe_shader_type stage, - unsigned *buffer_count, - mali_ptr *push_constants, - unsigned *pushed_words) + enum pipe_shader_type stage, unsigned *buffer_count, + mali_ptr *push_constants, unsigned *pushed_words) { - struct panfrost_context *ctx = batch->ctx; - struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage]; - struct panfrost_compiled_shader *ss = ctx->prog[stage]; + struct panfrost_context *ctx = batch->ctx; + struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage]; + struct panfrost_compiled_shader *ss = ctx->prog[stage]; - if (!ss) - return 0; + if (!ss) + return 0; - /* Allocate room for the sysval and the uniforms */ - size_t sys_size = sizeof(float) * 4 * ss->info.sysvals.sysval_count; - struct panfrost_ptr transfer = - pan_pool_alloc_aligned(&batch->pool.base, sys_size, 16); + /* Allocate room for the sysval and the uniforms */ + size_t sys_size = sizeof(float) * 4 * ss->info.sysvals.sysval_count; + struct panfrost_ptr transfer = + pan_pool_alloc_aligned(&batch->pool.base, sys_size, 16); - /* Upload sysvals requested by the shader */ - uint8_t *sysvals = alloca(sys_size); - panfrost_upload_sysvals(batch, sysvals, transfer.gpu, ss, stage); - memcpy(transfer.cpu, sysvals, sys_size); + /* Upload sysvals requested by the shader */ + uint8_t *sysvals = alloca(sys_size); + panfrost_upload_sysvals(batch, sysvals, transfer.gpu, ss, stage); + memcpy(transfer.cpu, sysvals, sys_size); - /* 
Next up, attach UBOs. UBO count includes gaps but no sysval UBO */ - struct panfrost_compiled_shader *shader = ctx->prog[stage]; - unsigned ubo_count = shader->info.ubo_count - (sys_size ? 1 : 0); - unsigned sysval_ubo = sys_size ? ubo_count : ~0; - struct panfrost_ptr ubos = { 0 }; + /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */ + struct panfrost_compiled_shader *shader = ctx->prog[stage]; + unsigned ubo_count = shader->info.ubo_count - (sys_size ? 1 : 0); + unsigned sysval_ubo = sys_size ? ubo_count : ~0; + struct panfrost_ptr ubos = {0}; #if PAN_ARCH >= 9 - ubos = pan_pool_alloc_desc_array(&batch->pool.base, - ubo_count + 1, - BUFFER); + ubos = pan_pool_alloc_desc_array(&batch->pool.base, ubo_count + 1, BUFFER); #else - ubos = pan_pool_alloc_desc_array(&batch->pool.base, - ubo_count + 1, - UNIFORM_BUFFER); + ubos = pan_pool_alloc_desc_array(&batch->pool.base, ubo_count + 1, + UNIFORM_BUFFER); #endif - if (buffer_count) - *buffer_count = ubo_count + (sys_size ? 1 : 0); + if (buffer_count) + *buffer_count = ubo_count + (sys_size ? 1 : 0); - /* Upload sysval as a final UBO */ + /* Upload sysval as a final UBO */ - if (sys_size) - panfrost_emit_ubo(ubos.cpu, ubo_count, transfer.gpu, sys_size); + if (sys_size) + panfrost_emit_ubo(ubos.cpu, ubo_count, transfer.gpu, sys_size); - /* The rest are honest-to-goodness UBOs */ + /* The rest are honest-to-goodness UBOs */ - u_foreach_bit(ubo, ss->info.ubo_mask & buf->enabled_mask) { - size_t usz = buf->cb[ubo].buffer_size; - mali_ptr address = 0; + u_foreach_bit(ubo, ss->info.ubo_mask & buf->enabled_mask) { + size_t usz = buf->cb[ubo].buffer_size; + mali_ptr address = 0; - if (usz > 0) { - address = panfrost_map_constant_buffer_gpu(batch, - stage, buf, ubo); - } + if (usz > 0) { + address = panfrost_map_constant_buffer_gpu(batch, stage, buf, ubo); + } - panfrost_emit_ubo(ubos.cpu, ubo, address, usz); - } + panfrost_emit_ubo(ubos.cpu, ubo, address, usz); + } - if (pushed_words) - *pushed_words = ss->info.push.count; + if (pushed_words) + *pushed_words = ss->info.push.count; - if (ss->info.push.count == 0) - return ubos.gpu; + if (ss->info.push.count == 0) + return ubos.gpu; - /* Copy push constants required by the shader */ - struct panfrost_ptr push_transfer = - pan_pool_alloc_aligned(&batch->pool.base, - ss->info.push.count * 4, 16); + /* Copy push constants required by the shader */ + struct panfrost_ptr push_transfer = + pan_pool_alloc_aligned(&batch->pool.base, ss->info.push.count * 4, 16); - uint32_t *push_cpu = (uint32_t *) push_transfer.cpu; - *push_constants = push_transfer.gpu; + uint32_t *push_cpu = (uint32_t *)push_transfer.cpu; + *push_constants = push_transfer.gpu; - for (unsigned i = 0; i < ss->info.push.count; ++i) { - struct panfrost_ubo_word src = ss->info.push.words[i]; + for (unsigned i = 0; i < ss->info.push.count; ++i) { + struct panfrost_ubo_word src = ss->info.push.words[i]; - if (src.ubo == sysval_ubo) { - unsigned sysval_idx = src.offset / 16; - unsigned sysval_comp = (src.offset % 16) / 4; - unsigned sysval_type = PAN_SYSVAL_TYPE(ss->info.sysvals.sysvals[sysval_idx]); - mali_ptr ptr = push_transfer.gpu + (4 * i); + if (src.ubo == sysval_ubo) { + unsigned sysval_idx = src.offset / 16; + unsigned sysval_comp = (src.offset % 16) / 4; + unsigned sysval_type = + PAN_SYSVAL_TYPE(ss->info.sysvals.sysvals[sysval_idx]); + mali_ptr ptr = push_transfer.gpu + (4 * i); - if (sysval_type == PAN_SYSVAL_NUM_WORK_GROUPS) - batch->num_wg_sysval[sysval_comp] = ptr; - } - /* Map the UBO, this should be cheap. 
For some buffers this may - * read from write-combine memory which is slow, though :-( - */ - const void *mapped_ubo = (src.ubo == sysval_ubo) ? sysvals : - panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo); + if (sysval_type == PAN_SYSVAL_NUM_WORK_GROUPS) + batch->num_wg_sysval[sysval_comp] = ptr; + } + /* Map the UBO, this should be cheap. For some buffers this may + * read from write-combine memory which is slow, though :-( + */ + const void *mapped_ubo = + (src.ubo == sysval_ubo) + ? sysvals + : panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo); - /* TODO: Is there any benefit to combining ranges */ - memcpy(push_cpu + i, (uint8_t *) mapped_ubo + src.offset, 4); - } + /* TODO: Is there any benefit to combining ranges */ + memcpy(push_cpu + i, (uint8_t *)mapped_ubo + src.offset, 4); + } - return ubos.gpu; + return ubos.gpu; } /* @@ -1592,71 +1572,66 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, static unsigned panfrost_choose_wls_instance_count(const struct pipe_grid_info *grid) { - if (grid->indirect) { - /* May need tuning in the future, conservative guess */ - return 128; - } else { - return util_next_power_of_two(grid->grid[0]) * - util_next_power_of_two(grid->grid[1]) * - util_next_power_of_two(grid->grid[2]); - } + if (grid->indirect) { + /* May need tuning in the future, conservative guess */ + return 128; + } else { + return util_next_power_of_two(grid->grid[0]) * + util_next_power_of_two(grid->grid[1]) * + util_next_power_of_two(grid->grid[2]); + } } static mali_ptr panfrost_emit_shared_memory(struct panfrost_batch *batch, const struct pipe_grid_info *grid) { - struct panfrost_context *ctx = batch->ctx; - struct panfrost_device *dev = pan_device(ctx->base.screen); - struct panfrost_compiled_shader *ss = ctx->prog[PIPE_SHADER_COMPUTE]; - struct panfrost_ptr t = - pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE); + struct panfrost_context *ctx = batch->ctx; + struct panfrost_device *dev = pan_device(ctx->base.screen); + struct panfrost_compiled_shader *ss = ctx->prog[PIPE_SHADER_COMPUTE]; + struct panfrost_ptr t = + pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE); - struct pan_tls_info info = { - .tls.size = ss->info.tls_size, - .wls.size = ss->info.wls_size + grid->variable_shared_mem, - .wls.instances = panfrost_choose_wls_instance_count(grid), - }; + struct pan_tls_info info = { + .tls.size = ss->info.tls_size, + .wls.size = ss->info.wls_size + grid->variable_shared_mem, + .wls.instances = panfrost_choose_wls_instance_count(grid), + }; - if (ss->info.tls_size) { - struct panfrost_bo *bo = - panfrost_batch_get_scratchpad(batch, - ss->info.tls_size, - dev->thread_tls_alloc, - dev->core_id_range); - info.tls.ptr = bo->ptr.gpu; - } + if (ss->info.tls_size) { + struct panfrost_bo *bo = panfrost_batch_get_scratchpad( + batch, ss->info.tls_size, dev->thread_tls_alloc, dev->core_id_range); + info.tls.ptr = bo->ptr.gpu; + } - if (ss->info.wls_size) { - unsigned size = pan_wls_adjust_size(info.wls.size) * - info.wls.instances * dev->core_id_range; + if (ss->info.wls_size) { + unsigned size = pan_wls_adjust_size(info.wls.size) * info.wls.instances * + dev->core_id_range; - struct panfrost_bo *bo = - panfrost_batch_get_shared_memory(batch, size, 1); + struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch, size, 1); - info.wls.ptr = bo->ptr.gpu; - } + info.wls.ptr = bo->ptr.gpu; + } - GENX(pan_emit_tls)(&info, t.cpu); - return t.gpu; + GENX(pan_emit_tls)(&info, t.cpu); + return t.gpu; } #if PAN_ARCH <= 5 static mali_ptr -panfrost_get_tex_desc(struct 
panfrost_batch *batch, - enum pipe_shader_type st, +panfrost_get_tex_desc(struct panfrost_batch *batch, enum pipe_shader_type st, struct panfrost_sampler_view *view) { - if (!view) - return (mali_ptr) 0; + if (!view) + return (mali_ptr)0; - struct pipe_sampler_view *pview = &view->base; - struct panfrost_resource *rsrc = pan_resource(pview->texture); + struct pipe_sampler_view *pview = &view->base; + struct panfrost_resource *rsrc = pan_resource(pview->texture); - panfrost_batch_read_rsrc(batch, rsrc, st); - panfrost_batch_add_bo(batch, view->state.bo, st); + panfrost_batch_read_rsrc(batch, rsrc, st); + panfrost_batch_add_bo(batch, view->state.bo, st); - return view->state.gpu; + return view->state.gpu; } #endif @@ -1665,155 +1640,150 @@ panfrost_create_sampler_view_bo(struct panfrost_sampler_view *so, struct pipe_context *pctx, struct pipe_resource *texture) { - struct panfrost_device *device = pan_device(pctx->screen); - struct panfrost_context *ctx = pan_context(pctx); - struct panfrost_resource *prsrc = (struct panfrost_resource *)texture; - enum pipe_format format = so->base.format; - assert(prsrc->image.data.bo); + struct panfrost_device *device = pan_device(pctx->screen); + struct panfrost_context *ctx = pan_context(pctx); + struct panfrost_resource *prsrc = (struct panfrost_resource *)texture; + enum pipe_format format = so->base.format; + assert(prsrc->image.data.bo); - /* Format to access the stencil/depth portion of a Z32_S8 texture */ - if (format == PIPE_FORMAT_X32_S8X24_UINT) { - assert(prsrc->separate_stencil); - texture = &prsrc->separate_stencil->base; - prsrc = (struct panfrost_resource *)texture; - format = texture->format; - } else if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) { - format = PIPE_FORMAT_Z32_FLOAT; - } + /* Format to access the stencil/depth portion of a Z32_S8 texture */ + if (format == PIPE_FORMAT_X32_S8X24_UINT) { + assert(prsrc->separate_stencil); + texture = &prsrc->separate_stencil->base; + prsrc = (struct panfrost_resource *)texture; + format = texture->format; + } else if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) { + format = PIPE_FORMAT_Z32_FLOAT; + } - so->texture_bo = prsrc->image.data.bo->ptr.gpu; - so->modifier = prsrc->image.layout.modifier; + so->texture_bo = prsrc->image.data.bo->ptr.gpu; + so->modifier = prsrc->image.layout.modifier; - /* MSAA only supported for 2D textures */ + /* MSAA only supported for 2D textures */ - assert(texture->nr_samples <= 1 || - so->base.target == PIPE_TEXTURE_2D || - so->base.target == PIPE_TEXTURE_2D_ARRAY); + assert(texture->nr_samples <= 1 || so->base.target == PIPE_TEXTURE_2D || + so->base.target == PIPE_TEXTURE_2D_ARRAY); - enum mali_texture_dimension type = - panfrost_translate_texture_dimension(so->base.target); + enum mali_texture_dimension type = + panfrost_translate_texture_dimension(so->base.target); - bool is_buffer = (so->base.target == PIPE_BUFFER); + bool is_buffer = (so->base.target == PIPE_BUFFER); - unsigned first_level = is_buffer ? 0 : so->base.u.tex.first_level; - unsigned last_level = is_buffer ? 0 : so->base.u.tex.last_level; - unsigned first_layer = is_buffer ? 0 : so->base.u.tex.first_layer; - unsigned last_layer = is_buffer ? 0 : so->base.u.tex.last_layer; - unsigned buf_offset = is_buffer ? so->base.u.buf.offset : 0; - unsigned buf_size = (is_buffer ? so->base.u.buf.size : 0) / - util_format_get_blocksize(format); + unsigned first_level = is_buffer ? 0 : so->base.u.tex.first_level; + unsigned last_level = is_buffer ? 
0 : so->base.u.tex.last_level; + unsigned first_layer = is_buffer ? 0 : so->base.u.tex.first_layer; + unsigned last_layer = is_buffer ? 0 : so->base.u.tex.last_layer; + unsigned buf_offset = is_buffer ? so->base.u.buf.offset : 0; + unsigned buf_size = + (is_buffer ? so->base.u.buf.size : 0) / util_format_get_blocksize(format); - if (so->base.target == PIPE_TEXTURE_3D) { - first_layer /= prsrc->image.layout.depth; - last_layer /= prsrc->image.layout.depth; - assert(!first_layer && !last_layer); - } + if (so->base.target == PIPE_TEXTURE_3D) { + first_layer /= prsrc->image.layout.depth; + last_layer /= prsrc->image.layout.depth; + assert(!first_layer && !last_layer); + } - struct pan_image_view iview = { - .format = format, - .dim = type, - .first_level = first_level, - .last_level = last_level, - .first_layer = first_layer, - .last_layer = last_layer, - .swizzle = { - so->base.swizzle_r, - so->base.swizzle_g, - so->base.swizzle_b, - so->base.swizzle_a, - }, - .image = &prsrc->image, + struct pan_image_view iview = { + .format = format, + .dim = type, + .first_level = first_level, + .last_level = last_level, + .first_layer = first_layer, + .last_layer = last_layer, + .swizzle = + { + so->base.swizzle_r, + so->base.swizzle_g, + so->base.swizzle_b, + so->base.swizzle_a, + }, + .image = &prsrc->image, - .buf.offset = buf_offset, - .buf.size = buf_size, - }; + .buf.offset = buf_offset, + .buf.size = buf_size, + }; - unsigned size = - (PAN_ARCH <= 5 ? pan_size(TEXTURE) : 0) + - GENX(panfrost_estimate_texture_payload_size)(&iview); + unsigned size = (PAN_ARCH <= 5 ? pan_size(TEXTURE) : 0) + + GENX(panfrost_estimate_texture_payload_size)(&iview); - struct panfrost_pool *pool = so->pool ?: &ctx->descs; - struct panfrost_ptr payload = pan_pool_alloc_aligned(&pool->base, size, 64); - so->state = panfrost_pool_take_ref(&ctx->descs, payload.gpu); + struct panfrost_pool *pool = so->pool ?: &ctx->descs; + struct panfrost_ptr payload = pan_pool_alloc_aligned(&pool->base, size, 64); + so->state = panfrost_pool_take_ref(&ctx->descs, payload.gpu); - void *tex = (PAN_ARCH >= 6) ? &so->bifrost_descriptor : payload.cpu; + void *tex = (PAN_ARCH >= 6) ? 
&so->bifrost_descriptor : payload.cpu; - if (PAN_ARCH <= 5) { - payload.cpu += pan_size(TEXTURE); - payload.gpu += pan_size(TEXTURE); - } + if (PAN_ARCH <= 5) { + payload.cpu += pan_size(TEXTURE); + payload.gpu += pan_size(TEXTURE); + } - GENX(panfrost_new_texture)(device, &iview, tex, &payload); + GENX(panfrost_new_texture)(device, &iview, tex, &payload); } static void panfrost_update_sampler_view(struct panfrost_sampler_view *view, struct pipe_context *pctx) { - struct panfrost_resource *rsrc = pan_resource(view->base.texture); - if (view->texture_bo != rsrc->image.data.bo->ptr.gpu || - view->modifier != rsrc->image.layout.modifier) { - panfrost_bo_unreference(view->state.bo); - panfrost_create_sampler_view_bo(view, pctx, &rsrc->base); - } + struct panfrost_resource *rsrc = pan_resource(view->base.texture); + if (view->texture_bo != rsrc->image.data.bo->ptr.gpu || + view->modifier != rsrc->image.layout.modifier) { + panfrost_bo_unreference(view->state.bo); + panfrost_create_sampler_view_bo(view, pctx, &rsrc->base); + } } static mali_ptr panfrost_emit_texture_descriptors(struct panfrost_batch *batch, enum pipe_shader_type stage) { - struct panfrost_context *ctx = batch->ctx; + struct panfrost_context *ctx = batch->ctx; - if (!ctx->sampler_view_count[stage]) - return 0; + if (!ctx->sampler_view_count[stage]) + return 0; #if PAN_ARCH >= 6 - struct panfrost_ptr T = - pan_pool_alloc_desc_array(&batch->pool.base, - ctx->sampler_view_count[stage], - TEXTURE); - struct mali_texture_packed *out = - (struct mali_texture_packed *) T.cpu; + struct panfrost_ptr T = pan_pool_alloc_desc_array( + &batch->pool.base, ctx->sampler_view_count[stage], TEXTURE); + struct mali_texture_packed *out = (struct mali_texture_packed *)T.cpu; - for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) { - struct panfrost_sampler_view *view = ctx->sampler_views[stage][i]; + for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) { + struct panfrost_sampler_view *view = ctx->sampler_views[stage][i]; - if (!view) { - memset(&out[i], 0, sizeof(out[i])); - continue; - } + if (!view) { + memset(&out[i], 0, sizeof(out[i])); + continue; + } - struct pipe_sampler_view *pview = &view->base; - struct panfrost_resource *rsrc = pan_resource(pview->texture); + struct pipe_sampler_view *pview = &view->base; + struct panfrost_resource *rsrc = pan_resource(pview->texture); - panfrost_update_sampler_view(view, &ctx->base); - out[i] = view->bifrost_descriptor; + panfrost_update_sampler_view(view, &ctx->base); + out[i] = view->bifrost_descriptor; - panfrost_batch_read_rsrc(batch, rsrc, stage); - panfrost_batch_add_bo(batch, view->state.bo, stage); - } + panfrost_batch_read_rsrc(batch, rsrc, stage); + panfrost_batch_add_bo(batch, view->state.bo, stage); + } - return T.gpu; + return T.gpu; #else - uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS]; + uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS]; - for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) { - struct panfrost_sampler_view *view = ctx->sampler_views[stage][i]; + for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) { + struct panfrost_sampler_view *view = ctx->sampler_views[stage][i]; - if (!view) { - trampolines[i] = 0; - continue; - } + if (!view) { + trampolines[i] = 0; + continue; + } - panfrost_update_sampler_view(view, &ctx->base); + panfrost_update_sampler_view(view, &ctx->base); - trampolines[i] = panfrost_get_tex_desc(batch, stage, view); - } + trampolines[i] = panfrost_get_tex_desc(batch, stage, view); + } - return 
pan_pool_upload_aligned(&batch->pool.base, trampolines, - sizeof(uint64_t) * - ctx->sampler_view_count[stage], - sizeof(uint64_t)); + return pan_pool_upload_aligned( + &batch->pool.base, trampolines, + sizeof(uint64_t) * ctx->sampler_view_count[stage], sizeof(uint64_t)); #endif } @@ -1821,60 +1791,59 @@ static mali_ptr panfrost_emit_sampler_descriptors(struct panfrost_batch *batch, enum pipe_shader_type stage) { - struct panfrost_context *ctx = batch->ctx; + struct panfrost_context *ctx = batch->ctx; - if (!ctx->sampler_count[stage]) - return 0; + if (!ctx->sampler_count[stage]) + return 0; - struct panfrost_ptr T = - pan_pool_alloc_desc_array(&batch->pool.base, - ctx->sampler_count[stage], - SAMPLER); - struct mali_sampler_packed *out = (struct mali_sampler_packed *) T.cpu; + struct panfrost_ptr T = pan_pool_alloc_desc_array( + &batch->pool.base, ctx->sampler_count[stage], SAMPLER); + struct mali_sampler_packed *out = (struct mali_sampler_packed *)T.cpu; - for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i) { - struct panfrost_sampler_state *st = ctx->samplers[stage][i]; + for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i) { + struct panfrost_sampler_state *st = ctx->samplers[stage][i]; - out[i] = st ? st->hw : (struct mali_sampler_packed){0}; - } + out[i] = st ? st->hw : (struct mali_sampler_packed){0}; + } - return T.gpu; + return T.gpu; } #if PAN_ARCH <= 7 /* Packs all image attribute descs and attribute buffer descs. - * `first_image_buf_index` must be the index of the first image attribute buffer descriptor. + * `first_image_buf_index` must be the index of the first image attribute buffer + * descriptor. */ static void emit_image_attribs(struct panfrost_context *ctx, enum pipe_shader_type shader, struct mali_attribute_packed *attribs, unsigned first_buf) { - struct panfrost_device *dev = pan_device(ctx->base.screen); - unsigned last_bit = util_last_bit(ctx->image_mask[shader]); + struct panfrost_device *dev = pan_device(ctx->base.screen); + unsigned last_bit = util_last_bit(ctx->image_mask[shader]); - for (unsigned i = 0; i < last_bit; ++i) { - enum pipe_format format = ctx->images[shader][i].format; + for (unsigned i = 0; i < last_bit; ++i) { + enum pipe_format format = ctx->images[shader][i].format; - pan_pack(attribs + i, ATTRIBUTE, cfg) { - /* Continuation record means 2 buffers per image */ - cfg.buffer_index = first_buf + (i * 2); - cfg.offset_enable = (PAN_ARCH <= 5); - cfg.format = dev->formats[format].hw; - } - } + pan_pack(attribs + i, ATTRIBUTE, cfg) { + /* Continuation record means 2 buffers per image */ + cfg.buffer_index = first_buf + (i * 2); + cfg.offset_enable = (PAN_ARCH <= 5); + cfg.format = dev->formats[format].hw; + } + } } static enum mali_attribute_type pan_modifier_to_attr_type(uint64_t modifier) { - switch (modifier) { - case DRM_FORMAT_MOD_LINEAR: - return MALI_ATTRIBUTE_TYPE_3D_LINEAR; - case DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED: - return MALI_ATTRIBUTE_TYPE_3D_INTERLEAVED; - default: - unreachable("Invalid modifier for attribute record"); - } + switch (modifier) { + case DRM_FORMAT_MOD_LINEAR: + return MALI_ATTRIBUTE_TYPE_3D_LINEAR; + case DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED: + return MALI_ATTRIBUTE_TYPE_3D_INTERLEAVED; + default: + unreachable("Invalid modifier for attribute record"); + } } static void @@ -1882,347 +1851,345 @@ emit_image_bufs(struct panfrost_batch *batch, enum pipe_shader_type shader, struct mali_attribute_buffer_packed *bufs, unsigned first_image_buf_index) { - struct panfrost_context *ctx = batch->ctx; - 
unsigned last_bit = util_last_bit(ctx->image_mask[shader]); + struct panfrost_context *ctx = batch->ctx; + unsigned last_bit = util_last_bit(ctx->image_mask[shader]); - for (unsigned i = 0; i < last_bit; ++i) { - struct pipe_image_view *image = &ctx->images[shader][i]; + for (unsigned i = 0; i < last_bit; ++i) { + struct pipe_image_view *image = &ctx->images[shader][i]; - if (!(ctx->image_mask[shader] & (1 << i)) || - !(image->shader_access & PIPE_IMAGE_ACCESS_READ_WRITE)) { - /* Unused image bindings */ - pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg); - pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER, cfg); - continue; - } + if (!(ctx->image_mask[shader] & (1 << i)) || + !(image->shader_access & PIPE_IMAGE_ACCESS_READ_WRITE)) { + /* Unused image bindings */ + pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg) + ; + pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER, cfg) + ; + continue; + } - struct panfrost_resource *rsrc = pan_resource(image->resource); + struct panfrost_resource *rsrc = pan_resource(image->resource); - /* TODO: MSAA */ - assert(image->resource->nr_samples <= 1 && "MSAA'd images not supported"); + /* TODO: MSAA */ + assert(image->resource->nr_samples <= 1 && "MSAA'd images not supported"); - bool is_3d = rsrc->base.target == PIPE_TEXTURE_3D; - bool is_buffer = rsrc->base.target == PIPE_BUFFER; + bool is_3d = rsrc->base.target == PIPE_TEXTURE_3D; + bool is_buffer = rsrc->base.target == PIPE_BUFFER; - unsigned offset = is_buffer ? image->u.buf.offset : - panfrost_texture_offset(&rsrc->image.layout, - image->u.tex.level, - is_3d ? 0 : image->u.tex.first_layer, - is_3d ? image->u.tex.first_layer : 0); + unsigned offset = is_buffer ? image->u.buf.offset + : panfrost_texture_offset( + &rsrc->image.layout, image->u.tex.level, + is_3d ? 0 : image->u.tex.first_layer, + is_3d ? image->u.tex.first_layer : 0); - panfrost_track_image_access(batch, shader, image); + panfrost_track_image_access(batch, shader, image); - pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg) { - cfg.type = pan_modifier_to_attr_type(rsrc->image.layout.modifier); - cfg.pointer = rsrc->image.data.bo->ptr.gpu + offset; - cfg.stride = util_format_get_blocksize(image->format); - cfg.size = rsrc->image.data.bo->size - offset; - } + pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg) { + cfg.type = pan_modifier_to_attr_type(rsrc->image.layout.modifier); + cfg.pointer = rsrc->image.data.bo->ptr.gpu + offset; + cfg.stride = util_format_get_blocksize(image->format); + cfg.size = rsrc->image.data.bo->size - offset; + } - if (is_buffer) { - pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) { - cfg.s_dimension = rsrc->base.width0 / - util_format_get_blocksize(image->format); - cfg.t_dimension = cfg.r_dimension = 1; - } + if (is_buffer) { + pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) { + cfg.s_dimension = + rsrc->base.width0 / util_format_get_blocksize(image->format); + cfg.t_dimension = cfg.r_dimension = 1; + } - continue; - } + continue; + } - pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) { - unsigned level = image->u.tex.level; + pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) { + unsigned level = image->u.tex.level; - cfg.s_dimension = u_minify(rsrc->base.width0, level); - cfg.t_dimension = u_minify(rsrc->base.height0, level); - cfg.r_dimension = is_3d ? 
- u_minify(rsrc->base.depth0, level) : - image->u.tex.last_layer - image->u.tex.first_layer + 1; + cfg.s_dimension = u_minify(rsrc->base.width0, level); + cfg.t_dimension = u_minify(rsrc->base.height0, level); + cfg.r_dimension = + is_3d ? u_minify(rsrc->base.depth0, level) + : image->u.tex.last_layer - image->u.tex.first_layer + 1; - cfg.row_stride = - rsrc->image.layout.slices[level].row_stride; + cfg.row_stride = rsrc->image.layout.slices[level].row_stride; - if (rsrc->base.target != PIPE_TEXTURE_2D) { - cfg.slice_stride = - panfrost_get_layer_stride(&rsrc->image.layout, - level); - } - } - } + if (rsrc->base.target != PIPE_TEXTURE_2D) { + cfg.slice_stride = + panfrost_get_layer_stride(&rsrc->image.layout, level); + } + } + } } static mali_ptr -panfrost_emit_image_attribs(struct panfrost_batch *batch, - mali_ptr *buffers, +panfrost_emit_image_attribs(struct panfrost_batch *batch, mali_ptr *buffers, enum pipe_shader_type type) { - struct panfrost_context *ctx = batch->ctx; - struct panfrost_compiled_shader *shader = ctx->prog[type]; + struct panfrost_context *ctx = batch->ctx; + struct panfrost_compiled_shader *shader = ctx->prog[type]; - if (!shader->info.attribute_count) { - *buffers = 0; - return 0; - } + if (!shader->info.attribute_count) { + *buffers = 0; + return 0; + } - /* Images always need a MALI_ATTRIBUTE_BUFFER_CONTINUATION_3D */ - unsigned attr_count = shader->info.attribute_count; - unsigned buf_count = (attr_count * 2) + (PAN_ARCH >= 6 ? 1 : 0); + /* Images always need a MALI_ATTRIBUTE_BUFFER_CONTINUATION_3D */ + unsigned attr_count = shader->info.attribute_count; + unsigned buf_count = (attr_count * 2) + (PAN_ARCH >= 6 ? 1 : 0); - struct panfrost_ptr bufs = - pan_pool_alloc_desc_array(&batch->pool.base, buf_count, ATTRIBUTE_BUFFER); + struct panfrost_ptr bufs = + pan_pool_alloc_desc_array(&batch->pool.base, buf_count, ATTRIBUTE_BUFFER); - struct panfrost_ptr attribs = - pan_pool_alloc_desc_array(&batch->pool.base, attr_count, ATTRIBUTE); + struct panfrost_ptr attribs = + pan_pool_alloc_desc_array(&batch->pool.base, attr_count, ATTRIBUTE); - emit_image_attribs(ctx, type, attribs.cpu, 0); - emit_image_bufs(batch, type, bufs.cpu, 0); + emit_image_attribs(ctx, type, attribs.cpu, 0); + emit_image_bufs(batch, type, bufs.cpu, 0); - /* We need an empty attrib buf to stop the prefetching on Bifrost */ + /* We need an empty attrib buf to stop the prefetching on Bifrost */ #if PAN_ARCH >= 6 - pan_pack(bufs.cpu + ((buf_count - 1) * pan_size(ATTRIBUTE_BUFFER)), - ATTRIBUTE_BUFFER, cfg); + pan_pack(bufs.cpu + ((buf_count - 1) * pan_size(ATTRIBUTE_BUFFER)), + ATTRIBUTE_BUFFER, cfg) + ; #endif - *buffers = bufs.gpu; - return attribs.gpu; + *buffers = bufs.gpu; + return attribs.gpu; } static mali_ptr -panfrost_emit_vertex_data(struct panfrost_batch *batch, - mali_ptr *buffers) +panfrost_emit_vertex_data(struct panfrost_batch *batch, mali_ptr *buffers) { - struct panfrost_context *ctx = batch->ctx; - struct panfrost_vertex_state *so = ctx->vertex; - struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX]; - bool instanced = ctx->instance_count > 1; - uint32_t image_mask = ctx->image_mask[PIPE_SHADER_VERTEX]; - unsigned nr_images = util_last_bit(image_mask); + struct panfrost_context *ctx = batch->ctx; + struct panfrost_vertex_state *so = ctx->vertex; + struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX]; + bool instanced = ctx->instance_count > 1; + uint32_t image_mask = ctx->image_mask[PIPE_SHADER_VERTEX]; + unsigned nr_images = util_last_bit(image_mask); - /* 
Worst case: everything is NPOT, which is only possible if instancing - * is enabled. Otherwise single record is gauranteed. - * Also, we allocate more memory than what's needed here if either instancing - * is enabled or images are present, this can be improved. */ - unsigned bufs_per_attrib = (instanced || nr_images > 0) ? 2 : 1; - unsigned nr_bufs = ((so->nr_bufs + nr_images) * bufs_per_attrib) + - (PAN_ARCH >= 6 ? 1 : 0); + /* Worst case: everything is NPOT, which is only possible if instancing + * is enabled. Otherwise single record is gauranteed. + * Also, we allocate more memory than what's needed here if either instancing + * is enabled or images are present, this can be improved. */ + unsigned bufs_per_attrib = (instanced || nr_images > 0) ? 2 : 1; + unsigned nr_bufs = + ((so->nr_bufs + nr_images) * bufs_per_attrib) + (PAN_ARCH >= 6 ? 1 : 0); - unsigned count = vs->info.attribute_count; + unsigned count = vs->info.attribute_count; - struct panfrost_compiled_shader *xfb = - ctx->uncompiled[PIPE_SHADER_VERTEX]->xfb; + struct panfrost_compiled_shader *xfb = + ctx->uncompiled[PIPE_SHADER_VERTEX]->xfb; - if (xfb) - count = MAX2(count, xfb->info.attribute_count); + if (xfb) + count = MAX2(count, xfb->info.attribute_count); #if PAN_ARCH <= 5 - /* Midgard needs vertexid/instanceid handled specially */ - bool special_vbufs = count >= PAN_VERTEX_ID; + /* Midgard needs vertexid/instanceid handled specially */ + bool special_vbufs = count >= PAN_VERTEX_ID; - if (special_vbufs) - nr_bufs += 2; + if (special_vbufs) + nr_bufs += 2; #endif - if (!nr_bufs) { - *buffers = 0; - return 0; - } + if (!nr_bufs) { + *buffers = 0; + return 0; + } - struct panfrost_ptr S = - pan_pool_alloc_desc_array(&batch->pool.base, nr_bufs, - ATTRIBUTE_BUFFER); - struct panfrost_ptr T = - pan_pool_alloc_desc_array(&batch->pool.base, count, - ATTRIBUTE); + struct panfrost_ptr S = + pan_pool_alloc_desc_array(&batch->pool.base, nr_bufs, ATTRIBUTE_BUFFER); + struct panfrost_ptr T = + pan_pool_alloc_desc_array(&batch->pool.base, count, ATTRIBUTE); - struct mali_attribute_buffer_packed *bufs = - (struct mali_attribute_buffer_packed *) S.cpu; + struct mali_attribute_buffer_packed *bufs = + (struct mali_attribute_buffer_packed *)S.cpu; - struct mali_attribute_packed *out = - (struct mali_attribute_packed *) T.cpu; + struct mali_attribute_packed *out = (struct mali_attribute_packed *)T.cpu; - unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 }; - unsigned k = 0; + unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = {0}; + unsigned k = 0; - for (unsigned i = 0; i < so->nr_bufs; ++i) { - unsigned vbi = so->buffers[i].vbi; - unsigned divisor = so->buffers[i].divisor; - attrib_to_buffer[i] = k; + for (unsigned i = 0; i < so->nr_bufs; ++i) { + unsigned vbi = so->buffers[i].vbi; + unsigned divisor = so->buffers[i].divisor; + attrib_to_buffer[i] = k; - if (!(ctx->vb_mask & (1 << vbi))) - continue; + if (!(ctx->vb_mask & (1 << vbi))) + continue; - struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi]; - struct panfrost_resource *rsrc; + struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi]; + struct panfrost_resource *rsrc; - rsrc = pan_resource(buf->buffer.resource); - if (!rsrc) - continue; + rsrc = pan_resource(buf->buffer.resource); + if (!rsrc) + continue; - panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX); + panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX); - /* Mask off lower bits, see offset fixup below */ - mali_ptr raw_addr = rsrc->image.data.bo->ptr.gpu + buf->buffer_offset; - mali_ptr addr = raw_addr & 
~63; + /* Mask off lower bits, see offset fixup below */ + mali_ptr raw_addr = rsrc->image.data.bo->ptr.gpu + buf->buffer_offset; + mali_ptr addr = raw_addr & ~63; - /* Since we advanced the base pointer, we shrink the buffer - * size, but add the offset we subtracted */ - unsigned size = rsrc->base.width0 + (raw_addr - addr) - - buf->buffer_offset; + /* Since we advanced the base pointer, we shrink the buffer + * size, but add the offset we subtracted */ + unsigned size = + rsrc->base.width0 + (raw_addr - addr) - buf->buffer_offset; - /* When there is a divisor, the hardware-level divisor is - * the product of the instance divisor and the padded count */ - unsigned stride = buf->stride; - unsigned hw_divisor = ctx->padded_count * divisor; + /* When there is a divisor, the hardware-level divisor is + * the product of the instance divisor and the padded count */ + unsigned stride = buf->stride; + unsigned hw_divisor = ctx->padded_count * divisor; - if (ctx->instance_count <= 1) { - /* Per-instance would be every attribute equal */ - if (divisor) - stride = 0; + if (ctx->instance_count <= 1) { + /* Per-instance would be every attribute equal */ + if (divisor) + stride = 0; - pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) { - cfg.pointer = addr; - cfg.stride = stride; - cfg.size = size; - } - } else if (!divisor) { - pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) { - cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS; - cfg.pointer = addr; - cfg.stride = stride; - cfg.size = size; - cfg.divisor = ctx->padded_count; - } - } else if (util_is_power_of_two_or_zero(hw_divisor)) { - pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) { - cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR; - cfg.pointer = addr; - cfg.stride = stride; - cfg.size = size; - cfg.divisor_r = __builtin_ctz(hw_divisor); - } + pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) { + cfg.pointer = addr; + cfg.stride = stride; + cfg.size = size; + } + } else if (!divisor) { + pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) { + cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS; + cfg.pointer = addr; + cfg.stride = stride; + cfg.size = size; + cfg.divisor = ctx->padded_count; + } + } else if (util_is_power_of_two_or_zero(hw_divisor)) { + pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) { + cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR; + cfg.pointer = addr; + cfg.stride = stride; + cfg.size = size; + cfg.divisor_r = __builtin_ctz(hw_divisor); + } - } else { - unsigned shift = 0, extra_flags = 0; + } else { + unsigned shift = 0, extra_flags = 0; - unsigned magic_divisor = - panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags); + unsigned magic_divisor = + panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags); - /* Records with continuations must be aligned */ - k = ALIGN_POT(k, 2); - attrib_to_buffer[i] = k; + /* Records with continuations must be aligned */ + k = ALIGN_POT(k, 2); + attrib_to_buffer[i] = k; - pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) { - cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR; - cfg.pointer = addr; - cfg.stride = stride; - cfg.size = size; + pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) { + cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR; + cfg.pointer = addr; + cfg.stride = stride; + cfg.size = size; - cfg.divisor_r = shift; - cfg.divisor_e = extra_flags; - } + cfg.divisor_r = shift; + cfg.divisor_e = extra_flags; + } - pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) { - cfg.divisor_numerator = magic_divisor; - cfg.divisor = divisor; - } + pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) { + cfg.divisor_numerator = 
magic_divisor; + cfg.divisor = divisor; + } - ++k; - } + ++k; + } - ++k; - } + ++k; + } #if PAN_ARCH <= 5 - /* Add special gl_VertexID/gl_InstanceID buffers */ - if (special_vbufs) { - panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1); + /* Add special gl_VertexID/gl_InstanceID buffers */ + if (special_vbufs) { + panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1); - pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) { - cfg.buffer_index = k++; - cfg.format = so->formats[PAN_VERTEX_ID]; - } + pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) { + cfg.buffer_index = k++; + cfg.format = so->formats[PAN_VERTEX_ID]; + } - panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1); + panfrost_instance_id(ctx->padded_count, &bufs[k], + ctx->instance_count > 1); - pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) { - cfg.buffer_index = k++; - cfg.format = so->formats[PAN_INSTANCE_ID]; - } - } + pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) { + cfg.buffer_index = k++; + cfg.format = so->formats[PAN_INSTANCE_ID]; + } + } #endif - if (nr_images) { - k = ALIGN_POT(k, 2); - emit_image_attribs(ctx, PIPE_SHADER_VERTEX, out + so->num_elements, k); - emit_image_bufs(batch, PIPE_SHADER_VERTEX, bufs + k, k); - k += (util_last_bit(ctx->image_mask[PIPE_SHADER_VERTEX]) * 2); - } + if (nr_images) { + k = ALIGN_POT(k, 2); + emit_image_attribs(ctx, PIPE_SHADER_VERTEX, out + so->num_elements, k); + emit_image_bufs(batch, PIPE_SHADER_VERTEX, bufs + k, k); + k += (util_last_bit(ctx->image_mask[PIPE_SHADER_VERTEX]) * 2); + } #if PAN_ARCH >= 6 - /* We need an empty attrib buf to stop the prefetching on Bifrost */ - pan_pack(&bufs[k], ATTRIBUTE_BUFFER, cfg); + /* We need an empty attrib buf to stop the prefetching on Bifrost */ + pan_pack(&bufs[k], ATTRIBUTE_BUFFER, cfg) + ; #endif - /* Attribute addresses require 64-byte alignment, so let: - * - * base' = base & ~63 = base - (base & 63) - * offset' = offset + (base & 63) - * - * Since base' + offset' = base + offset, these are equivalent - * addressing modes and now base is 64 aligned. - */ + /* Attribute addresses require 64-byte alignment, so let: + * + * base' = base & ~63 = base - (base & 63) + * offset' = offset + (base & 63) + * + * Since base' + offset' = base + offset, these are equivalent + * addressing modes and now base is 64 aligned. + */ - /* While these are usually equal, they are not required to be. In some - * cases, u_blitter passes too high a value for num_elements. - */ - assert(vs->info.attributes_read_count <= so->num_elements); + /* While these are usually equal, they are not required to be. In some + * cases, u_blitter passes too high a value for num_elements. 
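The 64-byte alignment fixup derived in the comment above can be checked in isolation. The following standalone sketch (plain C, not part of the patch; the address and offset values are made up) splits an arbitrary address into a 64-byte-aligned base and a compensating offset and verifies the invariant the attribute emission relies on: the effective address is unchanged and the new base is aligned.

#include <assert.h>
#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
   /* Hypothetical unaligned GPU address and element offset */
   uint64_t base = 0x1000f3b5;
   uint64_t offset = 17;

   /* base' = base & ~63, offset' = offset + (base & 63) */
   uint64_t base_aligned = base & ~63ull;
   uint64_t offset_fixed = offset + (base & 63);

   assert((base_aligned & 63) == 0);                     /* base' is 64-byte aligned */
   assert(base_aligned + offset_fixed == base + offset); /* same effective address */

   printf("base'=%#" PRIx64 " offset'=%" PRIu64 "\n", base_aligned, offset_fixed);
   return 0;
}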
+ */ + assert(vs->info.attributes_read_count <= so->num_elements); - for (unsigned i = 0; i < vs->info.attributes_read_count; ++i) { - unsigned vbi = so->pipe[i].vertex_buffer_index; - struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi]; + for (unsigned i = 0; i < vs->info.attributes_read_count; ++i) { + unsigned vbi = so->pipe[i].vertex_buffer_index; + struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi]; - /* BOs are aligned; just fixup for buffer_offset */ - signed src_offset = so->pipe[i].src_offset; - src_offset += (buf->buffer_offset & 63); + /* BOs are aligned; just fixup for buffer_offset */ + signed src_offset = so->pipe[i].src_offset; + src_offset += (buf->buffer_offset & 63); - /* Base instance offset */ - if (ctx->base_instance && so->pipe[i].instance_divisor) { - src_offset += (ctx->base_instance * buf->stride) / - so->pipe[i].instance_divisor; - } + /* Base instance offset */ + if (ctx->base_instance && so->pipe[i].instance_divisor) { + src_offset += + (ctx->base_instance * buf->stride) / so->pipe[i].instance_divisor; + } - /* Also, somewhat obscurely per-instance data needs to be - * offset in response to a delayed start in an indexed draw */ + /* Also, somewhat obscurely per-instance data needs to be + * offset in response to a delayed start in an indexed draw */ - if (so->pipe[i].instance_divisor && ctx->instance_count > 1) - src_offset -= buf->stride * ctx->offset_start; + if (so->pipe[i].instance_divisor && ctx->instance_count > 1) + src_offset -= buf->stride * ctx->offset_start; - pan_pack(out + i, ATTRIBUTE, cfg) { - cfg.buffer_index = attrib_to_buffer[so->element_buffer[i]]; - cfg.format = so->formats[i]; - cfg.offset = src_offset; - } - } + pan_pack(out + i, ATTRIBUTE, cfg) { + cfg.buffer_index = attrib_to_buffer[so->element_buffer[i]]; + cfg.format = so->formats[i]; + cfg.offset = src_offset; + } + } - *buffers = S.gpu; - return T.gpu; + *buffers = S.gpu; + return T.gpu; } static mali_ptr panfrost_emit_varyings(struct panfrost_batch *batch, - struct mali_attribute_buffer_packed *slot, - unsigned stride, unsigned count) + struct mali_attribute_buffer_packed *slot, + unsigned stride, unsigned count) { - unsigned size = stride * count; - mali_ptr ptr = - pan_pool_alloc_aligned(&batch->invisible_pool.base, size, 64).gpu; + unsigned size = stride * count; + mali_ptr ptr = + pan_pool_alloc_aligned(&batch->invisible_pool.base, size, 64).gpu; - pan_pack(slot, ATTRIBUTE_BUFFER, cfg) { - cfg.stride = stride; - cfg.size = size; - cfg.pointer = ptr; - } + pan_pack(slot, ATTRIBUTE_BUFFER, cfg) { + cfg.stride = stride; + cfg.size = size; + cfg.pointer = ptr; + } - return ptr; + return ptr; } /* Given a varying, figure out which index it corresponds to */ @@ -2230,7 +2197,7 @@ panfrost_emit_varyings(struct panfrost_batch *batch, static inline unsigned pan_varying_index(unsigned present, enum pan_special_varying v) { - return util_bitcount(present & BITFIELD_MASK(v)); + return util_bitcount(present & BITFIELD_MASK(v)); } /* Determines which varying buffers are required */ @@ -2238,59 +2205,58 @@ pan_varying_index(unsigned present, enum pan_special_varying v) static inline unsigned pan_varying_present(const struct panfrost_device *dev, struct pan_shader_info *producer, - struct pan_shader_info *consumer, - uint16_t point_coord_mask) + struct pan_shader_info *consumer, uint16_t point_coord_mask) { - /* At the moment we always emit general and position buffers. 
Not - * strictly necessary but usually harmless */ + /* At the moment we always emit general and position buffers. Not + * strictly necessary but usually harmless */ - unsigned present = BITFIELD_BIT(PAN_VARY_GENERAL) | BITFIELD_BIT(PAN_VARY_POSITION); + unsigned present = + BITFIELD_BIT(PAN_VARY_GENERAL) | BITFIELD_BIT(PAN_VARY_POSITION); - /* Enable special buffers by the shader info */ + /* Enable special buffers by the shader info */ - if (producer->vs.writes_point_size) - present |= BITFIELD_BIT(PAN_VARY_PSIZ); + if (producer->vs.writes_point_size) + present |= BITFIELD_BIT(PAN_VARY_PSIZ); #if PAN_ARCH <= 5 - /* On Midgard, these exist as real varyings. Later architectures use - * LD_VAR_SPECIAL reads instead. */ + /* On Midgard, these exist as real varyings. Later architectures use + * LD_VAR_SPECIAL reads instead. */ - if (consumer->fs.reads_point_coord) - present |= BITFIELD_BIT(PAN_VARY_PNTCOORD); + if (consumer->fs.reads_point_coord) + present |= BITFIELD_BIT(PAN_VARY_PNTCOORD); - if (consumer->fs.reads_face) - present |= BITFIELD_BIT(PAN_VARY_FACE); + if (consumer->fs.reads_face) + present |= BITFIELD_BIT(PAN_VARY_FACE); - if (consumer->fs.reads_frag_coord) - present |= BITFIELD_BIT(PAN_VARY_FRAGCOORD); + if (consumer->fs.reads_frag_coord) + present |= BITFIELD_BIT(PAN_VARY_FRAGCOORD); - /* Also, if we have a point sprite, we need a point coord buffer */ + /* Also, if we have a point sprite, we need a point coord buffer */ - for (unsigned i = 0; i < consumer->varyings.input_count; i++) { - gl_varying_slot loc = consumer->varyings.input[i].location; + for (unsigned i = 0; i < consumer->varyings.input_count; i++) { + gl_varying_slot loc = consumer->varyings.input[i].location; - if (util_varying_is_point_coord(loc, point_coord_mask)) - present |= BITFIELD_BIT(PAN_VARY_PNTCOORD); - } + if (util_varying_is_point_coord(loc, point_coord_mask)) + present |= BITFIELD_BIT(PAN_VARY_PNTCOORD); + } #endif - return present; + return present; } /* Emitters for varying records */ static void pan_emit_vary(const struct panfrost_device *dev, - struct mali_attribute_packed *out, - unsigned buffer_index, + struct mali_attribute_packed *out, unsigned buffer_index, mali_pixel_format format, unsigned offset) { - pan_pack(out, ATTRIBUTE, cfg) { - cfg.buffer_index = buffer_index; - cfg.offset_enable = (PAN_ARCH <= 5); - cfg.format = format; - cfg.offset = offset; - } + pan_pack(out, ATTRIBUTE, cfg) { + cfg.buffer_index = buffer_index; + cfg.offset_enable = (PAN_ARCH <= 5); + cfg.format = format; + cfg.offset = offset; + } } /* Special records */ @@ -2310,40 +2276,40 @@ static const struct { static mali_pixel_format pan_special_format(const struct panfrost_device *dev, - enum pan_special_varying buf) + enum pan_special_varying buf) { - assert(buf < PAN_VARY_MAX); - mali_pixel_format format = (pan_varying_formats[buf].format << 12); + assert(buf < PAN_VARY_MAX); + mali_pixel_format format = (pan_varying_formats[buf].format << 12); #if PAN_ARCH <= 6 - unsigned nr = pan_varying_formats[buf].components; - format |= panfrost_get_default_swizzle(nr); + unsigned nr = pan_varying_formats[buf].components; + format |= panfrost_get_default_swizzle(nr); #endif - return format; + return format; } static void pan_emit_vary_special(const struct panfrost_device *dev, - struct mali_attribute_packed *out, - unsigned present, enum pan_special_varying buf) + struct mali_attribute_packed *out, unsigned present, + enum pan_special_varying buf) { - pan_emit_vary(dev, out, pan_varying_index(present, buf), - 
pan_special_format(dev, buf), 0); + pan_emit_vary(dev, out, pan_varying_index(present, buf), + pan_special_format(dev, buf), 0); } /* Negative indicates a varying is not found */ static signed -pan_find_vary(const struct pan_shader_varying *vary, - unsigned vary_count, unsigned loc) +pan_find_vary(const struct pan_shader_varying *vary, unsigned vary_count, + unsigned loc) { - for (unsigned i = 0; i < vary_count; ++i) { - if (vary[i].location == loc) - return i; - } + for (unsigned i = 0; i < vary_count; ++i) { + if (vary[i].location == loc) + return i; + } - return -1; + return -1; } /* Assign varying locations for the general buffer. Returns the calculated @@ -2353,33 +2319,31 @@ pan_find_vary(const struct pan_shader_varying *vary, static unsigned pan_assign_varyings(const struct panfrost_device *dev, struct pan_shader_info *producer, - struct pan_shader_info *consumer, - signed *offsets) + struct pan_shader_info *consumer, signed *offsets) { - unsigned producer_count = producer->varyings.output_count; - unsigned consumer_count = consumer->varyings.input_count; + unsigned producer_count = producer->varyings.output_count; + unsigned consumer_count = consumer->varyings.input_count; - const struct pan_shader_varying *producer_vars = producer->varyings.output; - const struct pan_shader_varying *consumer_vars = consumer->varyings.input; + const struct pan_shader_varying *producer_vars = producer->varyings.output; + const struct pan_shader_varying *consumer_vars = consumer->varyings.input; - unsigned stride = 0; + unsigned stride = 0; - for (unsigned i = 0; i < producer_count; ++i) { - signed loc = pan_find_vary(consumer_vars, consumer_count, - producer_vars[i].location); - enum pipe_format format = loc >= 0 ? - consumer_vars[loc].format : - PIPE_FORMAT_NONE; + for (unsigned i = 0; i < producer_count; ++i) { + signed loc = pan_find_vary(consumer_vars, consumer_count, + producer_vars[i].location); + enum pipe_format format = + loc >= 0 ? consumer_vars[loc].format : PIPE_FORMAT_NONE; - if (format != PIPE_FORMAT_NONE) { - offsets[i] = stride; - stride += util_format_get_blocksize(format); - } else { - offsets[i] = -1; - } - } + if (format != PIPE_FORMAT_NONE) { + offsets[i] = stride; + stride += util_format_get_blocksize(format); + } else { + offsets[i] = -1; + } + } - return stride; + return stride; } /* Emitter for a single varying (attribute) descriptor */ @@ -2388,225 +2352,208 @@ static void panfrost_emit_varying(const struct panfrost_device *dev, struct mali_attribute_packed *out, const struct pan_shader_varying varying, - enum pipe_format pipe_format, - unsigned present, - uint16_t point_sprite_mask, - signed offset, + enum pipe_format pipe_format, unsigned present, + uint16_t point_sprite_mask, signed offset, enum pan_special_varying pos_varying) { - /* Note: varying.format != pipe_format in some obscure cases due to a - * limitation of the NIR linker. This should be fixed in the future to - * eliminate the additional lookups. See: - * dEQP-GLES3.functional.shaders.conditionals.if.sequence_statements_vertex - */ - gl_varying_slot loc = varying.location; - mali_pixel_format format = dev->formats[pipe_format].hw; + /* Note: varying.format != pipe_format in some obscure cases due to a + * limitation of the NIR linker. This should be fixed in the future to + * eliminate the additional lookups. 
See: + * dEQP-GLES3.functional.shaders.conditionals.if.sequence_statements_vertex + */ + gl_varying_slot loc = varying.location; + mali_pixel_format format = dev->formats[pipe_format].hw; - if (util_varying_is_point_coord(loc, point_sprite_mask)) { - pan_emit_vary_special(dev, out, present, PAN_VARY_PNTCOORD); - } else if (loc == VARYING_SLOT_POS) { - pan_emit_vary_special(dev, out, present, pos_varying); - } else if (loc == VARYING_SLOT_PSIZ) { - pan_emit_vary_special(dev, out, present, PAN_VARY_PSIZ); - } else if (loc == VARYING_SLOT_FACE) { - pan_emit_vary_special(dev, out, present, PAN_VARY_FACE); - } else if (offset < 0) { - pan_emit_vary(dev, out, 0, (MALI_CONSTANT << 12), 0); - } else { - STATIC_ASSERT(PAN_VARY_GENERAL == 0); - pan_emit_vary(dev, out, 0, format, offset); - } + if (util_varying_is_point_coord(loc, point_sprite_mask)) { + pan_emit_vary_special(dev, out, present, PAN_VARY_PNTCOORD); + } else if (loc == VARYING_SLOT_POS) { + pan_emit_vary_special(dev, out, present, pos_varying); + } else if (loc == VARYING_SLOT_PSIZ) { + pan_emit_vary_special(dev, out, present, PAN_VARY_PSIZ); + } else if (loc == VARYING_SLOT_FACE) { + pan_emit_vary_special(dev, out, present, PAN_VARY_FACE); + } else if (offset < 0) { + pan_emit_vary(dev, out, 0, (MALI_CONSTANT << 12), 0); + } else { + STATIC_ASSERT(PAN_VARY_GENERAL == 0); + pan_emit_vary(dev, out, 0, format, offset); + } } /* Links varyings and uploads ATTRIBUTE descriptors. Can execute at link time, * rather than draw time (under good conditions). */ static void -panfrost_emit_varying_descs( - struct panfrost_pool *pool, - struct panfrost_compiled_shader *producer, - struct panfrost_compiled_shader *consumer, - uint16_t point_coord_mask, - struct pan_linkage *out) +panfrost_emit_varying_descs(struct panfrost_pool *pool, + struct panfrost_compiled_shader *producer, + struct panfrost_compiled_shader *consumer, + uint16_t point_coord_mask, struct pan_linkage *out) { - struct panfrost_device *dev = pool->base.dev; - unsigned producer_count = producer->info.varyings.output_count; - unsigned consumer_count = consumer->info.varyings.input_count; + struct panfrost_device *dev = pool->base.dev; + unsigned producer_count = producer->info.varyings.output_count; + unsigned consumer_count = consumer->info.varyings.input_count; - /* Offsets within the general varying buffer, indexed by location */ - signed offsets[PAN_MAX_VARYINGS]; - assert(producer_count <= ARRAY_SIZE(offsets)); - assert(consumer_count <= ARRAY_SIZE(offsets)); + /* Offsets within the general varying buffer, indexed by location */ + signed offsets[PAN_MAX_VARYINGS]; + assert(producer_count <= ARRAY_SIZE(offsets)); + assert(consumer_count <= ARRAY_SIZE(offsets)); - /* Allocate enough descriptors for both shader stages */ - struct panfrost_ptr T = - pan_pool_alloc_desc_array(&pool->base, - producer_count + consumer_count, - ATTRIBUTE); + /* Allocate enough descriptors for both shader stages */ + struct panfrost_ptr T = pan_pool_alloc_desc_array( + &pool->base, producer_count + consumer_count, ATTRIBUTE); - /* Take a reference if we're being put on the CSO */ - if (!pool->owned) { - out->bo = pool->transient_bo; - panfrost_bo_reference(out->bo); - } + /* Take a reference if we're being put on the CSO */ + if (!pool->owned) { + out->bo = pool->transient_bo; + panfrost_bo_reference(out->bo); + } - struct mali_attribute_packed *descs = T.cpu; - out->producer = producer_count ? T.gpu : 0; - out->consumer = consumer_count ? 
T.gpu + - (pan_size(ATTRIBUTE) * producer_count) : 0; + struct mali_attribute_packed *descs = T.cpu; + out->producer = producer_count ? T.gpu : 0; + out->consumer = + consumer_count ? T.gpu + (pan_size(ATTRIBUTE) * producer_count) : 0; - /* Lay out the varyings. Must use producer to lay out, in order to - * respect transform feedback precisions. */ - out->present = pan_varying_present(dev, &producer->info, - &consumer->info, point_coord_mask); + /* Lay out the varyings. Must use producer to lay out, in order to + * respect transform feedback precisions. */ + out->present = pan_varying_present(dev, &producer->info, &consumer->info, + point_coord_mask); - out->stride = pan_assign_varyings(dev, &producer->info, - &consumer->info, offsets); + out->stride = + pan_assign_varyings(dev, &producer->info, &consumer->info, offsets); - for (unsigned i = 0; i < producer_count; ++i) { - signed j = pan_find_vary(consumer->info.varyings.input, - consumer->info.varyings.input_count, - producer->info.varyings.output[i].location); + for (unsigned i = 0; i < producer_count; ++i) { + signed j = pan_find_vary(consumer->info.varyings.input, + consumer->info.varyings.input_count, + producer->info.varyings.output[i].location); - enum pipe_format format = (j >= 0) ? - consumer->info.varyings.input[j].format : - producer->info.varyings.output[i].format; + enum pipe_format format = (j >= 0) + ? consumer->info.varyings.input[j].format + : producer->info.varyings.output[i].format; - panfrost_emit_varying(dev, descs + i, - producer->info.varyings.output[i], format, - out->present, 0, offsets[i], PAN_VARY_POSITION); - } + panfrost_emit_varying(dev, descs + i, producer->info.varyings.output[i], + format, out->present, 0, offsets[i], + PAN_VARY_POSITION); + } - for (unsigned i = 0; i < consumer_count; ++i) { - signed j = pan_find_vary(producer->info.varyings.output, - producer->info.varyings.output_count, - consumer->info.varyings.input[i].location); + for (unsigned i = 0; i < consumer_count; ++i) { + signed j = pan_find_vary(producer->info.varyings.output, + producer->info.varyings.output_count, + consumer->info.varyings.input[i].location); - signed offset = (j >= 0) ? offsets[j] : -1; + signed offset = (j >= 0) ? 
offsets[j] : -1; - panfrost_emit_varying(dev, descs + producer_count + i, - consumer->info.varyings.input[i], - consumer->info.varyings.input[i].format, - out->present, point_coord_mask, - offset, PAN_VARY_FRAGCOORD); - } + panfrost_emit_varying( + dev, descs + producer_count + i, consumer->info.varyings.input[i], + consumer->info.varyings.input[i].format, out->present, + point_coord_mask, offset, PAN_VARY_FRAGCOORD); + } } #if PAN_ARCH <= 5 static void pan_emit_special_input(struct mali_attribute_buffer_packed *out, - unsigned present, - enum pan_special_varying v, - unsigned special) + unsigned present, enum pan_special_varying v, + unsigned special) { - if (present & BITFIELD_BIT(v)) { - unsigned idx = pan_varying_index(present, v); + if (present & BITFIELD_BIT(v)) { + unsigned idx = pan_varying_index(present, v); - pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) { - cfg.special = special; - cfg.type = 0; - } - } + pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) { + cfg.special = special; + cfg.type = 0; + } + } } #endif static void panfrost_emit_varying_descriptor(struct panfrost_batch *batch, - unsigned vertex_count, - mali_ptr *vs_attribs, - mali_ptr *fs_attribs, - mali_ptr *buffers, - unsigned *buffer_count, - mali_ptr *position, - mali_ptr *psiz, - bool point_coord_replace) + unsigned vertex_count, mali_ptr *vs_attribs, + mali_ptr *fs_attribs, mali_ptr *buffers, + unsigned *buffer_count, mali_ptr *position, + mali_ptr *psiz, bool point_coord_replace) { - struct panfrost_context *ctx = batch->ctx; - struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX]; - struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT]; + struct panfrost_context *ctx = batch->ctx; + struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX]; + struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT]; - uint16_t point_coord_mask = 0; + uint16_t point_coord_mask = 0; #if PAN_ARCH <= 5 - struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; + struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; - /* Point sprites are lowered on Bifrost and newer */ - if (point_coord_replace) - point_coord_mask = ctx->rasterizer->base.sprite_coord_enable; + /* Point sprites are lowered on Bifrost and newer */ + if (point_coord_replace) + point_coord_mask = ctx->rasterizer->base.sprite_coord_enable; #endif - /* In good conditions, we only need to link varyings once */ - bool prelink = - (point_coord_mask == 0) && - !vs->info.separable && - !fs->info.separable; + /* In good conditions, we only need to link varyings once */ + bool prelink = + (point_coord_mask == 0) && !vs->info.separable && !fs->info.separable; - /* Try to reduce copies */ - struct pan_linkage _linkage; - struct pan_linkage *linkage = prelink ? &vs->linkage : &_linkage; + /* Try to reduce copies */ + struct pan_linkage _linkage; + struct pan_linkage *linkage = prelink ? &vs->linkage : &_linkage; - /* Emit ATTRIBUTE descriptors if needed */ - if (!prelink || vs->linkage.bo == NULL) { - struct panfrost_pool *pool = - prelink ? &ctx->descs : &batch->pool; + /* Emit ATTRIBUTE descriptors if needed */ + if (!prelink || vs->linkage.bo == NULL) { + struct panfrost_pool *pool = prelink ? 
&ctx->descs : &batch->pool; - panfrost_emit_varying_descs(pool, vs, fs, point_coord_mask, linkage); - } + panfrost_emit_varying_descs(pool, vs, fs, point_coord_mask, linkage); + } - unsigned present = linkage->present, stride = linkage->stride; - unsigned count = util_bitcount(present); - struct panfrost_ptr T = - pan_pool_alloc_desc_array(&batch->pool.base, - count + 1, - ATTRIBUTE_BUFFER); - struct mali_attribute_buffer_packed *varyings = - (struct mali_attribute_buffer_packed *) T.cpu; + unsigned present = linkage->present, stride = linkage->stride; + unsigned count = util_bitcount(present); + struct panfrost_ptr T = + pan_pool_alloc_desc_array(&batch->pool.base, count + 1, ATTRIBUTE_BUFFER); + struct mali_attribute_buffer_packed *varyings = + (struct mali_attribute_buffer_packed *)T.cpu; - if (buffer_count) - *buffer_count = count; + if (buffer_count) + *buffer_count = count; #if PAN_ARCH >= 6 - /* Suppress prefetch on Bifrost */ - memset(varyings + count, 0, sizeof(*varyings)); + /* Suppress prefetch on Bifrost */ + memset(varyings + count, 0, sizeof(*varyings)); #endif - if (stride) { - panfrost_emit_varyings(batch, - &varyings[pan_varying_index(present, PAN_VARY_GENERAL)], - stride, vertex_count); - } else { - /* The indirect draw code reads the stride field, make sure - * that it is initialised */ - memset(varyings + pan_varying_index(present, PAN_VARY_GENERAL), 0, - sizeof(*varyings)); - } + if (stride) { + panfrost_emit_varyings( + batch, &varyings[pan_varying_index(present, PAN_VARY_GENERAL)], stride, + vertex_count); + } else { + /* The indirect draw code reads the stride field, make sure + * that it is initialised */ + memset(varyings + pan_varying_index(present, PAN_VARY_GENERAL), 0, + sizeof(*varyings)); + } - /* fp32 vec4 gl_Position */ - *position = panfrost_emit_varyings(batch, - &varyings[pan_varying_index(present, PAN_VARY_POSITION)], - sizeof(float) * 4, vertex_count); + /* fp32 vec4 gl_Position */ + *position = panfrost_emit_varyings( + batch, &varyings[pan_varying_index(present, PAN_VARY_POSITION)], + sizeof(float) * 4, vertex_count); - if (present & BITFIELD_BIT(PAN_VARY_PSIZ)) { - *psiz = panfrost_emit_varyings(batch, - &varyings[pan_varying_index(present, PAN_VARY_PSIZ)], - 2, vertex_count); - } + if (present & BITFIELD_BIT(PAN_VARY_PSIZ)) { + *psiz = panfrost_emit_varyings( + batch, &varyings[pan_varying_index(present, PAN_VARY_PSIZ)], 2, + vertex_count); + } #if PAN_ARCH <= 5 - pan_emit_special_input(varyings, present, - PAN_VARY_PNTCOORD, - (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT) ? - MALI_ATTRIBUTE_SPECIAL_POINT_COORD_MAX_Y : - MALI_ATTRIBUTE_SPECIAL_POINT_COORD_MIN_Y); - pan_emit_special_input(varyings, present, PAN_VARY_FACE, - MALI_ATTRIBUTE_SPECIAL_FRONT_FACING); - pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, - MALI_ATTRIBUTE_SPECIAL_FRAG_COORD); + pan_emit_special_input( + varyings, present, PAN_VARY_PNTCOORD, + (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT) + ? 
MALI_ATTRIBUTE_SPECIAL_POINT_COORD_MAX_Y + : MALI_ATTRIBUTE_SPECIAL_POINT_COORD_MIN_Y); + pan_emit_special_input(varyings, present, PAN_VARY_FACE, + MALI_ATTRIBUTE_SPECIAL_FRONT_FACING); + pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, + MALI_ATTRIBUTE_SPECIAL_FRAG_COORD); #endif - *buffers = T.gpu; - *vs_attribs = linkage->producer; - *fs_attribs = linkage->consumer; + *buffers = T.gpu; + *vs_attribs = linkage->producer; + *fs_attribs = linkage->consumer; } /* @@ -2619,64 +2566,60 @@ panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch, const struct panfrost_ptr *vertex_job, const struct panfrost_ptr *tiler_job) { - unsigned vertex = panfrost_add_job(&batch->pool.base, &batch->scoreboard, - MALI_JOB_TYPE_VERTEX, false, false, - 0, 0, vertex_job, false); + unsigned vertex = panfrost_add_job(&batch->pool.base, &batch->scoreboard, + MALI_JOB_TYPE_VERTEX, false, false, 0, 0, + vertex_job, false); - panfrost_add_job(&batch->pool.base, &batch->scoreboard, - MALI_JOB_TYPE_TILER, false, false, - vertex, 0, tiler_job, false); + panfrost_add_job(&batch->pool.base, &batch->scoreboard, MALI_JOB_TYPE_TILER, + false, false, vertex, 0, tiler_job, false); } #endif static void emit_tls(struct panfrost_batch *batch) { - struct panfrost_device *dev = pan_device(batch->ctx->base.screen); + struct panfrost_device *dev = pan_device(batch->ctx->base.screen); - /* Emitted with the FB descriptor on Midgard. */ - if (PAN_ARCH <= 5 && batch->framebuffer.gpu) - return; + /* Emitted with the FB descriptor on Midgard. */ + if (PAN_ARCH <= 5 && batch->framebuffer.gpu) + return; - struct panfrost_bo *tls_bo = - batch->stack_size ? - panfrost_batch_get_scratchpad(batch, - batch->stack_size, - dev->thread_tls_alloc, - dev->core_id_range): - NULL; - struct pan_tls_info tls = { - .tls = { - .ptr = tls_bo ? tls_bo->ptr.gpu : 0, - .size = batch->stack_size, - }, - }; + struct panfrost_bo *tls_bo = + batch->stack_size ? panfrost_batch_get_scratchpad( + batch, batch->stack_size, dev->thread_tls_alloc, + dev->core_id_range) + : NULL; + struct pan_tls_info tls = { + .tls = + { + .ptr = tls_bo ? tls_bo->ptr.gpu : 0, + .size = batch->stack_size, + }, + }; - assert(batch->tls.cpu); - GENX(pan_emit_tls)(&tls, batch->tls.cpu); + assert(batch->tls.cpu); + GENX(pan_emit_tls)(&tls, batch->tls.cpu); } static void emit_fbd(struct panfrost_batch *batch, const struct pan_fb_info *fb) { - struct panfrost_device *dev = pan_device(batch->ctx->base.screen); - struct panfrost_bo *tls_bo = - batch->stack_size ? - panfrost_batch_get_scratchpad(batch, - batch->stack_size, - dev->thread_tls_alloc, - dev->core_id_range): - NULL; - struct pan_tls_info tls = { - .tls = { - .ptr = tls_bo ? tls_bo->ptr.gpu : 0, - .size = batch->stack_size, - }, - }; + struct panfrost_device *dev = pan_device(batch->ctx->base.screen); + struct panfrost_bo *tls_bo = + batch->stack_size ? panfrost_batch_get_scratchpad( + batch, batch->stack_size, dev->thread_tls_alloc, + dev->core_id_range) + : NULL; + struct pan_tls_info tls = { + .tls = + { + .ptr = tls_bo ? 
tls_bo->ptr.gpu : 0, + .size = batch->stack_size, + }, + }; - batch->framebuffer.gpu |= - GENX(pan_emit_fbd)(dev, fb, &tls, &batch->tiler_ctx, - batch->framebuffer.cpu); + batch->framebuffer.gpu |= GENX(pan_emit_fbd)( + dev, fb, &tls, &batch->tiler_ctx, batch->framebuffer.cpu); } /* Mark a surface as written */ @@ -2685,10 +2628,10 @@ static void panfrost_initialize_surface(struct panfrost_batch *batch, struct pipe_surface *surf) { - if (surf) { - struct panfrost_resource *rsrc = pan_resource(surf->texture); - BITSET_SET(rsrc->valid.data, surf->u.tex.level); - } + if (surf) { + struct panfrost_resource *rsrc = pan_resource(surf->texture); + BITSET_SET(rsrc->valid.data, surf->u.tex.level); + } } /* Generate a fragment job. This should be called once per frame. (Usually, @@ -2697,67 +2640,68 @@ panfrost_initialize_surface(struct panfrost_batch *batch, static mali_ptr emit_fragment_job(struct panfrost_batch *batch, const struct pan_fb_info *pfb) { - /* Mark the affected buffers as initialized, since we're writing to it. - * Also, add the surfaces we're writing to to the batch */ + /* Mark the affected buffers as initialized, since we're writing to it. + * Also, add the surfaces we're writing to to the batch */ - struct pipe_framebuffer_state *fb = &batch->key; + struct pipe_framebuffer_state *fb = &batch->key; - for (unsigned i = 0; i < fb->nr_cbufs; ++i) - panfrost_initialize_surface(batch, fb->cbufs[i]); + for (unsigned i = 0; i < fb->nr_cbufs; ++i) + panfrost_initialize_surface(batch, fb->cbufs[i]); - panfrost_initialize_surface(batch, fb->zsbuf); + panfrost_initialize_surface(batch, fb->zsbuf); - /* The passed tile coords can be out of range in some cases, so we need - * to clamp them to the framebuffer size to avoid a TILE_RANGE_FAULT. - * Theoretically we also need to clamp the coordinates positive, but we - * avoid that edge case as all four values are unsigned. Also, - * theoretically we could clamp the minima, but if that has to happen - * the asserts would fail anyway (since the maxima would get clamped - * and then be smaller than the minima). An edge case of sorts occurs - * when no scissors are added to draw, so by default min=~0 and max=0. - * But that can't happen if any actual drawing occurs (beyond a - * wallpaper reload), so this is again irrelevant in practice. */ + /* The passed tile coords can be out of range in some cases, so we need + * to clamp them to the framebuffer size to avoid a TILE_RANGE_FAULT. + * Theoretically we also need to clamp the coordinates positive, but we + * avoid that edge case as all four values are unsigned. Also, + * theoretically we could clamp the minima, but if that has to happen + * the asserts would fail anyway (since the maxima would get clamped + * and then be smaller than the minima). An edge case of sorts occurs + * when no scissors are added to draw, so by default min=~0 and max=0. + * But that can't happen if any actual drawing occurs (beyond a + * wallpaper reload), so this is again irrelevant in practice. */ - batch->maxx = MIN2(batch->maxx, fb->width); - batch->maxy = MIN2(batch->maxy, fb->height); + batch->maxx = MIN2(batch->maxx, fb->width); + batch->maxy = MIN2(batch->maxy, fb->height); - /* Rendering region must be at least 1x1; otherwise, there is nothing - * to do and the whole job chain should have been discarded. */ + /* Rendering region must be at least 1x1; otherwise, there is nothing + * to do and the whole job chain should have been discarded. 
*/ - assert(batch->maxx > batch->minx); - assert(batch->maxy > batch->miny); + assert(batch->maxx > batch->minx); + assert(batch->maxy > batch->miny); - struct panfrost_ptr transfer = - pan_pool_alloc_desc(&batch->pool.base, FRAGMENT_JOB); + struct panfrost_ptr transfer = + pan_pool_alloc_desc(&batch->pool.base, FRAGMENT_JOB); - GENX(pan_emit_fragment_job)(pfb, batch->framebuffer.gpu, - transfer.cpu); + GENX(pan_emit_fragment_job)(pfb, batch->framebuffer.gpu, transfer.cpu); - return transfer.gpu; + return transfer.gpu; } -#define DEFINE_CASE(c) case PIPE_PRIM_##c: return MALI_DRAW_MODE_##c; +#define DEFINE_CASE(c) \ + case PIPE_PRIM_##c: \ + return MALI_DRAW_MODE_##c; static uint8_t pan_draw_mode(enum pipe_prim_type mode) { - switch (mode) { - DEFINE_CASE(POINTS); - DEFINE_CASE(LINES); - DEFINE_CASE(LINE_LOOP); - DEFINE_CASE(LINE_STRIP); - DEFINE_CASE(TRIANGLES); - DEFINE_CASE(TRIANGLE_STRIP); - DEFINE_CASE(TRIANGLE_FAN); - DEFINE_CASE(QUADS); - DEFINE_CASE(POLYGON); + switch (mode) { + DEFINE_CASE(POINTS); + DEFINE_CASE(LINES); + DEFINE_CASE(LINE_LOOP); + DEFINE_CASE(LINE_STRIP); + DEFINE_CASE(TRIANGLES); + DEFINE_CASE(TRIANGLE_STRIP); + DEFINE_CASE(TRIANGLE_FAN); + DEFINE_CASE(QUADS); + DEFINE_CASE(POLYGON); #if PAN_ARCH <= 6 - DEFINE_CASE(QUAD_STRIP); + DEFINE_CASE(QUAD_STRIP); #endif - default: - unreachable("Invalid draw mode"); - } + default: + unreachable("Invalid draw mode"); + } } #undef DEFINE_CASE @@ -2766,61 +2710,60 @@ pan_draw_mode(enum pipe_prim_type mode) * transform feedback */ static void -panfrost_statistics_record( - struct panfrost_context *ctx, - const struct pipe_draw_info *info, - const struct pipe_draw_start_count_bias *draw) +panfrost_statistics_record(struct panfrost_context *ctx, + const struct pipe_draw_info *info, + const struct pipe_draw_start_count_bias *draw) { - if (!ctx->active_queries) - return; + if (!ctx->active_queries) + return; - uint32_t prims = u_prims_for_vertices(info->mode, draw->count); - ctx->prims_generated += prims; + uint32_t prims = u_prims_for_vertices(info->mode, draw->count); + ctx->prims_generated += prims; - if (!ctx->streamout.num_targets) - return; + if (!ctx->streamout.num_targets) + return; - ctx->tf_prims_generated += prims; - ctx->dirty |= PAN_DIRTY_SO; + ctx->tf_prims_generated += prims; + ctx->dirty |= PAN_DIRTY_SO; } static void panfrost_update_streamout_offsets(struct panfrost_context *ctx) { - unsigned count = u_stream_outputs_for_vertices(ctx->active_prim, - ctx->vertex_count); + unsigned count = + u_stream_outputs_for_vertices(ctx->active_prim, ctx->vertex_count); - for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) { - if (!ctx->streamout.targets[i]) - continue; + for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) { + if (!ctx->streamout.targets[i]) + continue; - pan_so_target(ctx->streamout.targets[i])->offset += count; - } + pan_so_target(ctx->streamout.targets[i])->offset += count; + } } static inline enum mali_index_type panfrost_translate_index_size(unsigned size) { - STATIC_ASSERT(MALI_INDEX_TYPE_NONE == 0); - STATIC_ASSERT(MALI_INDEX_TYPE_UINT8 == 1); - STATIC_ASSERT(MALI_INDEX_TYPE_UINT16 == 2); + STATIC_ASSERT(MALI_INDEX_TYPE_NONE == 0); + STATIC_ASSERT(MALI_INDEX_TYPE_UINT8 == 1); + STATIC_ASSERT(MALI_INDEX_TYPE_UINT16 == 2); - return (size == 4) ? MALI_INDEX_TYPE_UINT32 : size; + return (size == 4) ? 
MALI_INDEX_TYPE_UINT32 : size; } #if PAN_ARCH <= 7 static inline void -pan_emit_draw_descs(struct panfrost_batch *batch, - struct MALI_DRAW *d, enum pipe_shader_type st) +pan_emit_draw_descs(struct panfrost_batch *batch, struct MALI_DRAW *d, + enum pipe_shader_type st) { - d->offset_start = batch->ctx->offset_start; - d->instance_size = batch->ctx->instance_count > 1 ? - batch->ctx->padded_count : 1; + d->offset_start = batch->ctx->offset_start; + d->instance_size = + batch->ctx->instance_count > 1 ? batch->ctx->padded_count : 1; - d->uniform_buffers = batch->uniform_buffers[st]; - d->push_uniforms = batch->push_uniforms[st]; - d->textures = batch->textures[st]; - d->samplers = batch->samplers[st]; + d->uniform_buffers = batch->uniform_buffers[st]; + d->push_uniforms = batch->push_uniforms[st]; + d->textures = batch->textures[st]; + d->samplers = batch->samplers[st]; } static void @@ -2829,64 +2772,59 @@ panfrost_draw_emit_vertex_section(struct panfrost_batch *batch, mali_ptr attribs, mali_ptr attrib_bufs, void *section) { - pan_pack(section, DRAW, cfg) { - cfg.state = batch->rsd[PIPE_SHADER_VERTEX]; - cfg.attributes = attribs; - cfg.attribute_buffers = attrib_bufs; - cfg.varyings = vs_vary; - cfg.varying_buffers = vs_vary ? varyings : 0; - cfg.thread_storage = batch->tls.gpu; - pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_VERTEX); - } + pan_pack(section, DRAW, cfg) { + cfg.state = batch->rsd[PIPE_SHADER_VERTEX]; + cfg.attributes = attribs; + cfg.attribute_buffers = attrib_bufs; + cfg.varyings = vs_vary; + cfg.varying_buffers = vs_vary ? varyings : 0; + cfg.thread_storage = batch->tls.gpu; + pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_VERTEX); + } } static void panfrost_draw_emit_vertex(struct panfrost_batch *batch, const struct pipe_draw_info *info, - void *invocation_template, - mali_ptr vs_vary, mali_ptr varyings, - mali_ptr attribs, mali_ptr attrib_bufs, - void *job) + void *invocation_template, mali_ptr vs_vary, + mali_ptr varyings, mali_ptr attribs, + mali_ptr attrib_bufs, void *job) { - void *section = - pan_section_ptr(job, COMPUTE_JOB, INVOCATION); - memcpy(section, invocation_template, pan_size(INVOCATION)); + void *section = pan_section_ptr(job, COMPUTE_JOB, INVOCATION); + memcpy(section, invocation_template, pan_size(INVOCATION)); - pan_section_pack(job, COMPUTE_JOB, PARAMETERS, cfg) { - cfg.job_task_split = 5; - } + pan_section_pack(job, COMPUTE_JOB, PARAMETERS, cfg) { + cfg.job_task_split = 5; + } - section = pan_section_ptr(job, COMPUTE_JOB, DRAW); - panfrost_draw_emit_vertex_section(batch, vs_vary, varyings, - attribs, attrib_bufs, section); + section = pan_section_ptr(job, COMPUTE_JOB, DRAW); + panfrost_draw_emit_vertex_section(batch, vs_vary, varyings, attribs, + attrib_bufs, section); } #endif static void -panfrost_emit_primitive_size(struct panfrost_context *ctx, - bool points, mali_ptr size_array, - void *prim_size) +panfrost_emit_primitive_size(struct panfrost_context *ctx, bool points, + mali_ptr size_array, void *prim_size) { - struct panfrost_rasterizer *rast = ctx->rasterizer; + struct panfrost_rasterizer *rast = ctx->rasterizer; - pan_pack(prim_size, PRIMITIVE_SIZE, cfg) { - if (panfrost_writes_point_size(ctx)) { - cfg.size_array = size_array; - } else { - cfg.constant = points ? - rast->base.point_size : - rast->base.line_width; - } - } + pan_pack(prim_size, PRIMITIVE_SIZE, cfg) { + if (panfrost_writes_point_size(ctx)) { + cfg.size_array = size_array; + } else { + cfg.constant = points ? 
rast->base.point_size : rast->base.line_width; + } + } } static bool panfrost_is_implicit_prim_restart(const struct pipe_draw_info *info) { - /* As a reminder primitive_restart should always be checked before any - access to restart_index. */ - return info->primitive_restart && - info->restart_index == (unsigned)BITFIELD_MASK(info->index_size * 8); + /* As a reminder primitive_restart should always be checked before any + access to restart_index. */ + return info->primitive_restart && + info->restart_index == (unsigned)BITFIELD_MASK(info->index_size * 8); } /* On Bifrost and older, the Renderer State Descriptor aggregates many pieces of @@ -2900,128 +2838,125 @@ panfrost_is_implicit_prim_restart(const struct pipe_draw_info *info) * specified in the draw call descriptor, but must be considered when determing * early-Z state which is part of the RSD. */ -#define FRAGMENT_RSD_DIRTY_MASK ( \ - PAN_DIRTY_ZS | PAN_DIRTY_BLEND | PAN_DIRTY_MSAA | \ - PAN_DIRTY_RASTERIZER | PAN_DIRTY_OQ) +#define FRAGMENT_RSD_DIRTY_MASK \ + (PAN_DIRTY_ZS | PAN_DIRTY_BLEND | PAN_DIRTY_MSAA | PAN_DIRTY_RASTERIZER | \ + PAN_DIRTY_OQ) static inline void panfrost_update_shader_state(struct panfrost_batch *batch, enum pipe_shader_type st) { - struct panfrost_context *ctx = batch->ctx; - struct panfrost_compiled_shader *ss = ctx->prog[st]; + struct panfrost_context *ctx = batch->ctx; + struct panfrost_compiled_shader *ss = ctx->prog[st]; - bool frag = (st == PIPE_SHADER_FRAGMENT); - unsigned dirty_3d = ctx->dirty; - unsigned dirty = ctx->dirty_shader[st]; + bool frag = (st == PIPE_SHADER_FRAGMENT); + unsigned dirty_3d = ctx->dirty; + unsigned dirty = ctx->dirty_shader[st]; - if (dirty & PAN_DIRTY_STAGE_TEXTURE) { - batch->textures[st] = - panfrost_emit_texture_descriptors(batch, st); - } + if (dirty & PAN_DIRTY_STAGE_TEXTURE) { + batch->textures[st] = panfrost_emit_texture_descriptors(batch, st); + } - if (dirty & PAN_DIRTY_STAGE_SAMPLER) { - batch->samplers[st] = - panfrost_emit_sampler_descriptors(batch, st); - } + if (dirty & PAN_DIRTY_STAGE_SAMPLER) { + batch->samplers[st] = panfrost_emit_sampler_descriptors(batch, st); + } - /* On Bifrost and older, the fragment shader descriptor is fused - * together with the renderer state; the combined renderer state - * descriptor is emitted below. Otherwise, the shader descriptor is - * standalone and is emitted here. - */ - if ((dirty & PAN_DIRTY_STAGE_SHADER) && !((PAN_ARCH <= 7) && frag)) { - batch->rsd[st] = panfrost_emit_compute_shader_meta(batch, st); - } + /* On Bifrost and older, the fragment shader descriptor is fused + * together with the renderer state; the combined renderer state + * descriptor is emitted below. Otherwise, the shader descriptor is + * standalone and is emitted here. 
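The Bifrost-and-older path described here only rebuilds the combined renderer state when either the fragment shader itself or some state folded into the RSD is dirty. The standalone sketch below mimics that gating with made-up bit values (the DIRTY_* constants are hypothetical stand-ins, not the driver's PAN_DIRTY_* definitions) to show how a single mask test avoids re-emitting the descriptor on unrelated state changes.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical dirty bits, for illustration only */
enum {
   DIRTY_ZS = 1 << 0,
   DIRTY_BLEND = 1 << 1,
   DIRTY_MSAA = 1 << 2,
   DIRTY_RASTERIZER = 1 << 3,
   DIRTY_OQ = 1 << 4,
   DIRTY_VERTEX = 1 << 5,
};

enum { DIRTY_STAGE_SHADER = 1 << 0 };

#define FRAG_RSD_DIRTY_MASK \
   (DIRTY_ZS | DIRTY_BLEND | DIRTY_MSAA | DIRTY_RASTERIZER | DIRTY_OQ)

/* Re-emit the fused fragment RSD only when the shader or any fused state changed */
static bool
needs_frag_rsd(unsigned dirty_shader, unsigned dirty_3d)
{
   return (dirty_shader & DIRTY_STAGE_SHADER) || (dirty_3d & FRAG_RSD_DIRTY_MASK);
}

int
main(void)
{
   /* A vertex-buffer change alone does not touch the fragment RSD... */
   printf("vertex only : %d\n", needs_frag_rsd(0, DIRTY_VERTEX));
   /* ...but a blend change does, even with an unchanged shader. */
   printf("blend change: %d\n", needs_frag_rsd(0, DIRTY_BLEND));
   return 0;
}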
+ */ + if ((dirty & PAN_DIRTY_STAGE_SHADER) && !((PAN_ARCH <= 7) && frag)) { + batch->rsd[st] = panfrost_emit_compute_shader_meta(batch, st); + } #if PAN_ARCH >= 9 - if (dirty & PAN_DIRTY_STAGE_IMAGE) - batch->images[st] = panfrost_emit_images(batch, st); + if (dirty & PAN_DIRTY_STAGE_IMAGE) + batch->images[st] = panfrost_emit_images(batch, st); #endif - if ((dirty & ss->dirty_shader) || (dirty_3d & ss->dirty_3d)) { - batch->uniform_buffers[st] = panfrost_emit_const_buf(batch, st, - NULL, &batch->push_uniforms[st], NULL); - } + if ((dirty & ss->dirty_shader) || (dirty_3d & ss->dirty_3d)) { + batch->uniform_buffers[st] = panfrost_emit_const_buf( + batch, st, NULL, &batch->push_uniforms[st], NULL); + } #if PAN_ARCH <= 7 - /* On Bifrost and older, if the fragment shader changes OR any renderer - * state specified with the fragment shader, the whole renderer state - * descriptor is dirtied and must be reemited. - */ - if (frag && ((dirty & PAN_DIRTY_STAGE_SHADER) || - (dirty_3d & FRAGMENT_RSD_DIRTY_MASK))) { + /* On Bifrost and older, if the fragment shader changes OR any renderer + * state specified with the fragment shader, the whole renderer state + * descriptor is dirtied and must be reemited. + */ + if (frag && ((dirty & PAN_DIRTY_STAGE_SHADER) || + (dirty_3d & FRAGMENT_RSD_DIRTY_MASK))) { - batch->rsd[st] = panfrost_emit_frag_shader_meta(batch); - } + batch->rsd[st] = panfrost_emit_frag_shader_meta(batch); + } - if (frag && (dirty & PAN_DIRTY_STAGE_IMAGE)) { - batch->attribs[st] = panfrost_emit_image_attribs(batch, - &batch->attrib_bufs[st], st); - } + if (frag && (dirty & PAN_DIRTY_STAGE_IMAGE)) { + batch->attribs[st] = + panfrost_emit_image_attribs(batch, &batch->attrib_bufs[st], st); + } #endif } static inline void panfrost_update_state_3d(struct panfrost_batch *batch) { - struct panfrost_context *ctx = batch->ctx; - unsigned dirty = ctx->dirty; + struct panfrost_context *ctx = batch->ctx; + unsigned dirty = ctx->dirty; - if (dirty & PAN_DIRTY_TLS_SIZE) - panfrost_batch_adjust_stack_size(batch); + if (dirty & PAN_DIRTY_TLS_SIZE) + panfrost_batch_adjust_stack_size(batch); - if (dirty & PAN_DIRTY_BLEND) - panfrost_set_batch_masks_blend(batch); + if (dirty & PAN_DIRTY_BLEND) + panfrost_set_batch_masks_blend(batch); - if (dirty & PAN_DIRTY_ZS) - panfrost_set_batch_masks_zs(batch); + if (dirty & PAN_DIRTY_ZS) + panfrost_set_batch_masks_zs(batch); #if PAN_ARCH >= 9 - if ((dirty & (PAN_DIRTY_ZS | PAN_DIRTY_RASTERIZER)) || - (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & PAN_DIRTY_STAGE_SHADER)) - batch->depth_stencil = panfrost_emit_depth_stencil(batch); + if ((dirty & (PAN_DIRTY_ZS | PAN_DIRTY_RASTERIZER)) || + (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & PAN_DIRTY_STAGE_SHADER)) + batch->depth_stencil = panfrost_emit_depth_stencil(batch); - if (dirty & PAN_DIRTY_BLEND) - batch->blend = panfrost_emit_blend_valhall(batch); + if (dirty & PAN_DIRTY_BLEND) + batch->blend = panfrost_emit_blend_valhall(batch); - if (dirty & PAN_DIRTY_VERTEX) { - batch->attribs[PIPE_SHADER_VERTEX] = - panfrost_emit_vertex_data(batch); + if (dirty & PAN_DIRTY_VERTEX) { + batch->attribs[PIPE_SHADER_VERTEX] = panfrost_emit_vertex_data(batch); - batch->attrib_bufs[PIPE_SHADER_VERTEX] = - panfrost_emit_vertex_buffers(batch); - } + batch->attrib_bufs[PIPE_SHADER_VERTEX] = + panfrost_emit_vertex_buffers(batch); + } #endif } #if PAN_ARCH >= 6 static mali_ptr -panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, unsigned vertex_count) +panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, + unsigned 
vertex_count) { - struct panfrost_device *dev = pan_device(batch->ctx->base.screen); + struct panfrost_device *dev = pan_device(batch->ctx->base.screen); - if (!vertex_count) - return 0; + if (!vertex_count) + return 0; - if (batch->tiler_ctx.bifrost) - return batch->tiler_ctx.bifrost; + if (batch->tiler_ctx.bifrost) + return batch->tiler_ctx.bifrost; - struct panfrost_ptr t = - pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP); + struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP); - GENX(pan_emit_tiler_heap)(dev, t.cpu); + GENX(pan_emit_tiler_heap)(dev, t.cpu); - mali_ptr heap = t.gpu; + mali_ptr heap = t.gpu; - t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT); - GENX(pan_emit_tiler_ctx)(dev, batch->key.width, batch->key.height, - util_framebuffer_get_num_samples(&batch->key), - pan_tristate_get(batch->first_provoking_vertex), - heap, t.cpu); + t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT); + GENX(pan_emit_tiler_ctx) + (dev, batch->key.width, batch->key.height, + util_framebuffer_get_num_samples(&batch->key), + pan_tristate_get(batch->first_provoking_vertex), heap, t.cpu); - batch->tiler_ctx.bifrost = t.gpu; - return batch->tiler_ctx.bifrost; + batch->tiler_ctx.bifrost = t.gpu; + return batch->tiler_ctx.bifrost; } #endif @@ -3034,318 +2969,308 @@ panfrost_emit_primitive(struct panfrost_context *ctx, const struct pipe_draw_start_count_bias *draw, mali_ptr indices, bool secondary_shader, void *out) { - UNUSED struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; + UNUSED struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; - bool lines = (info->mode == PIPE_PRIM_LINES || - info->mode == PIPE_PRIM_LINE_LOOP || - info->mode == PIPE_PRIM_LINE_STRIP); + bool lines = + (info->mode == PIPE_PRIM_LINES || info->mode == PIPE_PRIM_LINE_LOOP || + info->mode == PIPE_PRIM_LINE_STRIP); - pan_pack(out, PRIMITIVE, cfg) { - cfg.draw_mode = pan_draw_mode(info->mode); - if (panfrost_writes_point_size(ctx)) - cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16; + pan_pack(out, PRIMITIVE, cfg) { + cfg.draw_mode = pan_draw_mode(info->mode); + if (panfrost_writes_point_size(ctx)) + cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16; #if PAN_ARCH <= 8 - /* For line primitives, PRIMITIVE.first_provoking_vertex must - * be set to true and the provoking vertex is selected with - * DRAW.flat_shading_vertex. - */ - if (lines) - cfg.first_provoking_vertex = true; - else - cfg.first_provoking_vertex = rast->flatshade_first; + /* For line primitives, PRIMITIVE.first_provoking_vertex must + * be set to true and the provoking vertex is selected with + * DRAW.flat_shading_vertex. 
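As this comment notes, line primitives always force PRIMITIVE.first_provoking_vertex on these GPUs and the real convention is then picked through DRAW.flat_shading_vertex. The snippet below is a minimal, driver-independent restatement of that decision; is_line_prim and flatshade_first are illustrative stand-ins for the primitive type and rasterizer state consulted in the code.

#include <stdbool.h>
#include <stdio.h>

/* Value written to PRIMITIVE.first_provoking_vertex: lines force 'true',
 * other primitives follow the rasterizer's flatshade_first setting. */
static bool
first_provoking_vertex(bool is_line_prim, bool flatshade_first)
{
   return is_line_prim ? true : flatshade_first;
}

int
main(void)
{
   printf("lines, last-vertex convention : %d\n", first_provoking_vertex(true, false));
   printf("tris,  last-vertex convention : %d\n", first_provoking_vertex(false, false));
   printf("tris,  first-vertex convention: %d\n", first_provoking_vertex(false, true));
   return 0;
}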
+ */ + if (lines) + cfg.first_provoking_vertex = true; + else + cfg.first_provoking_vertex = rast->flatshade_first; - if (panfrost_is_implicit_prim_restart(info)) { - cfg.primitive_restart = MALI_PRIMITIVE_RESTART_IMPLICIT; - } else if (info->primitive_restart) { - cfg.primitive_restart = MALI_PRIMITIVE_RESTART_EXPLICIT; - cfg.primitive_restart_index = info->restart_index; - } + if (panfrost_is_implicit_prim_restart(info)) { + cfg.primitive_restart = MALI_PRIMITIVE_RESTART_IMPLICIT; + } else if (info->primitive_restart) { + cfg.primitive_restart = MALI_PRIMITIVE_RESTART_EXPLICIT; + cfg.primitive_restart_index = info->restart_index; + } - cfg.job_task_split = 6; + cfg.job_task_split = 6; #else - struct panfrost_compiled_shader *fs = - ctx->prog[PIPE_SHADER_FRAGMENT]; + struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT]; - cfg.allow_rotating_primitives = !(lines || fs->info.bifrost.uses_flat_shading); - cfg.primitive_restart = info->primitive_restart; + cfg.allow_rotating_primitives = + !(lines || fs->info.bifrost.uses_flat_shading); + cfg.primitive_restart = info->primitive_restart; - /* Non-fixed restart indices should have been lowered */ - assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info)); + /* Non-fixed restart indices should have been lowered */ + assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info)); #endif - cfg.index_count = draw->count; - cfg.index_type = panfrost_translate_index_size(info->index_size); + cfg.index_count = draw->count; + cfg.index_type = panfrost_translate_index_size(info->index_size); - if (PAN_ARCH >= 9) { - /* Base vertex offset on Valhall is used for both - * indexed and non-indexed draws, in a simple way for - * either. Handle both cases. - */ - if (cfg.index_type) - cfg.base_vertex_offset = draw->index_bias; - else - cfg.base_vertex_offset = draw->start; + if (PAN_ARCH >= 9) { + /* Base vertex offset on Valhall is used for both + * indexed and non-indexed draws, in a simple way for + * either. Handle both cases. 
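The Valhall note above says one field covers both draw flavours, while older parts only use it for indexed draws. The following standalone sketch restates that selection; index_bias, start and offset_start mirror the draw parameters used here, the arch check is reduced to an integer, and the "0 for non-indexed on older GPUs" case simply reflects the field being left at its packed default in that path.

#include <stdio.h>

/* Pick the value written to PRIMITIVE.base_vertex_offset. On Valhall
 * (arch >= 9) the field applies to indexed and non-indexed draws alike;
 * earlier GPUs only use it for indexed draws and subtract the vertex
 * range start instead. */
static int
base_vertex_offset(int arch, int indexed, int index_bias, int start,
                   int offset_start)
{
   if (arch >= 9)
      return indexed ? index_bias : start;

   /* Non-indexed draws leave the field at its default (0) pre-Valhall */
   return indexed ? index_bias - offset_start : 0;
}

int
main(void)
{
   printf("v9, indexed     : %d\n", base_vertex_offset(9, 1, 5, 0, 0));
   printf("v9, non-indexed : %d\n", base_vertex_offset(9, 0, 0, 100, 0));
   printf("v7, indexed     : %d\n", base_vertex_offset(7, 1, 5, 0, 3));
   return 0;
}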
+ */ + if (cfg.index_type) + cfg.base_vertex_offset = draw->index_bias; + else + cfg.base_vertex_offset = draw->start; - /* Indices are moved outside the primitive descriptor - * on Valhall, so we don't need to set that here - */ - } else if (cfg.index_type) { - cfg.base_vertex_offset = draw->index_bias - ctx->offset_start; + /* Indices are moved outside the primitive descriptor + * on Valhall, so we don't need to set that here + */ + } else if (cfg.index_type) { + cfg.base_vertex_offset = draw->index_bias - ctx->offset_start; #if PAN_ARCH <= 7 - cfg.indices = indices; + cfg.indices = indices; #endif - } + } #if PAN_ARCH >= 6 - cfg.secondary_shader = secondary_shader; + cfg.secondary_shader = secondary_shader; #endif - } + } } #if PAN_ARCH >= 9 static mali_ptr panfrost_upload_wa_sampler(struct panfrost_batch *batch) { - struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, SAMPLER); - pan_pack(T.cpu, SAMPLER, cfg); - return T.gpu; + struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, SAMPLER); + pan_pack(T.cpu, SAMPLER, cfg) + ; + return T.gpu; } static mali_ptr panfrost_emit_resources(struct panfrost_batch *batch, - enum pipe_shader_type stage, - mali_ptr ubos, unsigned ubo_count) + enum pipe_shader_type stage, mali_ptr ubos, + unsigned ubo_count) { - struct panfrost_context *ctx = batch->ctx; - struct panfrost_ptr T; - unsigned nr_tables = 12; + struct panfrost_context *ctx = batch->ctx; + struct panfrost_ptr T; + unsigned nr_tables = 12; - /* Although individual resources need only 16 byte alignment, the - * resource table as a whole must be 64-byte aligned. - */ - T = pan_pool_alloc_aligned(&batch->pool.base, nr_tables * pan_size(RESOURCE), 64); - memset(T.cpu, 0, nr_tables * pan_size(RESOURCE)); + /* Although individual resources need only 16 byte alignment, the + * resource table as a whole must be 64-byte aligned. 
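A little further down, the resource-table pointer is returned as T.gpu | nr_tables, which only works because the 64-byte alignment required here guarantees the low six bits of the address are zero, leaving room for a small table count. A minimal standalone illustration of that encoding follows (the address value is hypothetical):

#include <assert.h>
#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
   uint64_t table_addr = 0x80001040; /* hypothetical 64-byte aligned GPU address */
   unsigned nr_tables = 12;

   assert((table_addr & 63) == 0); /* 64-byte alignment frees the low 6 bits */
   assert(nr_tables < 64);         /* so a count up to 63 fits alongside it */

   uint64_t packed = table_addr | nr_tables;

   /* The consumer can split the two fields back apart: */
   assert((packed & ~63ull) == table_addr);
   assert((packed & 63) == nr_tables);

   printf("packed = %#" PRIx64 "\n", packed);
   return 0;
}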
+ */ + T = pan_pool_alloc_aligned(&batch->pool.base, nr_tables * pan_size(RESOURCE), + 64); + memset(T.cpu, 0, nr_tables * pan_size(RESOURCE)); - panfrost_make_resource_table(T, PAN_TABLE_UBO, ubos, ubo_count); + panfrost_make_resource_table(T, PAN_TABLE_UBO, ubos, ubo_count); - panfrost_make_resource_table(T, PAN_TABLE_TEXTURE, - batch->textures[stage], - ctx->sampler_view_count[stage]); + panfrost_make_resource_table(T, PAN_TABLE_TEXTURE, batch->textures[stage], + ctx->sampler_view_count[stage]); + if (ctx->sampler_count[stage]) { + panfrost_make_resource_table(T, PAN_TABLE_SAMPLER, batch->samplers[stage], + ctx->sampler_count[stage]); + } else { + /* We always need at least 1 sampler for txf to work */ + panfrost_make_resource_table(T, PAN_TABLE_SAMPLER, + panfrost_upload_wa_sampler(batch), 1); + } - if (ctx->sampler_count[stage]) { - panfrost_make_resource_table(T, PAN_TABLE_SAMPLER, - batch->samplers[stage], - ctx->sampler_count[stage]); - } else { - /* We always need at least 1 sampler for txf to work */ - panfrost_make_resource_table(T, PAN_TABLE_SAMPLER, - panfrost_upload_wa_sampler(batch), - 1); - } + panfrost_make_resource_table(T, PAN_TABLE_IMAGE, batch->images[stage], + util_last_bit(ctx->image_mask[stage])); - panfrost_make_resource_table(T, PAN_TABLE_IMAGE, - batch->images[stage], - util_last_bit(ctx->image_mask[stage])); + if (stage == PIPE_SHADER_VERTEX) { + panfrost_make_resource_table(T, PAN_TABLE_ATTRIBUTE, + batch->attribs[stage], + ctx->vertex->num_elements); - if (stage == PIPE_SHADER_VERTEX) { - panfrost_make_resource_table(T, PAN_TABLE_ATTRIBUTE, - batch->attribs[stage], - ctx->vertex->num_elements); + panfrost_make_resource_table(T, PAN_TABLE_ATTRIBUTE_BUFFER, + batch->attrib_bufs[stage], + util_last_bit(ctx->vb_mask)); + } - panfrost_make_resource_table(T, PAN_TABLE_ATTRIBUTE_BUFFER, - batch->attrib_bufs[stage], - util_last_bit(ctx->vb_mask)); - } - - return T.gpu | nr_tables; + return T.gpu | nr_tables; } static void panfrost_emit_shader(struct panfrost_batch *batch, struct MALI_SHADER_ENVIRONMENT *cfg, - enum pipe_shader_type stage, - mali_ptr shader_ptr, + enum pipe_shader_type stage, mali_ptr shader_ptr, mali_ptr thread_storage) { - unsigned fau_words = 0, ubo_count = 0; - mali_ptr ubos, resources; + unsigned fau_words = 0, ubo_count = 0; + mali_ptr ubos, resources; - ubos = panfrost_emit_const_buf(batch, stage, &ubo_count, &cfg->fau, - &fau_words); + ubos = + panfrost_emit_const_buf(batch, stage, &ubo_count, &cfg->fau, &fau_words); - resources = panfrost_emit_resources(batch, stage, ubos, ubo_count); + resources = panfrost_emit_resources(batch, stage, ubos, ubo_count); - cfg->thread_storage = thread_storage; - cfg->shader = shader_ptr; - cfg->resources = resources; + cfg->thread_storage = thread_storage; + cfg->shader = shader_ptr; + cfg->resources = resources; - /* Each entry of FAU is 64-bits */ - cfg->fau_count = DIV_ROUND_UP(fau_words, 2); + /* Each entry of FAU is 64-bits */ + cfg->fau_count = DIV_ROUND_UP(fau_words, 2); } #endif static void -panfrost_emit_draw(void *out, - struct panfrost_batch *batch, - bool fs_required, - enum pipe_prim_type prim, - mali_ptr pos, mali_ptr fs_vary, mali_ptr varyings) +panfrost_emit_draw(void *out, struct panfrost_batch *batch, bool fs_required, + enum pipe_prim_type prim, mali_ptr pos, mali_ptr fs_vary, + mali_ptr varyings) { - struct panfrost_context *ctx = batch->ctx; - struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; - bool polygon = (prim == PIPE_PRIM_TRIANGLES); + struct panfrost_context *ctx = 
batch->ctx; + struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; + bool polygon = (prim == PIPE_PRIM_TRIANGLES); - pan_pack(out, DRAW, cfg) { - /* - * From the Gallium documentation, - * pipe_rasterizer_state::cull_face "indicates which faces of - * polygons to cull". Points and lines are not considered - * polygons and should be drawn even if all faces are culled. - * The hardware does not take primitive type into account when - * culling, so we need to do that check ourselves. - */ - cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT); - cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK); - cfg.front_face_ccw = rast->front_ccw; + pan_pack(out, DRAW, cfg) { + /* + * From the Gallium documentation, + * pipe_rasterizer_state::cull_face "indicates which faces of + * polygons to cull". Points and lines are not considered + * polygons and should be drawn even if all faces are culled. + * The hardware does not take primitive type into account when + * culling, so we need to do that check ourselves. + */ + cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT); + cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK); + cfg.front_face_ccw = rast->front_ccw; - if (ctx->occlusion_query && ctx->active_queries) { - if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER) - cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER; - else - cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE; + if (ctx->occlusion_query && ctx->active_queries) { + if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER) + cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER; + else + cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE; - struct panfrost_resource *rsrc = pan_resource(ctx->occlusion_query->rsrc); - cfg.occlusion = rsrc->image.data.bo->ptr.gpu; - panfrost_batch_write_rsrc(ctx->batch, rsrc, - PIPE_SHADER_FRAGMENT); - } + struct panfrost_resource *rsrc = + pan_resource(ctx->occlusion_query->rsrc); + cfg.occlusion = rsrc->image.data.bo->ptr.gpu; + panfrost_batch_write_rsrc(ctx->batch, rsrc, PIPE_SHADER_FRAGMENT); + } #if PAN_ARCH >= 9 - struct panfrost_compiled_shader *fs = - ctx->prog[PIPE_SHADER_FRAGMENT]; + struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT]; - cfg.multisample_enable = rast->multisample; - cfg.sample_mask = rast->multisample ? ctx->sample_mask : 0xFFFF; + cfg.multisample_enable = rast->multisample; + cfg.sample_mask = rast->multisample ? ctx->sample_mask : 0xFFFF; - /* Use per-sample shading if required by API Also use it when a - * blend shader is used with multisampling, as this is handled - * by a single ST_TILE in the blend shader with the current - * sample ID, requiring per-sample shading. - */ - cfg.evaluate_per_sample = - (rast->multisample && - ((ctx->min_samples > 1) || ctx->valhall_has_blend_shader)); + /* Use per-sample shading if required by API Also use it when a + * blend shader is used with multisampling, as this is handled + * by a single ST_TILE in the blend shader with the current + * sample ID, requiring per-sample shading. 
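+       * (Note that evaluate_per_sample is additionally OR'd with the
+       * fragment shader's own sample_shading flag further down, when a
+       * fragment shader is required.)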
+ */ + cfg.evaluate_per_sample = + (rast->multisample && + ((ctx->min_samples > 1) || ctx->valhall_has_blend_shader)); - cfg.single_sampled_lines = !rast->multisample; + cfg.single_sampled_lines = !rast->multisample; - cfg.vertex_array.packet = true; + cfg.vertex_array.packet = true; - cfg.minimum_z = batch->minimum_z; - cfg.maximum_z = batch->maximum_z; + cfg.minimum_z = batch->minimum_z; + cfg.maximum_z = batch->maximum_z; - cfg.depth_stencil = batch->depth_stencil; + cfg.depth_stencil = batch->depth_stencil; - if (fs_required) { - bool has_oq = ctx->occlusion_query && ctx->active_queries; + if (fs_required) { + bool has_oq = ctx->occlusion_query && ctx->active_queries; - struct pan_earlyzs_state earlyzs = - pan_earlyzs_get(fs->earlyzs, - ctx->depth_stencil->writes_zs || has_oq, - ctx->blend->base.alpha_to_coverage, - ctx->depth_stencil->zs_always_passes); + struct pan_earlyzs_state earlyzs = pan_earlyzs_get( + fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq, + ctx->blend->base.alpha_to_coverage, + ctx->depth_stencil->zs_always_passes); - cfg.pixel_kill_operation = earlyzs.kill; - cfg.zs_update_operation = earlyzs.update; + cfg.pixel_kill_operation = earlyzs.kill; + cfg.zs_update_operation = earlyzs.update; - cfg.allow_forward_pixel_to_kill = pan_allow_forward_pixel_to_kill(ctx, fs); - cfg.allow_forward_pixel_to_be_killed = !fs->info.writes_global; + cfg.allow_forward_pixel_to_kill = + pan_allow_forward_pixel_to_kill(ctx, fs); + cfg.allow_forward_pixel_to_be_killed = !fs->info.writes_global; - /* Mask of render targets that may be written. A render - * target may be written if the fragment shader writes - * to it AND it actually exists. If the render target - * doesn't actually exist, the blend descriptor will be - * OFF so it may be omitted from the mask. - * - * Only set when there is a fragment shader, since - * otherwise no colour updates are possible. - */ - cfg.render_target_mask = - (fs->info.outputs_written >> FRAG_RESULT_DATA0) & - ctx->fb_rt_mask; + /* Mask of render targets that may be written. A render + * target may be written if the fragment shader writes + * to it AND it actually exists. If the render target + * doesn't actually exist, the blend descriptor will be + * OFF so it may be omitted from the mask. + * + * Only set when there is a fragment shader, since + * otherwise no colour updates are possible. + */ + cfg.render_target_mask = + (fs->info.outputs_written >> FRAG_RESULT_DATA0) & ctx->fb_rt_mask; - /* Also use per-sample shading if required by the shader - */ - cfg.evaluate_per_sample |= fs->info.fs.sample_shading; + /* Also use per-sample shading if required by the shader + */ + cfg.evaluate_per_sample |= fs->info.fs.sample_shading; - /* Unlike Bifrost, alpha-to-coverage must be included in - * this identically-named flag. Confusing, isn't it? - */ - cfg.shader_modifies_coverage = fs->info.fs.writes_coverage || - fs->info.fs.can_discard || - ctx->blend->base.alpha_to_coverage; + /* Unlike Bifrost, alpha-to-coverage must be included in + * this identically-named flag. Confusing, isn't it? + */ + cfg.shader_modifies_coverage = fs->info.fs.writes_coverage || + fs->info.fs.can_discard || + ctx->blend->base.alpha_to_coverage; - /* Blend descriptors are only accessed by a BLEND - * instruction on Valhall. It follows that if the - * fragment shader is omitted, we may also emit the - * blend descriptors. 
- */ - cfg.blend = batch->blend; - cfg.blend_count = MAX2(batch->key.nr_cbufs, 1); - cfg.alpha_to_coverage = ctx->blend->base.alpha_to_coverage; + /* Blend descriptors are only accessed by a BLEND + * instruction on Valhall. It follows that if the + * fragment shader is omitted, we may also emit the + * blend descriptors. + */ + cfg.blend = batch->blend; + cfg.blend_count = MAX2(batch->key.nr_cbufs, 1); + cfg.alpha_to_coverage = ctx->blend->base.alpha_to_coverage; - cfg.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0); - cfg.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1); + cfg.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0); + cfg.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1); - panfrost_emit_shader(batch, &cfg.shader, PIPE_SHADER_FRAGMENT, - batch->rsd[PIPE_SHADER_FRAGMENT], - batch->tls.gpu); - } else { - /* These operations need to be FORCE to benefit from the - * depth-only pass optimizations. - */ - cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY; - cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY; + panfrost_emit_shader(batch, &cfg.shader, PIPE_SHADER_FRAGMENT, + batch->rsd[PIPE_SHADER_FRAGMENT], batch->tls.gpu); + } else { + /* These operations need to be FORCE to benefit from the + * depth-only pass optimizations. + */ + cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY; + cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY; - /* No shader and no blend => no shader or blend - * reasons to disable FPK. The only FPK-related state - * not covered is alpha-to-coverage which we don't set - * without blend. - */ - cfg.allow_forward_pixel_to_kill = true; + /* No shader and no blend => no shader or blend + * reasons to disable FPK. The only FPK-related state + * not covered is alpha-to-coverage which we don't set + * without blend. + */ + cfg.allow_forward_pixel_to_kill = true; - /* No shader => no shader side effects */ - cfg.allow_forward_pixel_to_be_killed = true; + /* No shader => no shader side effects */ + cfg.allow_forward_pixel_to_be_killed = true; - /* Alpha isn't written so these are vacuous */ - cfg.overdraw_alpha0 = true; - cfg.overdraw_alpha1 = true; - } + /* Alpha isn't written so these are vacuous */ + cfg.overdraw_alpha0 = true; + cfg.overdraw_alpha1 = true; + } #else - cfg.position = pos; - cfg.state = batch->rsd[PIPE_SHADER_FRAGMENT]; - cfg.attributes = batch->attribs[PIPE_SHADER_FRAGMENT]; - cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_FRAGMENT]; - cfg.viewport = batch->viewport; - cfg.varyings = fs_vary; - cfg.varying_buffers = fs_vary ? varyings : 0; - cfg.thread_storage = batch->tls.gpu; + cfg.position = pos; + cfg.state = batch->rsd[PIPE_SHADER_FRAGMENT]; + cfg.attributes = batch->attribs[PIPE_SHADER_FRAGMENT]; + cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_FRAGMENT]; + cfg.viewport = batch->viewport; + cfg.varyings = fs_vary; + cfg.varying_buffers = fs_vary ? varyings : 0; + cfg.thread_storage = batch->tls.gpu; - /* For all primitives but lines DRAW.flat_shading_vertex must - * be set to 0 and the provoking vertex is selected with the - * PRIMITIVE.first_provoking_vertex field. - */ - if (prim == PIPE_PRIM_LINES) { - /* The logic is inverted across arches. */ - cfg.flat_shading_vertex = rast->flatshade_first - ^ (PAN_ARCH <= 5); - } + /* For all primitives but lines DRAW.flat_shading_vertex must + * be set to 0 and the provoking vertex is selected with the + * PRIMITIVE.first_provoking_vertex field. + */ + if (prim == PIPE_PRIM_LINES) { + /* The logic is inverted across arches. 
*/ + cfg.flat_shading_vertex = rast->flatshade_first ^ (PAN_ARCH <= 5); + } - pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_FRAGMENT); + pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_FRAGMENT); #endif - } + } } #if PAN_ARCH >= 9 @@ -3353,90 +3278,90 @@ static void panfrost_emit_malloc_vertex(struct panfrost_batch *batch, const struct pipe_draw_info *info, const struct pipe_draw_start_count_bias *draw, - mali_ptr indices, bool secondary_shader, - void *job) + mali_ptr indices, bool secondary_shader, void *job) { - struct panfrost_context *ctx = batch->ctx; - struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX]; - struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT]; + struct panfrost_context *ctx = batch->ctx; + struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX]; + struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT]; - bool fs_required = panfrost_fs_required(fs, ctx->blend, - &ctx->pipe_framebuffer, - ctx->depth_stencil); + bool fs_required = panfrost_fs_required( + fs, ctx->blend, &ctx->pipe_framebuffer, ctx->depth_stencil); - /* Varying shaders only feed data to the fragment shader, so if we omit - * the fragment shader, we should omit the varying shader too. - */ - secondary_shader &= fs_required; + /* Varying shaders only feed data to the fragment shader, so if we omit + * the fragment shader, we should omit the varying shader too. + */ + secondary_shader &= fs_required; - panfrost_emit_primitive(ctx, info, draw, 0, secondary_shader, - pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE)); + panfrost_emit_primitive(ctx, info, draw, 0, secondary_shader, + pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE)); - pan_section_pack(job, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) { - cfg.count = info->instance_count; - } + pan_section_pack(job, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) { + cfg.count = info->instance_count; + } - pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) { - if (secondary_shader) { - unsigned v = vs->info.varyings.output_count; - unsigned f = fs->info.varyings.input_count; - unsigned slots = MAX2(v, f); - slots += util_bitcount(fs->key.fs.fixed_varying_mask); - unsigned size = slots * 16; + pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) { + if (secondary_shader) { + unsigned v = vs->info.varyings.output_count; + unsigned f = fs->info.varyings.input_count; + unsigned slots = MAX2(v, f); + slots += util_bitcount(fs->key.fs.fixed_varying_mask); + unsigned size = slots * 16; - /* Assumes 16 byte slots. We could do better. */ - cfg.vertex_packet_stride = size + 16; - cfg.vertex_attribute_stride = size; - } else { - /* Hardware requirement for "no varyings" */ - cfg.vertex_packet_stride = 16; - cfg.vertex_attribute_stride = 0; - } - } + /* Assumes 16 byte slots. We could do better. 
*/ + cfg.vertex_packet_stride = size + 16; + cfg.vertex_attribute_stride = size; + } else { + /* Hardware requirement for "no varyings" */ + cfg.vertex_packet_stride = 16; + cfg.vertex_attribute_stride = 0; + } + } - pan_section_pack(job, MALLOC_VERTEX_JOB, TILER, cfg) { - cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0); - } + pan_section_pack(job, MALLOC_VERTEX_JOB, TILER, cfg) { + cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0); + } - STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR)); - memcpy(pan_section_ptr(job, MALLOC_VERTEX_JOB, SCISSOR), - &batch->scissor, pan_size(SCISSOR)); + STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR)); + memcpy(pan_section_ptr(job, MALLOC_VERTEX_JOB, SCISSOR), &batch->scissor, + pan_size(SCISSOR)); - panfrost_emit_primitive_size(ctx, info->mode == PIPE_PRIM_POINTS, 0, - pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE_SIZE)); + panfrost_emit_primitive_size( + ctx, info->mode == PIPE_PRIM_POINTS, 0, + pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE_SIZE)); - pan_section_pack(job, MALLOC_VERTEX_JOB, INDICES, cfg) { - cfg.address = indices; - } + pan_section_pack(job, MALLOC_VERTEX_JOB, INDICES, cfg) { + cfg.address = indices; + } - panfrost_emit_draw(pan_section_ptr(job, MALLOC_VERTEX_JOB, DRAW), - batch, fs_required, u_reduced_prim(info->mode), 0, 0, 0); + panfrost_emit_draw(pan_section_ptr(job, MALLOC_VERTEX_JOB, DRAW), batch, + fs_required, u_reduced_prim(info->mode), 0, 0, 0); - pan_section_pack(job, MALLOC_VERTEX_JOB, POSITION, cfg) { - /* IDVS/points vertex shader */ - mali_ptr vs_ptr = batch->rsd[PIPE_SHADER_VERTEX]; + pan_section_pack(job, MALLOC_VERTEX_JOB, POSITION, cfg) { + /* IDVS/points vertex shader */ + mali_ptr vs_ptr = batch->rsd[PIPE_SHADER_VERTEX]; - /* IDVS/triangle vertex shader */ - if (vs_ptr && info->mode != PIPE_PRIM_POINTS) - vs_ptr += pan_size(SHADER_PROGRAM); + /* IDVS/triangle vertex shader */ + if (vs_ptr && info->mode != PIPE_PRIM_POINTS) + vs_ptr += pan_size(SHADER_PROGRAM); - panfrost_emit_shader(batch, &cfg, PIPE_SHADER_VERTEX, vs_ptr, - batch->tls.gpu); - } + panfrost_emit_shader(batch, &cfg, PIPE_SHADER_VERTEX, vs_ptr, + batch->tls.gpu); + } - pan_section_pack(job, MALLOC_VERTEX_JOB, VARYING, cfg) { - /* If a varying shader is used, we configure it with the same - * state as the position shader for backwards compatible - * behaviour with Bifrost. This could be optimized. - */ - if (!secondary_shader) continue; + pan_section_pack(job, MALLOC_VERTEX_JOB, VARYING, cfg) { + /* If a varying shader is used, we configure it with the same + * state as the position shader for backwards compatible + * behaviour with Bifrost. This could be optimized. 
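+       * (The varying shader's RSD sits at a fixed offset of
+       * 2 * pan_size(SHADER_PROGRAM) from the vertex RSD, as computed
+       * just below.)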
+ */ + if (!secondary_shader) + continue; - mali_ptr ptr = batch->rsd[PIPE_SHADER_VERTEX] + - (2 * pan_size(SHADER_PROGRAM)); + mali_ptr ptr = + batch->rsd[PIPE_SHADER_VERTEX] + (2 * pan_size(SHADER_PROGRAM)); - panfrost_emit_shader(batch, &cfg, PIPE_SHADER_VERTEX, - ptr, batch->tls.gpu); - } + panfrost_emit_shader(batch, &cfg, PIPE_SHADER_VERTEX, ptr, + batch->tls.gpu); + } } #endif @@ -3445,391 +3370,381 @@ static void panfrost_draw_emit_tiler(struct panfrost_batch *batch, const struct pipe_draw_info *info, const struct pipe_draw_start_count_bias *draw, - void *invocation_template, - mali_ptr indices, mali_ptr fs_vary, mali_ptr varyings, - mali_ptr pos, mali_ptr psiz, bool secondary_shader, - void *job) + void *invocation_template, mali_ptr indices, + mali_ptr fs_vary, mali_ptr varyings, mali_ptr pos, + mali_ptr psiz, bool secondary_shader, void *job) { - struct panfrost_context *ctx = batch->ctx; + struct panfrost_context *ctx = batch->ctx; - void *section = pan_section_ptr(job, TILER_JOB, INVOCATION); - memcpy(section, invocation_template, pan_size(INVOCATION)); + void *section = pan_section_ptr(job, TILER_JOB, INVOCATION); + memcpy(section, invocation_template, pan_size(INVOCATION)); - panfrost_emit_primitive(ctx, info, draw, indices, secondary_shader, - pan_section_ptr(job, TILER_JOB, PRIMITIVE)); + panfrost_emit_primitive(ctx, info, draw, indices, secondary_shader, + pan_section_ptr(job, TILER_JOB, PRIMITIVE)); - void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE); - enum pipe_prim_type prim = u_reduced_prim(info->mode); + void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE); + enum pipe_prim_type prim = u_reduced_prim(info->mode); #if PAN_ARCH >= 6 - pan_section_pack(job, TILER_JOB, TILER, cfg) { - cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0); - } + pan_section_pack(job, TILER_JOB, TILER, cfg) { + cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0); + } - pan_section_pack(job, TILER_JOB, PADDING, cfg); + pan_section_pack(job, TILER_JOB, PADDING, cfg) + ; #endif - panfrost_emit_draw(pan_section_ptr(job, TILER_JOB, DRAW), - batch, true, prim, pos, fs_vary, varyings); + panfrost_emit_draw(pan_section_ptr(job, TILER_JOB, DRAW), batch, true, prim, + pos, fs_vary, varyings); - panfrost_emit_primitive_size(ctx, prim == PIPE_PRIM_POINTS, psiz, prim_size); + panfrost_emit_primitive_size(ctx, prim == PIPE_PRIM_POINTS, psiz, prim_size); } #endif static void panfrost_launch_xfb(struct panfrost_batch *batch, - const struct pipe_draw_info *info, - mali_ptr attribs, mali_ptr attrib_bufs, - unsigned count) + const struct pipe_draw_info *info, mali_ptr attribs, + mali_ptr attrib_bufs, unsigned count) { - struct panfrost_context *ctx = batch->ctx; + struct panfrost_context *ctx = batch->ctx; - struct panfrost_ptr t = - pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); + struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); - /* Nothing to do */ - if (batch->ctx->streamout.num_targets == 0) - return; + /* Nothing to do */ + if (batch->ctx->streamout.num_targets == 0) + return; - /* TODO: XFB with index buffers */ - //assert(info->index_size == 0); - u_trim_pipe_prim(info->mode, &count); + /* TODO: XFB with index buffers */ + // assert(info->index_size == 0); + u_trim_pipe_prim(info->mode, &count); - if (count == 0) - return; + if (count == 0) + return; - perf_debug_ctx(batch->ctx, "Emulating transform feedback"); + perf_debug_ctx(batch->ctx, "Emulating transform feedback"); - struct panfrost_uncompiled_shader *vs_uncompiled = 
ctx->uncompiled[PIPE_SHADER_VERTEX]; - struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX]; + struct panfrost_uncompiled_shader *vs_uncompiled = + ctx->uncompiled[PIPE_SHADER_VERTEX]; + struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX]; - vs_uncompiled->xfb->stream_output = vs->stream_output; + vs_uncompiled->xfb->stream_output = vs->stream_output; - mali_ptr saved_rsd = batch->rsd[PIPE_SHADER_VERTEX]; - mali_ptr saved_ubo = batch->uniform_buffers[PIPE_SHADER_VERTEX]; - mali_ptr saved_push = batch->push_uniforms[PIPE_SHADER_VERTEX]; + mali_ptr saved_rsd = batch->rsd[PIPE_SHADER_VERTEX]; + mali_ptr saved_ubo = batch->uniform_buffers[PIPE_SHADER_VERTEX]; + mali_ptr saved_push = batch->push_uniforms[PIPE_SHADER_VERTEX]; - ctx->uncompiled[PIPE_SHADER_VERTEX] = NULL; /* should not be read */ - ctx->prog[PIPE_SHADER_VERTEX] = vs_uncompiled->xfb; - batch->rsd[PIPE_SHADER_VERTEX] = panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_VERTEX); + ctx->uncompiled[PIPE_SHADER_VERTEX] = NULL; /* should not be read */ + ctx->prog[PIPE_SHADER_VERTEX] = vs_uncompiled->xfb; + batch->rsd[PIPE_SHADER_VERTEX] = + panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_VERTEX); #if PAN_ARCH >= 9 - pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) { - cfg.workgroup_size_x = 1; - cfg.workgroup_size_y = 1; - cfg.workgroup_size_z = 1; + pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) { + cfg.workgroup_size_x = 1; + cfg.workgroup_size_y = 1; + cfg.workgroup_size_z = 1; - cfg.workgroup_count_x = count; - cfg.workgroup_count_y = info->instance_count; - cfg.workgroup_count_z = 1; + cfg.workgroup_count_x = count; + cfg.workgroup_count_y = info->instance_count; + cfg.workgroup_count_z = 1; - panfrost_emit_shader(batch, &cfg.compute, PIPE_SHADER_VERTEX, - batch->rsd[PIPE_SHADER_VERTEX], - batch->tls.gpu); + panfrost_emit_shader(batch, &cfg.compute, PIPE_SHADER_VERTEX, + batch->rsd[PIPE_SHADER_VERTEX], batch->tls.gpu); - /* TODO: Indexing. Also, this is a legacy feature... */ - cfg.compute.attribute_offset = batch->ctx->offset_start; + /* TODO: Indexing. Also, this is a legacy feature... */ + cfg.compute.attribute_offset = batch->ctx->offset_start; - /* Transform feedback shaders do not use barriers or shared - * memory, so we may merge workgroups. - */ - cfg.allow_merging_workgroups = true; - cfg.task_increment = 1; - cfg.task_axis = MALI_TASK_AXIS_Z; - } + /* Transform feedback shaders do not use barriers or shared + * memory, so we may merge workgroups. 
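+       * (Unlike the launch_grid path, which also has to check the
+       * variable shared size, this can be set unconditionally for the
+       * emulated XFB job.)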
+ */ + cfg.allow_merging_workgroups = true; + cfg.task_increment = 1; + cfg.task_axis = MALI_TASK_AXIS_Z; + } #else - struct mali_invocation_packed invocation; + struct mali_invocation_packed invocation; - panfrost_pack_work_groups_compute(&invocation, - 1, count, info->instance_count, - 1, 1, 1, PAN_ARCH <= 5, false); + panfrost_pack_work_groups_compute(&invocation, 1, count, + info->instance_count, 1, 1, 1, + PAN_ARCH <= 5, false); - batch->uniform_buffers[PIPE_SHADER_VERTEX] = - panfrost_emit_const_buf(batch, PIPE_SHADER_VERTEX, NULL, - &batch->push_uniforms[PIPE_SHADER_VERTEX], NULL); + batch->uniform_buffers[PIPE_SHADER_VERTEX] = + panfrost_emit_const_buf(batch, PIPE_SHADER_VERTEX, NULL, + &batch->push_uniforms[PIPE_SHADER_VERTEX], NULL); - panfrost_draw_emit_vertex(batch, info, &invocation, 0, 0, - attribs, attrib_bufs, t.cpu); + panfrost_draw_emit_vertex(batch, info, &invocation, 0, 0, attribs, + attrib_bufs, t.cpu); #endif - enum mali_job_type job_type = MALI_JOB_TYPE_COMPUTE; + enum mali_job_type job_type = MALI_JOB_TYPE_COMPUTE; #if PAN_ARCH <= 5 - job_type = MALI_JOB_TYPE_VERTEX; + job_type = MALI_JOB_TYPE_VERTEX; #endif - panfrost_add_job(&batch->pool.base, &batch->scoreboard, job_type, - true, false, 0, 0, &t, false); + panfrost_add_job(&batch->pool.base, &batch->scoreboard, job_type, true, + false, 0, 0, &t, false); - ctx->uncompiled[PIPE_SHADER_VERTEX] = vs_uncompiled; - ctx->prog[PIPE_SHADER_VERTEX] = vs; - batch->rsd[PIPE_SHADER_VERTEX] = saved_rsd; - batch->uniform_buffers[PIPE_SHADER_VERTEX] = saved_ubo; - batch->push_uniforms[PIPE_SHADER_VERTEX] = saved_push; + ctx->uncompiled[PIPE_SHADER_VERTEX] = vs_uncompiled; + ctx->prog[PIPE_SHADER_VERTEX] = vs; + batch->rsd[PIPE_SHADER_VERTEX] = saved_rsd; + batch->uniform_buffers[PIPE_SHADER_VERTEX] = saved_ubo; + batch->push_uniforms[PIPE_SHADER_VERTEX] = saved_push; } static void panfrost_direct_draw(struct panfrost_batch *batch, - const struct pipe_draw_info *info, - unsigned drawid_offset, + const struct pipe_draw_info *info, unsigned drawid_offset, const struct pipe_draw_start_count_bias *draw) { - if (!draw->count || !info->instance_count) - return; + if (!draw->count || !info->instance_count) + return; - struct panfrost_context *ctx = batch->ctx; + struct panfrost_context *ctx = batch->ctx; - /* If we change whether we're drawing points, or whether point sprites - * are enabled (specified in the rasterizer), we may need to rebind - * shaders accordingly. This implicitly covers the case of rebinding - * framebuffers, because all dirty flags are set there. - */ - if ((ctx->dirty & PAN_DIRTY_RASTERIZER) || - ((ctx->active_prim == PIPE_PRIM_POINTS) ^ - (info->mode == PIPE_PRIM_POINTS))) { + /* If we change whether we're drawing points, or whether point sprites + * are enabled (specified in the rasterizer), we may need to rebind + * shaders accordingly. This implicitly covers the case of rebinding + * framebuffers, because all dirty flags are set there. + */ + if ((ctx->dirty & PAN_DIRTY_RASTERIZER) || + ((ctx->active_prim == PIPE_PRIM_POINTS) ^ + (info->mode == PIPE_PRIM_POINTS))) { - ctx->active_prim = info->mode; - panfrost_update_shader_variant(ctx, PIPE_SHADER_FRAGMENT); - } + ctx->active_prim = info->mode; + panfrost_update_shader_variant(ctx, PIPE_SHADER_FRAGMENT); + } - /* Take into account a negative bias */ - ctx->vertex_count = draw->count + (info->index_size ? abs(draw->index_bias) : 0); - ctx->instance_count = info->instance_count; - ctx->base_vertex = info->index_size ? 
draw->index_bias : 0; - ctx->base_instance = info->start_instance; - ctx->active_prim = info->mode; - ctx->drawid = drawid_offset; + /* Take into account a negative bias */ + ctx->vertex_count = + draw->count + (info->index_size ? abs(draw->index_bias) : 0); + ctx->instance_count = info->instance_count; + ctx->base_vertex = info->index_size ? draw->index_bias : 0; + ctx->base_instance = info->start_instance; + ctx->active_prim = info->mode; + ctx->drawid = drawid_offset; - struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX]; + struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX]; - bool idvs = vs->info.vs.idvs; - bool secondary_shader = vs->info.vs.secondary_enable; + bool idvs = vs->info.vs.idvs; + bool secondary_shader = vs->info.vs.secondary_enable; - UNUSED struct panfrost_ptr tiler, vertex; + UNUSED struct panfrost_ptr tiler, vertex; - if (idvs) { + if (idvs) { #if PAN_ARCH >= 9 - tiler = pan_pool_alloc_desc(&batch->pool.base, MALLOC_VERTEX_JOB); + tiler = pan_pool_alloc_desc(&batch->pool.base, MALLOC_VERTEX_JOB); #elif PAN_ARCH >= 6 - tiler = pan_pool_alloc_desc(&batch->pool.base, INDEXED_VERTEX_JOB); + tiler = pan_pool_alloc_desc(&batch->pool.base, INDEXED_VERTEX_JOB); #else - unreachable("IDVS is unsupported on Midgard"); + unreachable("IDVS is unsupported on Midgard"); #endif - } else { - vertex = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); - tiler = pan_pool_alloc_desc(&batch->pool.base, TILER_JOB); - } + } else { + vertex = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); + tiler = pan_pool_alloc_desc(&batch->pool.base, TILER_JOB); + } - unsigned vertex_count = ctx->vertex_count; + unsigned vertex_count = ctx->vertex_count; - unsigned min_index = 0, max_index = 0; - mali_ptr indices = 0; + unsigned min_index = 0, max_index = 0; + mali_ptr indices = 0; - if (info->index_size && PAN_ARCH >= 9) { - indices = panfrost_get_index_buffer(batch, info, draw); - } else if (info->index_size) { - indices = panfrost_get_index_buffer_bounded(batch, info, draw, - &min_index, - &max_index); + if (info->index_size && PAN_ARCH >= 9) { + indices = panfrost_get_index_buffer(batch, info, draw); + } else if (info->index_size) { + indices = panfrost_get_index_buffer_bounded(batch, info, draw, &min_index, + &max_index); - /* Use the corresponding values */ - vertex_count = max_index - min_index + 1; - ctx->offset_start = min_index + draw->index_bias; - } else { - ctx->offset_start = draw->start; - } + /* Use the corresponding values */ + vertex_count = max_index - min_index + 1; + ctx->offset_start = min_index + draw->index_bias; + } else { + ctx->offset_start = draw->start; + } - if (info->instance_count > 1) { - unsigned count = vertex_count; + if (info->instance_count > 1) { + unsigned count = vertex_count; - /* Index-Driven Vertex Shading requires different instances to - * have different cache lines for position results. Each vertex - * position is 16 bytes and the Mali cache line is 64 bytes, so - * the instance count must be aligned to 4 vertices. - */ - if (idvs) - count = ALIGN_POT(count, 4); + /* Index-Driven Vertex Shading requires different instances to + * have different cache lines for position results. Each vertex + * position is 16 bytes and the Mali cache line is 64 bytes, so + * the instance count must be aligned to 4 vertices. 
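+       * (For example, an 11-vertex instance is padded to 12 vertices:
+       * 12 * 16 = 192 bytes, a whole number of 64-byte cache lines, so
+       * the next instance's positions start on a fresh line.)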
+ */ + if (idvs) + count = ALIGN_POT(count, 4); - ctx->padded_count = panfrost_padded_vertex_count(count); - } else - ctx->padded_count = vertex_count; + ctx->padded_count = panfrost_padded_vertex_count(count); + } else + ctx->padded_count = vertex_count; - panfrost_statistics_record(ctx, info, draw); + panfrost_statistics_record(ctx, info, draw); #if PAN_ARCH <= 7 - struct mali_invocation_packed invocation; - if (info->instance_count > 1) { - panfrost_pack_work_groups_compute(&invocation, - 1, vertex_count, info->instance_count, - 1, 1, 1, true, false); - } else { - pan_pack(&invocation, INVOCATION, cfg) { - cfg.invocations = MALI_POSITIVE(vertex_count); - cfg.size_y_shift = 0; - cfg.size_z_shift = 0; - cfg.workgroups_x_shift = 0; - cfg.workgroups_y_shift = 0; - cfg.workgroups_z_shift = 32; - cfg.thread_group_split = MALI_SPLIT_MIN_EFFICIENT; - } - } + struct mali_invocation_packed invocation; + if (info->instance_count > 1) { + panfrost_pack_work_groups_compute(&invocation, 1, vertex_count, + info->instance_count, 1, 1, 1, true, + false); + } else { + pan_pack(&invocation, INVOCATION, cfg) { + cfg.invocations = MALI_POSITIVE(vertex_count); + cfg.size_y_shift = 0; + cfg.size_z_shift = 0; + cfg.workgroups_x_shift = 0; + cfg.workgroups_y_shift = 0; + cfg.workgroups_z_shift = 32; + cfg.thread_group_split = MALI_SPLIT_MIN_EFFICIENT; + } + } - /* Emit all sort of descriptors. */ - mali_ptr varyings = 0, vs_vary = 0, fs_vary = 0, pos = 0, psiz = 0; + /* Emit all sort of descriptors. */ + mali_ptr varyings = 0, vs_vary = 0, fs_vary = 0, pos = 0, psiz = 0; - panfrost_emit_varying_descriptor(batch, - ctx->padded_count * - ctx->instance_count, - &vs_vary, &fs_vary, &varyings, - NULL, &pos, &psiz, - info->mode == PIPE_PRIM_POINTS); + panfrost_emit_varying_descriptor( + batch, ctx->padded_count * ctx->instance_count, &vs_vary, &fs_vary, + &varyings, NULL, &pos, &psiz, info->mode == PIPE_PRIM_POINTS); - mali_ptr attribs, attrib_bufs; - attribs = panfrost_emit_vertex_data(batch, &attrib_bufs); + mali_ptr attribs, attrib_bufs; + attribs = panfrost_emit_vertex_data(batch, &attrib_bufs); #endif - panfrost_update_state_3d(batch); - panfrost_update_shader_state(batch, PIPE_SHADER_VERTEX); - panfrost_update_shader_state(batch, PIPE_SHADER_FRAGMENT); - panfrost_clean_state_3d(ctx); + panfrost_update_state_3d(batch); + panfrost_update_shader_state(batch, PIPE_SHADER_VERTEX); + panfrost_update_shader_state(batch, PIPE_SHADER_FRAGMENT); + panfrost_clean_state_3d(ctx); - if (ctx->uncompiled[PIPE_SHADER_VERTEX]->xfb) { + if (ctx->uncompiled[PIPE_SHADER_VERTEX]->xfb) { #if PAN_ARCH >= 9 - mali_ptr attribs = 0, attrib_bufs = 0; + mali_ptr attribs = 0, attrib_bufs = 0; #endif - panfrost_launch_xfb(batch, info, attribs, attrib_bufs, draw->count); - } + panfrost_launch_xfb(batch, info, attribs, attrib_bufs, draw->count); + } - /* Increment transform feedback offsets */ - panfrost_update_streamout_offsets(ctx); + /* Increment transform feedback offsets */ + panfrost_update_streamout_offsets(ctx); - /* Any side effects must be handled by the XFB shader, so we only need - * to run vertex shaders if we need rasterization. - */ - if (panfrost_batch_skip_rasterization(batch)) - return; + /* Any side effects must be handled by the XFB shader, so we only need + * to run vertex shaders if we need rasterization. 
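+    * (panfrost_batch_skip_rasterization also covers the case where the
+    * scissor culls everything; see the viewport ordering note in
+    * panfrost_draw_vbo below.)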
+ */ + if (panfrost_batch_skip_rasterization(batch)) + return; #if PAN_ARCH >= 9 - assert(idvs && "Memory allocated IDVS required on Valhall"); + assert(idvs && "Memory allocated IDVS required on Valhall"); - panfrost_emit_malloc_vertex(batch, info, draw, indices, - secondary_shader, tiler.cpu); + panfrost_emit_malloc_vertex(batch, info, draw, indices, secondary_shader, + tiler.cpu); - panfrost_add_job(&batch->pool.base, &batch->scoreboard, - MALI_JOB_TYPE_MALLOC_VERTEX, false, false, 0, - 0, &tiler, false); + panfrost_add_job(&batch->pool.base, &batch->scoreboard, + MALI_JOB_TYPE_MALLOC_VERTEX, false, false, 0, 0, &tiler, + false); #else - /* Fire off the draw itself */ - panfrost_draw_emit_tiler(batch, info, draw, &invocation, indices, - fs_vary, varyings, pos, psiz, secondary_shader, - tiler.cpu); - if (idvs) { + /* Fire off the draw itself */ + panfrost_draw_emit_tiler(batch, info, draw, &invocation, indices, fs_vary, + varyings, pos, psiz, secondary_shader, tiler.cpu); + if (idvs) { #if PAN_ARCH >= 6 - panfrost_draw_emit_vertex_section(batch, - vs_vary, varyings, - attribs, attrib_bufs, - pan_section_ptr(tiler.cpu, INDEXED_VERTEX_JOB, VERTEX_DRAW)); + panfrost_draw_emit_vertex_section( + batch, vs_vary, varyings, attribs, attrib_bufs, + pan_section_ptr(tiler.cpu, INDEXED_VERTEX_JOB, VERTEX_DRAW)); - panfrost_add_job(&batch->pool.base, &batch->scoreboard, - MALI_JOB_TYPE_INDEXED_VERTEX, false, false, - 0, 0, &tiler, false); + panfrost_add_job(&batch->pool.base, &batch->scoreboard, + MALI_JOB_TYPE_INDEXED_VERTEX, false, false, 0, 0, &tiler, + false); #endif - } else { - panfrost_draw_emit_vertex(batch, info, &invocation, - vs_vary, varyings, attribs, attrib_bufs, vertex.cpu); - panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler); - } + } else { + panfrost_draw_emit_vertex(batch, info, &invocation, vs_vary, varyings, + attribs, attrib_bufs, vertex.cpu); + panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler); + } #endif } static bool -panfrost_compatible_batch_state(struct panfrost_batch *batch, - bool points) +panfrost_compatible_batch_state(struct panfrost_batch *batch, bool points) { - /* Only applies on Valhall */ - if (PAN_ARCH < 9) - return true; + /* Only applies on Valhall */ + if (PAN_ARCH < 9) + return true; - struct panfrost_context *ctx = batch->ctx; - struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; + struct panfrost_context *ctx = batch->ctx; + struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; - bool coord = (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT); - bool first = rast->flatshade_first; + bool coord = (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT); + bool first = rast->flatshade_first; - /* gl_PointCoord orientation only matters when drawing points, but - * provoking vertex doesn't matter for points. - */ - if (points) - return pan_tristate_set(&batch->sprite_coord_origin, coord); - else - return pan_tristate_set(&batch->first_provoking_vertex, first); + /* gl_PointCoord orientation only matters when drawing points, but + * provoking vertex doesn't matter for points. 
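+    * (If pan_tristate_set fails because the batch is already committed
+    * to the opposite value, panfrost_draw_vbo below falls back to a
+    * fresh batch, where setting it is guaranteed to succeed.)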
+ */ + if (points) + return pan_tristate_set(&batch->sprite_coord_origin, coord); + else + return pan_tristate_set(&batch->first_provoking_vertex, first); } static void -panfrost_draw_vbo(struct pipe_context *pipe, - const struct pipe_draw_info *info, +panfrost_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info, unsigned drawid_offset, const struct pipe_draw_indirect_info *indirect, const struct pipe_draw_start_count_bias *draws, unsigned num_draws) { - struct panfrost_context *ctx = pan_context(pipe); - struct panfrost_device *dev = pan_device(pipe->screen); + struct panfrost_context *ctx = pan_context(pipe); + struct panfrost_device *dev = pan_device(pipe->screen); - if (!panfrost_render_condition_check(ctx)) - return; + if (!panfrost_render_condition_check(ctx)) + return; - ctx->draw_calls++; + ctx->draw_calls++; - /* Emulate indirect draws on JM */ - if (indirect && indirect->buffer) { - assert(num_draws == 1); - util_draw_indirect(pipe, info, indirect); - perf_debug(dev, "Emulating indirect draw on the CPU"); - return; - } + /* Emulate indirect draws on JM */ + if (indirect && indirect->buffer) { + assert(num_draws == 1); + util_draw_indirect(pipe, info, indirect); + perf_debug(dev, "Emulating indirect draw on the CPU"); + return; + } - /* Do some common setup */ - struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); + /* Do some common setup */ + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); - /* Don't add too many jobs to a single batch. Hardware has a hard limit - * of 65536 jobs, but we choose a smaller soft limit (arbitrary) to - * avoid the risk of timeouts. This might not be a good idea. */ - if (unlikely(batch->scoreboard.job_index > 10000)) - batch = panfrost_get_fresh_batch_for_fbo(ctx, "Too many draws"); + /* Don't add too many jobs to a single batch. Hardware has a hard limit + * of 65536 jobs, but we choose a smaller soft limit (arbitrary) to + * avoid the risk of timeouts. This might not be a good idea. */ + if (unlikely(batch->scoreboard.job_index > 10000)) + batch = panfrost_get_fresh_batch_for_fbo(ctx, "Too many draws"); - bool points = (info->mode == PIPE_PRIM_POINTS); + bool points = (info->mode == PIPE_PRIM_POINTS); - if (unlikely(!panfrost_compatible_batch_state(batch, points))) { - batch = panfrost_get_fresh_batch_for_fbo(ctx, "State change"); + if (unlikely(!panfrost_compatible_batch_state(batch, points))) { + batch = panfrost_get_fresh_batch_for_fbo(ctx, "State change"); - ASSERTED bool succ = panfrost_compatible_batch_state(batch, points); - assert(succ && "must be able to set state for a fresh batch"); - } + ASSERTED bool succ = panfrost_compatible_batch_state(batch, points); + assert(succ && "must be able to set state for a fresh batch"); + } - /* panfrost_batch_skip_rasterization reads - * batch->scissor_culls_everything, which is set by - * panfrost_emit_viewport, so call that first. - */ - if (ctx->dirty & (PAN_DIRTY_VIEWPORT | PAN_DIRTY_SCISSOR)) - batch->viewport = panfrost_emit_viewport(batch); + /* panfrost_batch_skip_rasterization reads + * batch->scissor_culls_everything, which is set by + * panfrost_emit_viewport, so call that first. 
+ */ + if (ctx->dirty & (PAN_DIRTY_VIEWPORT | PAN_DIRTY_SCISSOR)) + batch->viewport = panfrost_emit_viewport(batch); - /* Mark everything dirty when debugging */ - if (unlikely(dev->debug & PAN_DBG_DIRTY)) - panfrost_dirty_state_all(ctx); + /* Mark everything dirty when debugging */ + if (unlikely(dev->debug & PAN_DBG_DIRTY)) + panfrost_dirty_state_all(ctx); - /* Conservatively assume draw parameters always change */ - ctx->dirty |= PAN_DIRTY_PARAMS | PAN_DIRTY_DRAWID; + /* Conservatively assume draw parameters always change */ + ctx->dirty |= PAN_DIRTY_PARAMS | PAN_DIRTY_DRAWID; - struct pipe_draw_info tmp_info = *info; - unsigned drawid = drawid_offset; + struct pipe_draw_info tmp_info = *info; + unsigned drawid = drawid_offset; - for (unsigned i = 0; i < num_draws; i++) { - panfrost_direct_draw(batch, &tmp_info, drawid, &draws[i]); - - if (tmp_info.increment_draw_id) { - ctx->dirty |= PAN_DIRTY_DRAWID; - drawid++; - } - } + for (unsigned i = 0; i < num_draws; i++) { + panfrost_direct_draw(batch, &tmp_info, drawid, &draws[i]); + if (tmp_info.increment_draw_id) { + ctx->dirty |= PAN_DIRTY_DRAWID; + drawid++; + } + } } /* Launch grid is the compute equivalent of draw_vbo, so in this routine, we @@ -3838,162 +3753,156 @@ panfrost_draw_vbo(struct pipe_context *pipe, static void panfrost_launch_grid(struct pipe_context *pipe, - const struct pipe_grid_info *info) + const struct pipe_grid_info *info) { - struct panfrost_context *ctx = pan_context(pipe); + struct panfrost_context *ctx = pan_context(pipe); - /* XXX - shouldn't be necessary with working memory barriers. Affected - * test: KHR-GLES31.core.compute_shader.pipeline-post-xfb */ - panfrost_flush_all_batches(ctx, "Launch grid pre-barrier"); + /* XXX - shouldn't be necessary with working memory barriers. 
Affected + * test: KHR-GLES31.core.compute_shader.pipeline-post-xfb */ + panfrost_flush_all_batches(ctx, "Launch grid pre-barrier"); - struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); - if (info->indirect && !PAN_GPU_INDIRECTS) { - struct pipe_transfer *transfer; - uint32_t *params = pipe_buffer_map_range(pipe, info->indirect, - info->indirect_offset, - 3 * sizeof(uint32_t), - PIPE_MAP_READ, - &transfer); + if (info->indirect && !PAN_GPU_INDIRECTS) { + struct pipe_transfer *transfer; + uint32_t *params = + pipe_buffer_map_range(pipe, info->indirect, info->indirect_offset, + 3 * sizeof(uint32_t), PIPE_MAP_READ, &transfer); - struct pipe_grid_info direct = *info; - direct.indirect = NULL; - direct.grid[0] = params[0]; - direct.grid[1] = params[1]; - direct.grid[2] = params[2]; - pipe_buffer_unmap(pipe, transfer); + struct pipe_grid_info direct = *info; + direct.indirect = NULL; + direct.grid[0] = params[0]; + direct.grid[1] = params[1]; + direct.grid[2] = params[2]; + pipe_buffer_unmap(pipe, transfer); - if (params[0] && params[1] && params[2]) - panfrost_launch_grid(pipe, &direct); + if (params[0] && params[1] && params[2]) + panfrost_launch_grid(pipe, &direct); - return; - } + return; + } - ctx->compute_grid = info; + ctx->compute_grid = info; - struct panfrost_ptr t = - pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); + struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); - /* Invoke according to the grid info */ + /* Invoke according to the grid info */ - unsigned num_wg[3] = { info->grid[0], info->grid[1], info->grid[2] }; + unsigned num_wg[3] = {info->grid[0], info->grid[1], info->grid[2]}; - if (info->indirect) - num_wg[0] = num_wg[1] = num_wg[2] = 1; + if (info->indirect) + num_wg[0] = num_wg[1] = num_wg[2] = 1; - /* Conservatively assume workgroup size changes every launch */ - ctx->dirty |= PAN_DIRTY_PARAMS; + /* Conservatively assume workgroup size changes every launch */ + ctx->dirty |= PAN_DIRTY_PARAMS; - panfrost_update_shader_state(batch, PIPE_SHADER_COMPUTE); + panfrost_update_shader_state(batch, PIPE_SHADER_COMPUTE); #if PAN_ARCH <= 7 - panfrost_pack_work_groups_compute(pan_section_ptr(t.cpu, COMPUTE_JOB, INVOCATION), - num_wg[0], num_wg[1], num_wg[2], - info->block[0], info->block[1], - info->block[2], - false, info->indirect != NULL); + panfrost_pack_work_groups_compute( + pan_section_ptr(t.cpu, COMPUTE_JOB, INVOCATION), num_wg[0], num_wg[1], + num_wg[2], info->block[0], info->block[1], info->block[2], false, + info->indirect != NULL); - pan_section_pack(t.cpu, COMPUTE_JOB, PARAMETERS, cfg) { - cfg.job_task_split = - util_logbase2_ceil(info->block[0] + 1) + - util_logbase2_ceil(info->block[1] + 1) + - util_logbase2_ceil(info->block[2] + 1); - } + pan_section_pack(t.cpu, COMPUTE_JOB, PARAMETERS, cfg) { + cfg.job_task_split = util_logbase2_ceil(info->block[0] + 1) + + util_logbase2_ceil(info->block[1] + 1) + + util_logbase2_ceil(info->block[2] + 1); + } - pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) { - cfg.state = batch->rsd[PIPE_SHADER_COMPUTE]; - cfg.attributes = panfrost_emit_image_attribs(batch, &cfg.attribute_buffers, PIPE_SHADER_COMPUTE); - cfg.thread_storage = panfrost_emit_shared_memory(batch, info); - cfg.uniform_buffers = batch->uniform_buffers[PIPE_SHADER_COMPUTE]; - cfg.push_uniforms = batch->push_uniforms[PIPE_SHADER_COMPUTE]; - cfg.textures = batch->textures[PIPE_SHADER_COMPUTE]; - cfg.samplers = batch->samplers[PIPE_SHADER_COMPUTE]; - } + 
pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) { + cfg.state = batch->rsd[PIPE_SHADER_COMPUTE]; + cfg.attributes = panfrost_emit_image_attribs( + batch, &cfg.attribute_buffers, PIPE_SHADER_COMPUTE); + cfg.thread_storage = panfrost_emit_shared_memory(batch, info); + cfg.uniform_buffers = batch->uniform_buffers[PIPE_SHADER_COMPUTE]; + cfg.push_uniforms = batch->push_uniforms[PIPE_SHADER_COMPUTE]; + cfg.textures = batch->textures[PIPE_SHADER_COMPUTE]; + cfg.samplers = batch->samplers[PIPE_SHADER_COMPUTE]; + } #else - struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE]; + struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE]; - pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) { - cfg.workgroup_size_x = info->block[0]; - cfg.workgroup_size_y = info->block[1]; - cfg.workgroup_size_z = info->block[2]; + pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) { + cfg.workgroup_size_x = info->block[0]; + cfg.workgroup_size_y = info->block[1]; + cfg.workgroup_size_z = info->block[2]; - cfg.workgroup_count_x = num_wg[0]; - cfg.workgroup_count_y = num_wg[1]; - cfg.workgroup_count_z = num_wg[2]; + cfg.workgroup_count_x = num_wg[0]; + cfg.workgroup_count_y = num_wg[1]; + cfg.workgroup_count_z = num_wg[2]; - panfrost_emit_shader(batch, &cfg.compute, PIPE_SHADER_COMPUTE, - batch->rsd[PIPE_SHADER_COMPUTE], - panfrost_emit_shared_memory(batch, info)); + panfrost_emit_shader(batch, &cfg.compute, PIPE_SHADER_COMPUTE, + batch->rsd[PIPE_SHADER_COMPUTE], + panfrost_emit_shared_memory(batch, info)); - /* Workgroups may be merged if the shader does not use barriers - * or shared memory. This condition is checked against the - * static shared_size at compile-time. We need to check the - * variable shared size at launch_grid time, because the - * compiler doesn't know about that. - */ - cfg.allow_merging_workgroups = - cs->info.cs.allow_merging_workgroups && - (info->variable_shared_mem == 0); + /* Workgroups may be merged if the shader does not use barriers + * or shared memory. This condition is checked against the + * static shared_size at compile-time. We need to check the + * variable shared size at launch_grid time, because the + * compiler doesn't know about that. 
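+       * (Hence the extra info->variable_shared_mem == 0 term below: a
+       * shader compiled with no static shared memory may still be
+       * launched with variable shared memory, which must disable
+       * merging.)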
+ */ + cfg.allow_merging_workgroups = cs->info.cs.allow_merging_workgroups && + (info->variable_shared_mem == 0); - cfg.task_increment = 1; - cfg.task_axis = MALI_TASK_AXIS_Z; - } + cfg.task_increment = 1; + cfg.task_axis = MALI_TASK_AXIS_Z; + } #endif - unsigned indirect_dep = 0; + unsigned indirect_dep = 0; #if PAN_GPU_INDIRECTS - if (info->indirect) { - struct pan_indirect_dispatch_info indirect = { - .job = t.gpu, - .indirect_dim = pan_resource(info->indirect)->image.data.bo->ptr.gpu + - info->indirect_offset, - .num_wg_sysval = { - batch->num_wg_sysval[0], - batch->num_wg_sysval[1], - batch->num_wg_sysval[2], - }, - }; + if (info->indirect) { + struct pan_indirect_dispatch_info indirect = { + .job = t.gpu, + .indirect_dim = pan_resource(info->indirect)->image.data.bo->ptr.gpu + + info->indirect_offset, + .num_wg_sysval = + { + batch->num_wg_sysval[0], + batch->num_wg_sysval[1], + batch->num_wg_sysval[2], + }, + }; - indirect_dep = GENX(pan_indirect_dispatch_emit)(&batch->pool.base, - &batch->scoreboard, - &indirect); - } + indirect_dep = GENX(pan_indirect_dispatch_emit)( + &batch->pool.base, &batch->scoreboard, &indirect); + } #endif - panfrost_add_job(&batch->pool.base, &batch->scoreboard, - MALI_JOB_TYPE_COMPUTE, true, false, - indirect_dep, 0, &t, false); - panfrost_flush_all_batches(ctx, "Launch grid post-barrier"); + panfrost_add_job(&batch->pool.base, &batch->scoreboard, + MALI_JOB_TYPE_COMPUTE, true, false, indirect_dep, 0, &t, + false); + panfrost_flush_all_batches(ctx, "Launch grid post-barrier"); } static void * -panfrost_create_rasterizer_state( - struct pipe_context *pctx, - const struct pipe_rasterizer_state *cso) +panfrost_create_rasterizer_state(struct pipe_context *pctx, + const struct pipe_rasterizer_state *cso) { - struct panfrost_rasterizer *so = CALLOC_STRUCT(panfrost_rasterizer); + struct panfrost_rasterizer *so = CALLOC_STRUCT(panfrost_rasterizer); - so->base = *cso; + so->base = *cso; - /* Gauranteed with the core GL call, so don't expose ARB_polygon_offset */ - assert(cso->offset_clamp == 0.0); + /* Gauranteed with the core GL call, so don't expose ARB_polygon_offset */ + assert(cso->offset_clamp == 0.0); #if PAN_ARCH <= 7 - pan_pack(&so->multisample, MULTISAMPLE_MISC, cfg) { - cfg.multisample_enable = cso->multisample; - cfg.fixed_function_near_discard = cso->depth_clip_near; - cfg.fixed_function_far_discard = cso->depth_clip_far; - cfg.shader_depth_range_fixed = true; - } + pan_pack(&so->multisample, MULTISAMPLE_MISC, cfg) { + cfg.multisample_enable = cso->multisample; + cfg.fixed_function_near_discard = cso->depth_clip_near; + cfg.fixed_function_far_discard = cso->depth_clip_far; + cfg.shader_depth_range_fixed = true; + } - pan_pack(&so->stencil_misc, STENCIL_MASK_MISC, cfg) { - cfg.front_facing_depth_bias = cso->offset_tri; - cfg.back_facing_depth_bias = cso->offset_tri; - cfg.single_sampled_lines = !cso->multisample; - } + pan_pack(&so->stencil_misc, STENCIL_MASK_MISC, cfg) { + cfg.front_facing_depth_bias = cso->offset_tri; + cfg.back_facing_depth_bias = cso->offset_tri; + cfg.single_sampled_lines = !cso->multisample; + } #endif - return so; + return so; } #if PAN_ARCH >= 9 @@ -4008,90 +3917,96 @@ panfrost_pack_attribute(struct panfrost_device *dev, const struct pipe_vertex_element el, struct mali_attribute_packed *out) { - pan_pack(out, ATTRIBUTE, cfg) { - cfg.table = PAN_TABLE_ATTRIBUTE_BUFFER; - cfg.frequency = (el.instance_divisor > 0) ? 
- MALI_ATTRIBUTE_FREQUENCY_INSTANCE : - MALI_ATTRIBUTE_FREQUENCY_VERTEX; - cfg.format = dev->formats[el.src_format].hw; - cfg.offset = el.src_offset; - cfg.buffer_index = el.vertex_buffer_index; + pan_pack(out, ATTRIBUTE, cfg) { + cfg.table = PAN_TABLE_ATTRIBUTE_BUFFER; + cfg.frequency = (el.instance_divisor > 0) + ? MALI_ATTRIBUTE_FREQUENCY_INSTANCE + : MALI_ATTRIBUTE_FREQUENCY_VERTEX; + cfg.format = dev->formats[el.src_format].hw; + cfg.offset = el.src_offset; + cfg.buffer_index = el.vertex_buffer_index; - if (el.instance_divisor == 0) { - /* Per-vertex */ - cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D; - cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX; - cfg.offset_enable = true; - } else if (util_is_power_of_two_or_zero(el.instance_divisor)) { - /* Per-instance, POT divisor */ - cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR; - cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE; - cfg.divisor_r = __builtin_ctz(el.instance_divisor); - } else { - /* Per-instance, NPOT divisor */ - cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR; - cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE; + if (el.instance_divisor == 0) { + /* Per-vertex */ + cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D; + cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX; + cfg.offset_enable = true; + } else if (util_is_power_of_two_or_zero(el.instance_divisor)) { + /* Per-instance, POT divisor */ + cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR; + cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE; + cfg.divisor_r = __builtin_ctz(el.instance_divisor); + } else { + /* Per-instance, NPOT divisor */ + cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR; + cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE; - cfg.divisor_d = - panfrost_compute_magic_divisor(el.instance_divisor, - &cfg.divisor_r, &cfg.divisor_e); - } - } + cfg.divisor_d = panfrost_compute_magic_divisor( + el.instance_divisor, &cfg.divisor_r, &cfg.divisor_e); + } + } } #endif static void * -panfrost_create_vertex_elements_state( - struct pipe_context *pctx, - unsigned num_elements, - const struct pipe_vertex_element *elements) +panfrost_create_vertex_elements_state(struct pipe_context *pctx, + unsigned num_elements, + const struct pipe_vertex_element *elements) { - struct panfrost_vertex_state *so = CALLOC_STRUCT(panfrost_vertex_state); - struct panfrost_device *dev = pan_device(pctx->screen); + struct panfrost_vertex_state *so = CALLOC_STRUCT(panfrost_vertex_state); + struct panfrost_device *dev = pan_device(pctx->screen); - so->num_elements = num_elements; - memcpy(so->pipe, elements, sizeof(*elements) * num_elements); + so->num_elements = num_elements; + memcpy(so->pipe, elements, sizeof(*elements) * num_elements); #if PAN_ARCH >= 9 - for (unsigned i = 0; i < num_elements; ++i) - panfrost_pack_attribute(dev, elements[i], &so->attributes[i]); + for (unsigned i = 0; i < num_elements; ++i) + panfrost_pack_attribute(dev, elements[i], &so->attributes[i]); #else - /* Assign attribute buffers corresponding to the vertex buffers, keyed - * for a particular divisor since that's how instancing works on Mali */ - for (unsigned i = 0; i < num_elements; ++i) { - so->element_buffer[i] = pan_assign_vertex_buffer( - so->buffers, &so->nr_bufs, - elements[i].vertex_buffer_index, - elements[i].instance_divisor); - } + /* Assign attribute buffers corresponding to the vertex buffers, keyed + * for a particular divisor since that's how instancing works on Mali */ + for (unsigned i = 0; i < num_elements; ++i) { + so->element_buffer[i] = 
pan_assign_vertex_buffer( + so->buffers, &so->nr_bufs, elements[i].vertex_buffer_index, + elements[i].instance_divisor); + } - for (int i = 0; i < num_elements; ++i) { - enum pipe_format fmt = elements[i].src_format; - so->formats[i] = dev->formats[fmt].hw; - } + for (int i = 0; i < num_elements; ++i) { + enum pipe_format fmt = elements[i].src_format; + so->formats[i] = dev->formats[fmt].hw; + } - /* Let's also prepare vertex builtins */ - so->formats[PAN_VERTEX_ID] = dev->formats[PIPE_FORMAT_R32_UINT].hw; - so->formats[PAN_INSTANCE_ID] = dev->formats[PIPE_FORMAT_R32_UINT].hw; + /* Let's also prepare vertex builtins */ + so->formats[PAN_VERTEX_ID] = dev->formats[PIPE_FORMAT_R32_UINT].hw; + so->formats[PAN_INSTANCE_ID] = dev->formats[PIPE_FORMAT_R32_UINT].hw; #endif - return so; + return so; } static inline unsigned pan_pipe_to_stencil_op(enum pipe_stencil_op in) { - switch (in) { - case PIPE_STENCIL_OP_KEEP: return MALI_STENCIL_OP_KEEP; - case PIPE_STENCIL_OP_ZERO: return MALI_STENCIL_OP_ZERO; - case PIPE_STENCIL_OP_REPLACE: return MALI_STENCIL_OP_REPLACE; - case PIPE_STENCIL_OP_INCR: return MALI_STENCIL_OP_INCR_SAT; - case PIPE_STENCIL_OP_DECR: return MALI_STENCIL_OP_DECR_SAT; - case PIPE_STENCIL_OP_INCR_WRAP: return MALI_STENCIL_OP_INCR_WRAP; - case PIPE_STENCIL_OP_DECR_WRAP: return MALI_STENCIL_OP_DECR_WRAP; - case PIPE_STENCIL_OP_INVERT: return MALI_STENCIL_OP_INVERT; - default: unreachable("Invalid stencil op"); - } + switch (in) { + case PIPE_STENCIL_OP_KEEP: + return MALI_STENCIL_OP_KEEP; + case PIPE_STENCIL_OP_ZERO: + return MALI_STENCIL_OP_ZERO; + case PIPE_STENCIL_OP_REPLACE: + return MALI_STENCIL_OP_REPLACE; + case PIPE_STENCIL_OP_INCR: + return MALI_STENCIL_OP_INCR_SAT; + case PIPE_STENCIL_OP_DECR: + return MALI_STENCIL_OP_DECR_SAT; + case PIPE_STENCIL_OP_INCR_WRAP: + return MALI_STENCIL_OP_INCR_WRAP; + case PIPE_STENCIL_OP_DECR_WRAP: + return MALI_STENCIL_OP_DECR_WRAP; + case PIPE_STENCIL_OP_INVERT: + return MALI_STENCIL_OP_INVERT; + default: + unreachable("Invalid stencil op"); + } } #if PAN_ARCH <= 7 @@ -4099,127 +4014,126 @@ static inline void pan_pipe_to_stencil(const struct pipe_stencil_state *in, struct mali_stencil_packed *out) { - pan_pack(out, STENCIL, s) { - s.mask = in->valuemask; - s.compare_function = (enum mali_func) in->func; - s.stencil_fail = pan_pipe_to_stencil_op(in->fail_op); - s.depth_fail = pan_pipe_to_stencil_op(in->zfail_op); - s.depth_pass = pan_pipe_to_stencil_op(in->zpass_op); - } + pan_pack(out, STENCIL, s) { + s.mask = in->valuemask; + s.compare_function = (enum mali_func)in->func; + s.stencil_fail = pan_pipe_to_stencil_op(in->fail_op); + s.depth_fail = pan_pipe_to_stencil_op(in->zfail_op); + s.depth_pass = pan_pipe_to_stencil_op(in->zpass_op); + } } #endif static bool pipe_zs_always_passes(const struct pipe_depth_stencil_alpha_state *zsa) { - if (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS) - return false; + if (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS) + return false; - if (zsa->stencil[0].enabled && zsa->stencil[0].func != PIPE_FUNC_ALWAYS) - return false; + if (zsa->stencil[0].enabled && zsa->stencil[0].func != PIPE_FUNC_ALWAYS) + return false; - if (zsa->stencil[1].enabled && zsa->stencil[1].func != PIPE_FUNC_ALWAYS) - return false; + if (zsa->stencil[1].enabled && zsa->stencil[1].func != PIPE_FUNC_ALWAYS) + return false; - return true; + return true; } static void * -panfrost_create_depth_stencil_state(struct pipe_context *pipe, - const struct pipe_depth_stencil_alpha_state *zsa) 
+panfrost_create_depth_stencil_state( + struct pipe_context *pipe, const struct pipe_depth_stencil_alpha_state *zsa) { - struct panfrost_zsa_state *so = CALLOC_STRUCT(panfrost_zsa_state); - so->base = *zsa; + struct panfrost_zsa_state *so = CALLOC_STRUCT(panfrost_zsa_state); + so->base = *zsa; - const struct pipe_stencil_state front = zsa->stencil[0]; - const struct pipe_stencil_state back = - zsa->stencil[1].enabled ? zsa->stencil[1] : front; + const struct pipe_stencil_state front = zsa->stencil[0]; + const struct pipe_stencil_state back = + zsa->stencil[1].enabled ? zsa->stencil[1] : front; - enum mali_func depth_func = zsa->depth_enabled ? - (enum mali_func) zsa->depth_func : MALI_FUNC_ALWAYS; + enum mali_func depth_func = + zsa->depth_enabled ? (enum mali_func)zsa->depth_func : MALI_FUNC_ALWAYS; - /* Normalize (there's no separate enable) */ - if (PAN_ARCH <= 5 && !zsa->alpha_enabled) - so->base.alpha_func = MALI_FUNC_ALWAYS; + /* Normalize (there's no separate enable) */ + if (PAN_ARCH <= 5 && !zsa->alpha_enabled) + so->base.alpha_func = MALI_FUNC_ALWAYS; #if PAN_ARCH <= 7 - /* Prepack relevant parts of the Renderer State Descriptor. They will - * be ORed in at draw-time */ - pan_pack(&so->rsd_depth, MULTISAMPLE_MISC, cfg) { - cfg.depth_function = depth_func; - cfg.depth_write_mask = zsa->depth_writemask; - } + /* Prepack relevant parts of the Renderer State Descriptor. They will + * be ORed in at draw-time */ + pan_pack(&so->rsd_depth, MULTISAMPLE_MISC, cfg) { + cfg.depth_function = depth_func; + cfg.depth_write_mask = zsa->depth_writemask; + } - pan_pack(&so->rsd_stencil, STENCIL_MASK_MISC, cfg) { - cfg.stencil_enable = front.enabled; - cfg.stencil_mask_front = front.writemask; - cfg.stencil_mask_back = back.writemask; + pan_pack(&so->rsd_stencil, STENCIL_MASK_MISC, cfg) { + cfg.stencil_enable = front.enabled; + cfg.stencil_mask_front = front.writemask; + cfg.stencil_mask_back = back.writemask; #if PAN_ARCH <= 5 - cfg.alpha_test_compare_function = - (enum mali_func) so->base.alpha_func; + cfg.alpha_test_compare_function = (enum mali_func)so->base.alpha_func; #endif - } + } - /* Stencil tests have their own words in the RSD */ - pan_pipe_to_stencil(&front, &so->stencil_front); - pan_pipe_to_stencil(&back, &so->stencil_back); + /* Stencil tests have their own words in the RSD */ + pan_pipe_to_stencil(&front, &so->stencil_front); + pan_pipe_to_stencil(&back, &so->stencil_back); #else - pan_pack(&so->desc, DEPTH_STENCIL, cfg) { - cfg.front_compare_function = (enum mali_func) front.func; - cfg.front_stencil_fail = pan_pipe_to_stencil_op(front.fail_op); - cfg.front_depth_fail = pan_pipe_to_stencil_op(front.zfail_op); - cfg.front_depth_pass = pan_pipe_to_stencil_op(front.zpass_op); + pan_pack(&so->desc, DEPTH_STENCIL, cfg) { + cfg.front_compare_function = (enum mali_func)front.func; + cfg.front_stencil_fail = pan_pipe_to_stencil_op(front.fail_op); + cfg.front_depth_fail = pan_pipe_to_stencil_op(front.zfail_op); + cfg.front_depth_pass = pan_pipe_to_stencil_op(front.zpass_op); - cfg.back_compare_function = (enum mali_func) back.func; - cfg.back_stencil_fail = pan_pipe_to_stencil_op(back.fail_op); - cfg.back_depth_fail = pan_pipe_to_stencil_op(back.zfail_op); - cfg.back_depth_pass = pan_pipe_to_stencil_op(back.zpass_op); + cfg.back_compare_function = (enum mali_func)back.func; + cfg.back_stencil_fail = pan_pipe_to_stencil_op(back.fail_op); + cfg.back_depth_fail = pan_pipe_to_stencil_op(back.zfail_op); + cfg.back_depth_pass = pan_pipe_to_stencil_op(back.zpass_op); - cfg.stencil_test_enable = 
front.enabled; - cfg.front_write_mask = front.writemask; - cfg.back_write_mask = back.writemask; - cfg.front_value_mask = front.valuemask; - cfg.back_value_mask = back.valuemask; + cfg.stencil_test_enable = front.enabled; + cfg.front_write_mask = front.writemask; + cfg.back_write_mask = back.writemask; + cfg.front_value_mask = front.valuemask; + cfg.back_value_mask = back.valuemask; - cfg.depth_write_enable = zsa->depth_writemask; - cfg.depth_function = depth_func; - } + cfg.depth_write_enable = zsa->depth_writemask; + cfg.depth_function = depth_func; + } #endif - so->enabled = zsa->stencil[0].enabled || - (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS); + so->enabled = zsa->stencil[0].enabled || + (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS); - so->zs_always_passes = pipe_zs_always_passes(zsa); - so->writes_zs = util_writes_depth_stencil(zsa); + so->zs_always_passes = pipe_zs_always_passes(zsa); + so->writes_zs = util_writes_depth_stencil(zsa); - /* TODO: Bounds test should be easy */ - assert(!zsa->depth_bounds_test); + /* TODO: Bounds test should be easy */ + assert(!zsa->depth_bounds_test); - return so; + return so; } static struct pipe_sampler_view * -panfrost_create_sampler_view( - struct pipe_context *pctx, - struct pipe_resource *texture, - const struct pipe_sampler_view *template) +panfrost_create_sampler_view(struct pipe_context *pctx, + struct pipe_resource *texture, + const struct pipe_sampler_view *template) { - struct panfrost_context *ctx = pan_context(pctx); - struct panfrost_sampler_view *so = rzalloc(pctx, struct panfrost_sampler_view); + struct panfrost_context *ctx = pan_context(pctx); + struct panfrost_sampler_view *so = + rzalloc(pctx, struct panfrost_sampler_view); - pan_legalize_afbc_format(ctx, pan_resource(texture), template->format); + pan_legalize_afbc_format(ctx, pan_resource(texture), template->format); - pipe_reference(NULL, &texture->reference); + pipe_reference(NULL, &texture->reference); - so->base = *template; - so->base.texture = texture; - so->base.reference.count = 1; - so->base.context = pctx; + so->base = *template; + so->base.texture = texture; + so->base.reference.count = 1; + so->base.context = pctx; - panfrost_create_sampler_view_bo(so, pctx, texture); + panfrost_create_sampler_view_bo(so, pctx, texture); - return (struct pipe_sampler_view *) so; + return (struct pipe_sampler_view *)so; } /* A given Gallium blend state can be encoded to the hardware in numerous, @@ -4254,255 +4168,257 @@ static void * panfrost_create_blend_state(struct pipe_context *pipe, const struct pipe_blend_state *blend) { - struct panfrost_blend_state *so = CALLOC_STRUCT(panfrost_blend_state); - so->base = *blend; + struct panfrost_blend_state *so = CALLOC_STRUCT(panfrost_blend_state); + so->base = *blend; - so->pan.logicop_enable = blend->logicop_enable; - so->pan.logicop_func = blend->logicop_func; - so->pan.rt_count = blend->max_rt + 1; + so->pan.logicop_enable = blend->logicop_enable; + so->pan.logicop_func = blend->logicop_func; + so->pan.rt_count = blend->max_rt + 1; - for (unsigned c = 0; c < so->pan.rt_count; ++c) { - unsigned g = blend->independent_blend_enable ? c : 0; - const struct pipe_rt_blend_state pipe = blend->rt[g]; - struct pan_blend_equation equation = {0}; + for (unsigned c = 0; c < so->pan.rt_count; ++c) { + unsigned g = blend->independent_blend_enable ? 
c : 0; + const struct pipe_rt_blend_state pipe = blend->rt[g]; + struct pan_blend_equation equation = {0}; - equation.color_mask = pipe.colormask; - equation.blend_enable = pipe.blend_enable; + equation.color_mask = pipe.colormask; + equation.blend_enable = pipe.blend_enable; - if (pipe.blend_enable) { - equation.rgb_func = util_blend_func_to_shader(pipe.rgb_func); - equation.rgb_src_factor = util_blend_factor_to_shader(pipe.rgb_src_factor); - equation.rgb_invert_src_factor = util_blend_factor_is_inverted(pipe.rgb_src_factor); - equation.rgb_dst_factor = util_blend_factor_to_shader(pipe.rgb_dst_factor); - equation.rgb_invert_dst_factor = util_blend_factor_is_inverted(pipe.rgb_dst_factor); - equation.alpha_func = util_blend_func_to_shader(pipe.alpha_func); - equation.alpha_src_factor = util_blend_factor_to_shader(pipe.alpha_src_factor); - equation.alpha_invert_src_factor = util_blend_factor_is_inverted(pipe.alpha_src_factor); - equation.alpha_dst_factor = util_blend_factor_to_shader(pipe.alpha_dst_factor); - equation.alpha_invert_dst_factor = util_blend_factor_is_inverted(pipe.alpha_dst_factor); - } + if (pipe.blend_enable) { + equation.rgb_func = util_blend_func_to_shader(pipe.rgb_func); + equation.rgb_src_factor = + util_blend_factor_to_shader(pipe.rgb_src_factor); + equation.rgb_invert_src_factor = + util_blend_factor_is_inverted(pipe.rgb_src_factor); + equation.rgb_dst_factor = + util_blend_factor_to_shader(pipe.rgb_dst_factor); + equation.rgb_invert_dst_factor = + util_blend_factor_is_inverted(pipe.rgb_dst_factor); + equation.alpha_func = util_blend_func_to_shader(pipe.alpha_func); + equation.alpha_src_factor = + util_blend_factor_to_shader(pipe.alpha_src_factor); + equation.alpha_invert_src_factor = + util_blend_factor_is_inverted(pipe.alpha_src_factor); + equation.alpha_dst_factor = + util_blend_factor_to_shader(pipe.alpha_dst_factor); + equation.alpha_invert_dst_factor = + util_blend_factor_is_inverted(pipe.alpha_dst_factor); + } - /* Determine some common properties */ - unsigned constant_mask = pan_blend_constant_mask(equation); - const bool supports_2src = pan_blend_supports_2src(PAN_ARCH); - so->info[c] = (struct pan_blend_info) { - .enabled = (equation.color_mask != 0), - .opaque = pan_blend_is_opaque(equation), - .constant_mask = constant_mask, + /* Determine some common properties */ + unsigned constant_mask = pan_blend_constant_mask(equation); + const bool supports_2src = pan_blend_supports_2src(PAN_ARCH); + so->info[c] = (struct pan_blend_info){ + .enabled = (equation.color_mask != 0), + .opaque = pan_blend_is_opaque(equation), + .constant_mask = constant_mask, - /* TODO: check the dest for the logicop */ - .load_dest = blend->logicop_enable || - pan_blend_reads_dest(equation), + /* TODO: check the dest for the logicop */ + .load_dest = blend->logicop_enable || pan_blend_reads_dest(equation), - /* Could this possibly be fixed-function? */ - .fixed_function = !blend->logicop_enable && - pan_blend_can_fixed_function(equation, - supports_2src) && - (!constant_mask || - pan_blend_supports_constant(PAN_ARCH, c)), + /* Could this possibly be fixed-function? 
*/ + .fixed_function = + !blend->logicop_enable && + pan_blend_can_fixed_function(equation, supports_2src) && + (!constant_mask || pan_blend_supports_constant(PAN_ARCH, c)), - .alpha_zero_nop = pan_blend_alpha_zero_nop(equation), - .alpha_one_store = pan_blend_alpha_one_store(equation), - }; + .alpha_zero_nop = pan_blend_alpha_zero_nop(equation), + .alpha_one_store = pan_blend_alpha_one_store(equation), + }; - so->pan.rts[c].equation = equation; + so->pan.rts[c].equation = equation; - /* Bifrost needs to know if any render target loads its - * destination in the hot draw path, so precompute this */ - if (so->info[c].load_dest) - so->load_dest_mask |= BITFIELD_BIT(c); + /* Bifrost needs to know if any render target loads its + * destination in the hot draw path, so precompute this */ + if (so->info[c].load_dest) + so->load_dest_mask |= BITFIELD_BIT(c); - /* Converting equations to Mali style is expensive, do it at - * CSO create time instead of draw-time */ - if (so->info[c].fixed_function) { - so->equation[c] = pan_pack_blend(equation); - } - } + /* Converting equations to Mali style is expensive, do it at + * CSO create time instead of draw-time */ + if (so->info[c].fixed_function) { + so->equation[c] = pan_pack_blend(equation); + } + } - return so; + return so; } #if PAN_ARCH >= 9 static enum mali_flush_to_zero_mode panfrost_ftz_mode(struct pan_shader_info *info) { - if (info->ftz_fp32) { - if (info->ftz_fp16) - return MALI_FLUSH_TO_ZERO_MODE_ALWAYS; - else - return MALI_FLUSH_TO_ZERO_MODE_DX11; - } else { - /* We don't have a "flush FP16, preserve FP32" mode, but APIs - * should not be able to generate that. - */ - assert(!info->ftz_fp16 && !info->ftz_fp32); - return MALI_FLUSH_TO_ZERO_MODE_PRESERVE_SUBNORMALS; - } + if (info->ftz_fp32) { + if (info->ftz_fp16) + return MALI_FLUSH_TO_ZERO_MODE_ALWAYS; + else + return MALI_FLUSH_TO_ZERO_MODE_DX11; + } else { + /* We don't have a "flush FP16, preserve FP32" mode, but APIs + * should not be able to generate that. + */ + assert(!info->ftz_fp16 && !info->ftz_fp32); + return MALI_FLUSH_TO_ZERO_MODE_PRESERVE_SUBNORMALS; + } } #endif static void prepare_shader(struct panfrost_compiled_shader *state, - struct panfrost_pool *pool, bool upload) + struct panfrost_pool *pool, bool upload) { #if PAN_ARCH <= 7 - void *out = &state->partial_rsd; + void *out = &state->partial_rsd; - if (upload) { - struct panfrost_ptr ptr = - pan_pool_alloc_desc(&pool->base, RENDERER_STATE); + if (upload) { + struct panfrost_ptr ptr = + pan_pool_alloc_desc(&pool->base, RENDERER_STATE); - state->state = panfrost_pool_take_ref(pool, ptr.gpu); - out = ptr.cpu; - } + state->state = panfrost_pool_take_ref(pool, ptr.gpu); + out = ptr.cpu; + } - pan_pack(out, RENDERER_STATE, cfg) { - pan_shader_prepare_rsd(&state->info, state->bin.gpu, &cfg); - - } + pan_pack(out, RENDERER_STATE, cfg) { + pan_shader_prepare_rsd(&state->info, state->bin.gpu, &cfg); + } #else - assert(upload); + assert(upload); - /* The address in the shader program descriptor must be non-null, but - * the entire shader program descriptor may be omitted. - * - * See dEQP-GLES31.functional.compute.basic.empty - */ - if (!state->bin.gpu) - return; + /* The address in the shader program descriptor must be non-null, but + * the entire shader program descriptor may be omitted. 
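[For reference only — an illustrative sketch, not part of the patch.] The blend-CSO block above front-loads the expensive decisions at CSO-creation time: whether each render target can use fixed-function blending, the packed Mali equation, and a load_dest_mask bitfield so the Bifrost draw-call hot path can answer "does any bound RT read its destination?" with a single AND. A self-contained restatement of that precomputation, using hypothetical stand-in types rather than the driver's structs:

#include <stdbool.h>
#include <stdint.h>

struct toy_rt_info {
   bool load_dest; /* this RT reads the framebuffer (blending or logic op) */
};

/* Computed once when the blend CSO is created; per-draw code only tests bits. */
static uint32_t
toy_load_dest_mask(const struct toy_rt_info *info, unsigned rt_count)
{
   uint32_t mask = 0;
   for (unsigned i = 0; i < rt_count; ++i) {
      if (info[i].load_dest)
         mask |= 1u << i;
   }
   return mask;
}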
+ * + * See dEQP-GLES31.functional.compute.basic.empty + */ + if (!state->bin.gpu) + return; - bool vs = (state->info.stage == MESA_SHADER_VERTEX); - bool secondary_enable = (vs && state->info.vs.secondary_enable); + bool vs = (state->info.stage == MESA_SHADER_VERTEX); + bool secondary_enable = (vs && state->info.vs.secondary_enable); - unsigned nr_variants = secondary_enable ? 3 : vs ? 2 : 1; - struct panfrost_ptr ptr = pan_pool_alloc_desc_array(&pool->base, - nr_variants, - SHADER_PROGRAM); + unsigned nr_variants = secondary_enable ? 3 : vs ? 2 : 1; + struct panfrost_ptr ptr = + pan_pool_alloc_desc_array(&pool->base, nr_variants, SHADER_PROGRAM); - state->state = panfrost_pool_take_ref(pool, ptr.gpu); + state->state = panfrost_pool_take_ref(pool, ptr.gpu); - /* Generic, or IDVS/points */ - pan_pack(ptr.cpu, SHADER_PROGRAM, cfg) { - cfg.stage = pan_shader_stage(&state->info); - cfg.primary_shader = true; - cfg.register_allocation = pan_register_allocation(state->info.work_reg_count); - cfg.binary = state->bin.gpu; - cfg.preload.r48_r63 = (state->info.preload >> 48); - cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info); + /* Generic, or IDVS/points */ + pan_pack(ptr.cpu, SHADER_PROGRAM, cfg) { + cfg.stage = pan_shader_stage(&state->info); + cfg.primary_shader = true; + cfg.register_allocation = + pan_register_allocation(state->info.work_reg_count); + cfg.binary = state->bin.gpu; + cfg.preload.r48_r63 = (state->info.preload >> 48); + cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info); - if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT) - cfg.requires_helper_threads = state->info.contains_barrier; - } + if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT) + cfg.requires_helper_threads = state->info.contains_barrier; + } - if (!vs) - return; + if (!vs) + return; - /* IDVS/triangles */ - pan_pack(ptr.cpu + pan_size(SHADER_PROGRAM), SHADER_PROGRAM, cfg) { - cfg.stage = pan_shader_stage(&state->info); - cfg.primary_shader = true; - cfg.register_allocation = pan_register_allocation(state->info.work_reg_count); - cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset; - cfg.preload.r48_r63 = (state->info.preload >> 48); - cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info); - } + /* IDVS/triangles */ + pan_pack(ptr.cpu + pan_size(SHADER_PROGRAM), SHADER_PROGRAM, cfg) { + cfg.stage = pan_shader_stage(&state->info); + cfg.primary_shader = true; + cfg.register_allocation = + pan_register_allocation(state->info.work_reg_count); + cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset; + cfg.preload.r48_r63 = (state->info.preload >> 48); + cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info); + } - if (!secondary_enable) - return; + if (!secondary_enable) + return; - pan_pack(ptr.cpu + (pan_size(SHADER_PROGRAM) * 2), SHADER_PROGRAM, cfg) { - unsigned work_count = state->info.vs.secondary_work_reg_count; + pan_pack(ptr.cpu + (pan_size(SHADER_PROGRAM) * 2), SHADER_PROGRAM, cfg) { + unsigned work_count = state->info.vs.secondary_work_reg_count; - cfg.stage = pan_shader_stage(&state->info); - cfg.primary_shader = false; - cfg.register_allocation = pan_register_allocation(work_count); - cfg.binary = state->bin.gpu + state->info.vs.secondary_offset; - cfg.preload.r48_r63 = (state->info.vs.secondary_preload >> 48); - cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info); - } + cfg.stage = pan_shader_stage(&state->info); + cfg.primary_shader = false; + cfg.register_allocation = pan_register_allocation(work_count); + cfg.binary = state->bin.gpu + state->info.vs.secondary_offset; + 
cfg.preload.r48_r63 = (state->info.vs.secondary_preload >> 48); + cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info); + } #endif } static void panfrost_get_sample_position(struct pipe_context *context, - unsigned sample_count, - unsigned sample_index, + unsigned sample_count, unsigned sample_index, float *out_value) { - panfrost_query_sample_position( - panfrost_sample_pattern(sample_count), - sample_index, - out_value); + panfrost_query_sample_position(panfrost_sample_pattern(sample_count), + sample_index, out_value); } static void screen_destroy(struct pipe_screen *pscreen) { - struct panfrost_device *dev = pan_device(pscreen); - GENX(pan_blitter_cleanup)(dev); + struct panfrost_device *dev = pan_device(pscreen); + GENX(pan_blitter_cleanup)(dev); #if PAN_GPU_INDIRECTS - GENX(pan_indirect_dispatch_cleanup)(dev); + GENX(pan_indirect_dispatch_cleanup)(dev); #endif } static void preload(struct panfrost_batch *batch, struct pan_fb_info *fb) { - GENX(pan_preload_fb)(&batch->pool.base, &batch->scoreboard, fb, batch->tls.gpu, - PAN_ARCH >= 6 ? batch->tiler_ctx.bifrost : 0, NULL); + GENX(pan_preload_fb) + (&batch->pool.base, &batch->scoreboard, fb, batch->tls.gpu, + PAN_ARCH >= 6 ? batch->tiler_ctx.bifrost : 0, NULL); } static void init_batch(struct panfrost_batch *batch) { - /* Reserve the framebuffer and local storage descriptors */ - batch->framebuffer = + /* Reserve the framebuffer and local storage descriptors */ + batch->framebuffer = #if PAN_ARCH == 4 - pan_pool_alloc_desc(&batch->pool.base, FRAMEBUFFER); + pan_pool_alloc_desc(&batch->pool.base, FRAMEBUFFER); #else - pan_pool_alloc_desc_aggregate(&batch->pool.base, - PAN_DESC(FRAMEBUFFER), - PAN_DESC(ZS_CRC_EXTENSION), - PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET)); + pan_pool_alloc_desc_aggregate( + &batch->pool.base, PAN_DESC(FRAMEBUFFER), PAN_DESC(ZS_CRC_EXTENSION), + PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET)); - batch->framebuffer.gpu |= MALI_FBD_TAG_IS_MFBD; + batch->framebuffer.gpu |= MALI_FBD_TAG_IS_MFBD; #endif #if PAN_ARCH >= 6 - batch->tls = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE); + batch->tls = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE); #else - /* On Midgard, the TLS is embedded in the FB descriptor */ - batch->tls = batch->framebuffer; + /* On Midgard, the TLS is embedded in the FB descriptor */ + batch->tls = batch->framebuffer; #endif } static void -panfrost_sampler_view_destroy( - struct pipe_context *pctx, - struct pipe_sampler_view *pview) +panfrost_sampler_view_destroy(struct pipe_context *pctx, + struct pipe_sampler_view *pview) { - struct panfrost_sampler_view *view = (struct panfrost_sampler_view *) pview; + struct panfrost_sampler_view *view = (struct panfrost_sampler_view *)pview; - pipe_resource_reference(&pview->texture, NULL); - panfrost_bo_unreference(view->state.bo); - ralloc_free(view); + pipe_resource_reference(&pview->texture, NULL); + panfrost_bo_unreference(view->state.bo); + ralloc_free(view); } static void context_init(struct pipe_context *pipe) { - pipe->draw_vbo = panfrost_draw_vbo; - pipe->launch_grid = panfrost_launch_grid; + pipe->draw_vbo = panfrost_draw_vbo; + pipe->launch_grid = panfrost_launch_grid; - pipe->create_vertex_elements_state = panfrost_create_vertex_elements_state; - pipe->create_rasterizer_state = panfrost_create_rasterizer_state; - pipe->create_depth_stencil_alpha_state = panfrost_create_depth_stencil_state; - pipe->create_sampler_view = panfrost_create_sampler_view; - pipe->sampler_view_destroy = 
panfrost_sampler_view_destroy; - pipe->create_sampler_state = panfrost_create_sampler_state; - pipe->create_blend_state = panfrost_create_blend_state; + pipe->create_vertex_elements_state = panfrost_create_vertex_elements_state; + pipe->create_rasterizer_state = panfrost_create_rasterizer_state; + pipe->create_depth_stencil_alpha_state = panfrost_create_depth_stencil_state; + pipe->create_sampler_view = panfrost_create_sampler_view; + pipe->sampler_view_destroy = panfrost_sampler_view_destroy; + pipe->create_sampler_state = panfrost_create_sampler_state; + pipe->create_blend_state = panfrost_create_blend_state; - pipe->get_sample_position = panfrost_get_sample_position; + pipe->get_sample_position = panfrost_get_sample_position; } #if PAN_ARCH <= 5 @@ -4514,49 +4430,43 @@ context_init(struct pipe_context *pipe) static mali_ptr batch_get_polygon_list(struct panfrost_batch *batch) { - struct panfrost_device *dev = pan_device(batch->ctx->base.screen); + struct panfrost_device *dev = pan_device(batch->ctx->base.screen); - if (!batch->tiler_ctx.midgard.polygon_list) { - bool has_draws = batch->scoreboard.first_tiler != NULL; - unsigned size = - panfrost_tiler_get_polygon_list_size(dev, - batch->key.width, - batch->key.height, - has_draws); - size = util_next_power_of_two(size); + if (!batch->tiler_ctx.midgard.polygon_list) { + bool has_draws = batch->scoreboard.first_tiler != NULL; + unsigned size = panfrost_tiler_get_polygon_list_size( + dev, batch->key.width, batch->key.height, has_draws); + size = util_next_power_of_two(size); - /* Create the BO as invisible if we can. If there are no draws, - * we need to write the polygon list manually because there's - * no WRITE_VALUE job in the chain - */ - bool init_polygon_list = !has_draws; - batch->tiler_ctx.midgard.polygon_list = - panfrost_batch_create_bo(batch, size, - init_polygon_list ? 0 : PAN_BO_INVISIBLE, - PIPE_SHADER_VERTEX, - "Polygon list"); - panfrost_batch_add_bo(batch, batch->tiler_ctx.midgard.polygon_list, - PIPE_SHADER_FRAGMENT); + /* Create the BO as invisible if we can. If there are no draws, + * we need to write the polygon list manually because there's + * no WRITE_VALUE job in the chain + */ + bool init_polygon_list = !has_draws; + batch->tiler_ctx.midgard.polygon_list = panfrost_batch_create_bo( + batch, size, init_polygon_list ? 
0 : PAN_BO_INVISIBLE, + PIPE_SHADER_VERTEX, "Polygon list"); + panfrost_batch_add_bo(batch, batch->tiler_ctx.midgard.polygon_list, + PIPE_SHADER_FRAGMENT); - if (init_polygon_list && dev->model->quirks.no_hierarchical_tiling) { - assert(batch->tiler_ctx.midgard.polygon_list->ptr.cpu); - uint32_t *polygon_list_body = - batch->tiler_ctx.midgard.polygon_list->ptr.cpu + - MALI_MIDGARD_TILER_MINIMUM_HEADER_SIZE; + if (init_polygon_list && dev->model->quirks.no_hierarchical_tiling) { + assert(batch->tiler_ctx.midgard.polygon_list->ptr.cpu); + uint32_t *polygon_list_body = + batch->tiler_ctx.midgard.polygon_list->ptr.cpu + + MALI_MIDGARD_TILER_MINIMUM_HEADER_SIZE; - /* Magic for Mali T720 */ - polygon_list_body[0] = 0xa0000000; - } else if (init_polygon_list) { - assert(batch->tiler_ctx.midgard.polygon_list->ptr.cpu); - uint32_t *header = - batch->tiler_ctx.midgard.polygon_list->ptr.cpu; - memset(header, 0, size); - } + /* Magic for Mali T720 */ + polygon_list_body[0] = 0xa0000000; + } else if (init_polygon_list) { + assert(batch->tiler_ctx.midgard.polygon_list->ptr.cpu); + uint32_t *header = batch->tiler_ctx.midgard.polygon_list->ptr.cpu; + memset(header, 0, size); + } - batch->tiler_ctx.midgard.disable = !has_draws; - } + batch->tiler_ctx.midgard.disable = !has_draws; + } - return batch->tiler_ctx.midgard.polygon_list->ptr.gpu; + return batch->tiler_ctx.midgard.polygon_list->ptr.gpu; } #endif @@ -4564,31 +4474,30 @@ static void init_polygon_list(struct panfrost_batch *batch) { #if PAN_ARCH <= 5 - mali_ptr polygon_list = batch_get_polygon_list(batch); - panfrost_scoreboard_initialize_tiler(&batch->pool.base, - &batch->scoreboard, - polygon_list); + mali_ptr polygon_list = batch_get_polygon_list(batch); + panfrost_scoreboard_initialize_tiler(&batch->pool.base, &batch->scoreboard, + polygon_list); #endif } void GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen) { - struct panfrost_device *dev = &screen->dev; + struct panfrost_device *dev = &screen->dev; - screen->vtbl.prepare_shader = prepare_shader; - screen->vtbl.emit_tls = emit_tls; - screen->vtbl.emit_fbd = emit_fbd; - screen->vtbl.emit_fragment_job = emit_fragment_job; - screen->vtbl.screen_destroy = screen_destroy; - screen->vtbl.preload = preload; - screen->vtbl.context_init = context_init; - screen->vtbl.init_batch = init_batch; - screen->vtbl.get_blend_shader = GENX(pan_blend_get_shader_locked); - screen->vtbl.init_polygon_list = init_polygon_list; - screen->vtbl.get_compiler_options = GENX(pan_shader_get_compiler_options); - screen->vtbl.compile_shader = GENX(pan_shader_compile); + screen->vtbl.prepare_shader = prepare_shader; + screen->vtbl.emit_tls = emit_tls; + screen->vtbl.emit_fbd = emit_fbd; + screen->vtbl.emit_fragment_job = emit_fragment_job; + screen->vtbl.screen_destroy = screen_destroy; + screen->vtbl.preload = preload; + screen->vtbl.context_init = context_init; + screen->vtbl.init_batch = init_batch; + screen->vtbl.get_blend_shader = GENX(pan_blend_get_shader_locked); + screen->vtbl.init_polygon_list = init_polygon_list; + screen->vtbl.get_compiler_options = GENX(pan_shader_get_compiler_options); + screen->vtbl.compile_shader = GENX(pan_shader_compile); - GENX(pan_blitter_init)(dev, &screen->blitter.bin_pool.base, - &screen->blitter.desc_pool.base); + GENX(pan_blitter_init) + (dev, &screen->blitter.bin_pool.base, &screen->blitter.desc_pool.base); } diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c index 5cac001abfe..14a244443df 100644 --- 
a/src/gallium/drivers/panfrost/pan_context.c +++ b/src/gallium/drivers/panfrost/pan_context.c @@ -32,745 +32,712 @@ #include "pan_context.h" #include "pan_minmax_cache.h" -#include "util/macros.h" #include "util/format/u_format.h" -#include "util/libsync.h" -#include "util/u_inlines.h" -#include "util/u_upload_mgr.h" -#include "util/u_memory.h" -#include "util/u_surface.h" -#include "util/u_vbuf.h" #include "util/half_float.h" +#include "util/libsync.h" +#include "util/macros.h" +#include "util/u_debug_cb.h" #include "util/u_helpers.h" -#include "util/format/u_format.h" +#include "util/u_inlines.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "util/u_prim.h" #include "util/u_prim_restart.h" #include "util/u_surface.h" -#include "util/u_math.h" -#include "util/u_debug_cb.h" +#include "util/u_upload_mgr.h" +#include "util/u_vbuf.h" +#include "compiler/nir/nir_serialize.h" +#include "util/pan_lower_framebuffer.h" +#include "decode.h" #include "pan_fence.h" #include "pan_screen.h" #include "pan_util.h" -#include "decode.h" -#include "util/pan_lower_framebuffer.h" -#include "compiler/nir/nir_serialize.h" static void -panfrost_clear( - struct pipe_context *pipe, - unsigned buffers, - const struct pipe_scissor_state *scissor_state, - const union pipe_color_union *color, - double depth, unsigned stencil) +panfrost_clear(struct pipe_context *pipe, unsigned buffers, + const struct pipe_scissor_state *scissor_state, + const union pipe_color_union *color, double depth, + unsigned stencil) { - struct panfrost_context *ctx = pan_context(pipe); - struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); + struct panfrost_context *ctx = pan_context(pipe); + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); - if (!panfrost_render_condition_check(ctx)) - return; + if (!panfrost_render_condition_check(ctx)) + return; - /* At the start of the batch, we can clear for free */ - if (!batch->scoreboard.first_job) { - panfrost_batch_clear(batch, buffers, color, depth, stencil); - return; - } + /* At the start of the batch, we can clear for free */ + if (!batch->scoreboard.first_job) { + panfrost_batch_clear(batch, buffers, color, depth, stencil); + return; + } - /* Once there is content, clear with a fullscreen quad */ - panfrost_blitter_save(ctx, false /* render condition */); + /* Once there is content, clear with a fullscreen quad */ + panfrost_blitter_save(ctx, false /* render condition */); - perf_debug_ctx(ctx, "Clearing with quad"); - util_blitter_clear(ctx->blitter, - ctx->pipe_framebuffer.width, - ctx->pipe_framebuffer.height, - util_framebuffer_get_num_layers(&ctx->pipe_framebuffer), - buffers, color, depth, stencil, - util_framebuffer_get_num_samples(&ctx->pipe_framebuffer) > 1); + perf_debug_ctx(ctx, "Clearing with quad"); + util_blitter_clear( + ctx->blitter, ctx->pipe_framebuffer.width, ctx->pipe_framebuffer.height, + util_framebuffer_get_num_layers(&ctx->pipe_framebuffer), buffers, color, + depth, stencil, + util_framebuffer_get_num_samples(&ctx->pipe_framebuffer) > 1); } bool panfrost_writes_point_size(struct panfrost_context *ctx) { - struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX]; - assert(vs != NULL); + struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX]; + assert(vs != NULL); - return vs->info.vs.writes_point_size && ctx->active_prim == PIPE_PRIM_POINTS; + return vs->info.vs.writes_point_size && ctx->active_prim == PIPE_PRIM_POINTS; } /* The entire frame is in memory -- send it off to the kernel! 
*/ void -panfrost_flush( - struct pipe_context *pipe, - struct pipe_fence_handle **fence, - unsigned flags) +panfrost_flush(struct pipe_context *pipe, struct pipe_fence_handle **fence, + unsigned flags) { - struct panfrost_context *ctx = pan_context(pipe); - struct panfrost_device *dev = pan_device(pipe->screen); + struct panfrost_context *ctx = pan_context(pipe); + struct panfrost_device *dev = pan_device(pipe->screen); + /* Submit all pending jobs */ + panfrost_flush_all_batches(ctx, NULL); - /* Submit all pending jobs */ - panfrost_flush_all_batches(ctx, NULL); + if (fence) { + struct pipe_fence_handle *f = panfrost_fence_create(ctx); + pipe->screen->fence_reference(pipe->screen, fence, NULL); + *fence = f; + } - if (fence) { - struct pipe_fence_handle *f = panfrost_fence_create(ctx); - pipe->screen->fence_reference(pipe->screen, fence, NULL); - *fence = f; - } - - if (dev->debug & PAN_DBG_TRACE) - pandecode_next_frame(); + if (dev->debug & PAN_DBG_TRACE) + pandecode_next_frame(); } static void panfrost_texture_barrier(struct pipe_context *pipe, unsigned flags) { - struct panfrost_context *ctx = pan_context(pipe); - panfrost_flush_all_batches(ctx, "Texture barrier"); + struct panfrost_context *ctx = pan_context(pipe); + panfrost_flush_all_batches(ctx, "Texture barrier"); } static void panfrost_set_frontend_noop(struct pipe_context *pipe, bool enable) { - struct panfrost_context *ctx = pan_context(pipe); - panfrost_flush_all_batches(ctx, "Frontend no-op change"); - ctx->is_noop = enable; + struct panfrost_context *ctx = pan_context(pipe); + panfrost_flush_all_batches(ctx, "Frontend no-op change"); + ctx->is_noop = enable; } - static void panfrost_generic_cso_delete(struct pipe_context *pctx, void *hwcso) { - free(hwcso); + free(hwcso); } static void panfrost_bind_blend_state(struct pipe_context *pipe, void *cso) { - struct panfrost_context *ctx = pan_context(pipe); - ctx->blend = cso; - ctx->dirty |= PAN_DIRTY_BLEND; + struct panfrost_context *ctx = pan_context(pipe); + ctx->blend = cso; + ctx->dirty |= PAN_DIRTY_BLEND; } static void panfrost_set_blend_color(struct pipe_context *pipe, const struct pipe_blend_color *blend_color) { - struct panfrost_context *ctx = pan_context(pipe); - ctx->dirty |= PAN_DIRTY_BLEND; + struct panfrost_context *ctx = pan_context(pipe); + ctx->dirty |= PAN_DIRTY_BLEND; - if (blend_color) - ctx->blend_color = *blend_color; + if (blend_color) + ctx->blend_color = *blend_color; } /* Create a final blend given the context */ mali_ptr -panfrost_get_blend(struct panfrost_batch *batch, unsigned rti, struct panfrost_bo **bo, unsigned *shader_offset) +panfrost_get_blend(struct panfrost_batch *batch, unsigned rti, + struct panfrost_bo **bo, unsigned *shader_offset) { - struct panfrost_context *ctx = batch->ctx; - struct panfrost_device *dev = pan_device(ctx->base.screen); - struct panfrost_blend_state *blend = ctx->blend; - struct pan_blend_info info = blend->info[rti]; - struct pipe_surface *surf = batch->key.cbufs[rti]; - enum pipe_format fmt = surf->format; + struct panfrost_context *ctx = batch->ctx; + struct panfrost_device *dev = pan_device(ctx->base.screen); + struct panfrost_blend_state *blend = ctx->blend; + struct pan_blend_info info = blend->info[rti]; + struct pipe_surface *surf = batch->key.cbufs[rti]; + enum pipe_format fmt = surf->format; - /* Use fixed-function if the equation permits, the format is blendable, - * and no more than one unique constant is accessed */ - if (info.fixed_function && panfrost_blendable_formats_v7[fmt].internal && - 
pan_blend_is_homogenous_constant(info.constant_mask, - ctx->blend_color.color)) { - return 0; - } + /* Use fixed-function if the equation permits, the format is blendable, + * and no more than one unique constant is accessed */ + if (info.fixed_function && panfrost_blendable_formats_v7[fmt].internal && + pan_blend_is_homogenous_constant(info.constant_mask, + ctx->blend_color.color)) { + return 0; + } - /* On all architectures, we can disable writes for a blend descriptor, - * at which point the format doesn't matter. - */ - if (!info.enabled) - return 0; + /* On all architectures, we can disable writes for a blend descriptor, + * at which point the format doesn't matter. + */ + if (!info.enabled) + return 0; - /* On Bifrost and newer, we can also use fixed-function for opaque - * output regardless of the format by configuring the appropriate - * conversion descriptor in the internal blend descriptor. (Midgard - * requires a blend shader even for this case.) - */ - if (dev->arch >= 6 && info.opaque) - return 0; + /* On Bifrost and newer, we can also use fixed-function for opaque + * output regardless of the format by configuring the appropriate + * conversion descriptor in the internal blend descriptor. (Midgard + * requires a blend shader even for this case.) + */ + if (dev->arch >= 6 && info.opaque) + return 0; - /* Otherwise, we need to grab a shader */ - struct pan_blend_state pan_blend = blend->pan; - unsigned nr_samples = surf->nr_samples ? : surf->texture->nr_samples; + /* Otherwise, we need to grab a shader */ + struct pan_blend_state pan_blend = blend->pan; + unsigned nr_samples = surf->nr_samples ?: surf->texture->nr_samples; - pan_blend.rts[rti].format = fmt; - pan_blend.rts[rti].nr_samples = nr_samples; - memcpy(pan_blend.constants, ctx->blend_color.color, - sizeof(pan_blend.constants)); + pan_blend.rts[rti].format = fmt; + pan_blend.rts[rti].nr_samples = nr_samples; + memcpy(pan_blend.constants, ctx->blend_color.color, + sizeof(pan_blend.constants)); - /* Upload the shader, sharing a BO */ - if (!(*bo)) { - *bo = panfrost_batch_create_bo(batch, 4096, PAN_BO_EXECUTE, - PIPE_SHADER_FRAGMENT, "Blend shader"); - } + /* Upload the shader, sharing a BO */ + if (!(*bo)) { + *bo = panfrost_batch_create_bo(batch, 4096, PAN_BO_EXECUTE, + PIPE_SHADER_FRAGMENT, "Blend shader"); + } - struct panfrost_compiled_shader *ss = ctx->prog[PIPE_SHADER_FRAGMENT]; + struct panfrost_compiled_shader *ss = ctx->prog[PIPE_SHADER_FRAGMENT]; - /* Default for Midgard */ - nir_alu_type col0_type = nir_type_float32; - nir_alu_type col1_type = nir_type_float32; + /* Default for Midgard */ + nir_alu_type col0_type = nir_type_float32; + nir_alu_type col1_type = nir_type_float32; - /* Bifrost has per-output types, respect them */ - if (dev->arch >= 6) { - col0_type = ss->info.bifrost.blend[rti].type; - col1_type = ss->info.bifrost.blend_src1_type; - } + /* Bifrost has per-output types, respect them */ + if (dev->arch >= 6) { + col0_type = ss->info.bifrost.blend[rti].type; + col1_type = ss->info.bifrost.blend_src1_type; + } - pthread_mutex_lock(&dev->blend_shaders.lock); - struct pan_blend_shader_variant *shader = - pan_screen(ctx->base.screen)->vtbl.get_blend_shader(dev, - &pan_blend, - col0_type, - col1_type, - rti); + pthread_mutex_lock(&dev->blend_shaders.lock); + struct pan_blend_shader_variant *shader = + pan_screen(ctx->base.screen) + ->vtbl.get_blend_shader(dev, &pan_blend, col0_type, col1_type, rti); - /* Size check and upload */ - unsigned offset = *shader_offset; - assert((offset + shader->binary.size) 
< 4096); - memcpy((*bo)->ptr.cpu + offset, shader->binary.data, shader->binary.size); - *shader_offset += shader->binary.size; - pthread_mutex_unlock(&dev->blend_shaders.lock); + /* Size check and upload */ + unsigned offset = *shader_offset; + assert((offset + shader->binary.size) < 4096); + memcpy((*bo)->ptr.cpu + offset, shader->binary.data, shader->binary.size); + *shader_offset += shader->binary.size; + pthread_mutex_unlock(&dev->blend_shaders.lock); - return ((*bo)->ptr.gpu + offset) | shader->first_tag; + return ((*bo)->ptr.gpu + offset) | shader->first_tag; } static void -panfrost_bind_rasterizer_state( - struct pipe_context *pctx, - void *hwcso) +panfrost_bind_rasterizer_state(struct pipe_context *pctx, void *hwcso) { - struct panfrost_context *ctx = pan_context(pctx); - ctx->rasterizer = hwcso; + struct panfrost_context *ctx = pan_context(pctx); + ctx->rasterizer = hwcso; - /* We can assume rasterizer is always dirty, the dependencies are - * too intricate to bother tracking in detail. However we could - * probably diff the renderers for viewport dirty tracking, that - * just cares about the scissor enable and the depth clips. */ - ctx->dirty |= PAN_DIRTY_SCISSOR | PAN_DIRTY_RASTERIZER; + /* We can assume rasterizer is always dirty, the dependencies are + * too intricate to bother tracking in detail. However we could + * probably diff the renderers for viewport dirty tracking, that + * just cares about the scissor enable and the depth clips. */ + ctx->dirty |= PAN_DIRTY_SCISSOR | PAN_DIRTY_RASTERIZER; } static void -panfrost_set_shader_images( - struct pipe_context *pctx, - enum pipe_shader_type shader, - unsigned start_slot, unsigned count, unsigned unbind_num_trailing_slots, - const struct pipe_image_view *iviews) +panfrost_set_shader_images(struct pipe_context *pctx, + enum pipe_shader_type shader, unsigned start_slot, + unsigned count, unsigned unbind_num_trailing_slots, + const struct pipe_image_view *iviews) { - struct panfrost_context *ctx = pan_context(pctx); - ctx->dirty_shader[PIPE_SHADER_FRAGMENT] |= PAN_DIRTY_STAGE_IMAGE; + struct panfrost_context *ctx = pan_context(pctx); + ctx->dirty_shader[PIPE_SHADER_FRAGMENT] |= PAN_DIRTY_STAGE_IMAGE; - /* Unbind start_slot...start_slot+count */ - if (!iviews) { - for (int i = start_slot; i < start_slot + count + unbind_num_trailing_slots; i++) { - pipe_resource_reference(&ctx->images[shader][i].resource, NULL); - } + /* Unbind start_slot...start_slot+count */ + if (!iviews) { + for (int i = start_slot; + i < start_slot + count + unbind_num_trailing_slots; i++) { + pipe_resource_reference(&ctx->images[shader][i].resource, NULL); + } - ctx->image_mask[shader] &= ~(((1ull << count) - 1) << start_slot); - return; - } + ctx->image_mask[shader] &= ~(((1ull << count) - 1) << start_slot); + return; + } - /* Bind start_slot...start_slot+count */ - for (int i = 0; i < count; i++) { - const struct pipe_image_view *image = &iviews[i]; - SET_BIT(ctx->image_mask[shader], 1 << (start_slot + i), image->resource); + /* Bind start_slot...start_slot+count */ + for (int i = 0; i < count; i++) { + const struct pipe_image_view *image = &iviews[i]; + SET_BIT(ctx->image_mask[shader], 1 << (start_slot + i), image->resource); - if (!image->resource) { - util_copy_image_view(&ctx->images[shader][start_slot+i], NULL); - continue; - } + if (!image->resource) { + util_copy_image_view(&ctx->images[shader][start_slot + i], NULL); + continue; + } - struct panfrost_resource *rsrc = pan_resource(image->resource); + struct panfrost_resource *rsrc = 
pan_resource(image->resource); - /* Images don't work with AFBC, since they require pixel-level granularity */ - if (drm_is_afbc(rsrc->image.layout.modifier)) { - pan_resource_modifier_convert(ctx, rsrc, - DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED, - "Shader image"); - } + /* Images don't work with AFBC, since they require pixel-level granularity + */ + if (drm_is_afbc(rsrc->image.layout.modifier)) { + pan_resource_modifier_convert( + ctx, rsrc, DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED, + "Shader image"); + } - util_copy_image_view(&ctx->images[shader][start_slot+i], image); - } + util_copy_image_view(&ctx->images[shader][start_slot + i], image); + } - /* Unbind start_slot+count...start_slot+count+unbind_num_trailing_slots */ - for (int i = 0; i < unbind_num_trailing_slots; i++) { - SET_BIT(ctx->image_mask[shader], 1 << (start_slot + count + i), NULL); - util_copy_image_view(&ctx->images[shader][start_slot+count+i], NULL); - } + /* Unbind start_slot+count...start_slot+count+unbind_num_trailing_slots */ + for (int i = 0; i < unbind_num_trailing_slots; i++) { + SET_BIT(ctx->image_mask[shader], 1 << (start_slot + count + i), NULL); + util_copy_image_view(&ctx->images[shader][start_slot + count + i], NULL); + } } static void -panfrost_bind_vertex_elements_state( - struct pipe_context *pctx, - void *hwcso) +panfrost_bind_vertex_elements_state(struct pipe_context *pctx, void *hwcso) { - struct panfrost_context *ctx = pan_context(pctx); - ctx->vertex = hwcso; - ctx->dirty |= PAN_DIRTY_VERTEX; + struct panfrost_context *ctx = pan_context(pctx); + ctx->vertex = hwcso; + ctx->dirty |= PAN_DIRTY_VERTEX; } static void -panfrost_bind_sampler_states( - struct pipe_context *pctx, - enum pipe_shader_type shader, - unsigned start_slot, unsigned num_sampler, - void **sampler) +panfrost_bind_sampler_states(struct pipe_context *pctx, + enum pipe_shader_type shader, unsigned start_slot, + unsigned num_sampler, void **sampler) { - struct panfrost_context *ctx = pan_context(pctx); - ctx->dirty_shader[shader] |= PAN_DIRTY_STAGE_SAMPLER; + struct panfrost_context *ctx = pan_context(pctx); + ctx->dirty_shader[shader] |= PAN_DIRTY_STAGE_SAMPLER; - for (unsigned i = 0; i < num_sampler; i++) { - unsigned p = start_slot + i; - ctx->samplers[shader][p] = sampler ? sampler[i] : NULL; - if (ctx->samplers[shader][p]) - ctx->valid_samplers[shader] |= BITFIELD_BIT(p); - else - ctx->valid_samplers[shader] &= ~BITFIELD_BIT(p); - } + for (unsigned i = 0; i < num_sampler; i++) { + unsigned p = start_slot + i; + ctx->samplers[shader][p] = sampler ? 
sampler[i] : NULL; + if (ctx->samplers[shader][p]) + ctx->valid_samplers[shader] |= BITFIELD_BIT(p); + else + ctx->valid_samplers[shader] &= ~BITFIELD_BIT(p); + } - ctx->sampler_count[shader] = util_last_bit(ctx->valid_samplers[shader]); + ctx->sampler_count[shader] = util_last_bit(ctx->valid_samplers[shader]); } static void -panfrost_set_vertex_buffers( - struct pipe_context *pctx, - unsigned start_slot, - unsigned num_buffers, - unsigned unbind_num_trailing_slots, - bool take_ownership, - const struct pipe_vertex_buffer *buffers) +panfrost_set_vertex_buffers(struct pipe_context *pctx, unsigned start_slot, + unsigned num_buffers, + unsigned unbind_num_trailing_slots, + bool take_ownership, + const struct pipe_vertex_buffer *buffers) { - struct panfrost_context *ctx = pan_context(pctx); + struct panfrost_context *ctx = pan_context(pctx); - util_set_vertex_buffers_mask(ctx->vertex_buffers, &ctx->vb_mask, buffers, - start_slot, num_buffers, unbind_num_trailing_slots, - take_ownership); + util_set_vertex_buffers_mask(ctx->vertex_buffers, &ctx->vb_mask, buffers, + start_slot, num_buffers, + unbind_num_trailing_slots, take_ownership); - ctx->dirty |= PAN_DIRTY_VERTEX; + ctx->dirty |= PAN_DIRTY_VERTEX; } static void -panfrost_set_constant_buffer( - struct pipe_context *pctx, - enum pipe_shader_type shader, uint index, bool take_ownership, - const struct pipe_constant_buffer *buf) +panfrost_set_constant_buffer(struct pipe_context *pctx, + enum pipe_shader_type shader, uint index, + bool take_ownership, + const struct pipe_constant_buffer *buf) { - struct panfrost_context *ctx = pan_context(pctx); - struct panfrost_constant_buffer *pbuf = &ctx->constant_buffer[shader]; + struct panfrost_context *ctx = pan_context(pctx); + struct panfrost_constant_buffer *pbuf = &ctx->constant_buffer[shader]; - util_copy_constant_buffer(&pbuf->cb[index], buf, take_ownership); + util_copy_constant_buffer(&pbuf->cb[index], buf, take_ownership); - unsigned mask = (1 << index); + unsigned mask = (1 << index); - if (unlikely(!buf)) { - pbuf->enabled_mask &= ~mask; - return; - } + if (unlikely(!buf)) { + pbuf->enabled_mask &= ~mask; + return; + } - pbuf->enabled_mask |= mask; - ctx->dirty_shader[shader] |= PAN_DIRTY_STAGE_CONST; + pbuf->enabled_mask |= mask; + ctx->dirty_shader[shader] |= PAN_DIRTY_STAGE_CONST; } static void -panfrost_set_stencil_ref( - struct pipe_context *pctx, - const struct pipe_stencil_ref ref) +panfrost_set_stencil_ref(struct pipe_context *pctx, + const struct pipe_stencil_ref ref) { - struct panfrost_context *ctx = pan_context(pctx); - ctx->stencil_ref = ref; - ctx->dirty |= PAN_DIRTY_ZS; + struct panfrost_context *ctx = pan_context(pctx); + ctx->stencil_ref = ref; + ctx->dirty |= PAN_DIRTY_ZS; } static void -panfrost_set_sampler_views( - struct pipe_context *pctx, - enum pipe_shader_type shader, - unsigned start_slot, unsigned num_views, - unsigned unbind_num_trailing_slots, - bool take_ownership, - struct pipe_sampler_view **views) +panfrost_set_sampler_views(struct pipe_context *pctx, + enum pipe_shader_type shader, unsigned start_slot, + unsigned num_views, + unsigned unbind_num_trailing_slots, + bool take_ownership, + struct pipe_sampler_view **views) { - struct panfrost_context *ctx = pan_context(pctx); - ctx->dirty_shader[shader] |= PAN_DIRTY_STAGE_TEXTURE; + struct panfrost_context *ctx = pan_context(pctx); + ctx->dirty_shader[shader] |= PAN_DIRTY_STAGE_TEXTURE; - unsigned new_nr = 0; - unsigned i; + unsigned new_nr = 0; + unsigned i; - for (i = 0; i < num_views; ++i) { - struct 
pipe_sampler_view *view = views ? views[i] : NULL; - unsigned p = i + start_slot; + for (i = 0; i < num_views; ++i) { + struct pipe_sampler_view *view = views ? views[i] : NULL; + unsigned p = i + start_slot; - if (view) - new_nr = p + 1; + if (view) + new_nr = p + 1; - if (take_ownership) { - pipe_sampler_view_reference((struct pipe_sampler_view **)&ctx->sampler_views[shader][p], - NULL); - ctx->sampler_views[shader][i] = (struct panfrost_sampler_view *)view; - } else { - pipe_sampler_view_reference((struct pipe_sampler_view **)&ctx->sampler_views[shader][p], - view); - } - } + if (take_ownership) { + pipe_sampler_view_reference( + (struct pipe_sampler_view **)&ctx->sampler_views[shader][p], NULL); + ctx->sampler_views[shader][i] = (struct panfrost_sampler_view *)view; + } else { + pipe_sampler_view_reference( + (struct pipe_sampler_view **)&ctx->sampler_views[shader][p], view); + } + } - for (; i < num_views + unbind_num_trailing_slots; i++) { - unsigned p = i + start_slot; - pipe_sampler_view_reference((struct pipe_sampler_view **)&ctx->sampler_views[shader][p], - NULL); - } + for (; i < num_views + unbind_num_trailing_slots; i++) { + unsigned p = i + start_slot; + pipe_sampler_view_reference( + (struct pipe_sampler_view **)&ctx->sampler_views[shader][p], NULL); + } - /* If the sampler view count is higher than the greatest sampler view - * we touch, it can't change */ - if (ctx->sampler_view_count[shader] > start_slot + num_views + unbind_num_trailing_slots) - return; + /* If the sampler view count is higher than the greatest sampler view + * we touch, it can't change */ + if (ctx->sampler_view_count[shader] > + start_slot + num_views + unbind_num_trailing_slots) + return; - /* If we haven't set any sampler views here, search lower numbers for - * set sampler views */ - if (new_nr == 0) { - for (i = 0; i < start_slot; ++i) { - if (ctx->sampler_views[shader][i]) - new_nr = i + 1; - } - } + /* If we haven't set any sampler views here, search lower numbers for + * set sampler views */ + if (new_nr == 0) { + for (i = 0; i < start_slot; ++i) { + if (ctx->sampler_views[shader][i]) + new_nr = i + 1; + } + } - ctx->sampler_view_count[shader] = new_nr; + ctx->sampler_view_count[shader] = new_nr; } static void -panfrost_set_shader_buffers( - struct pipe_context *pctx, - enum pipe_shader_type shader, - unsigned start, unsigned count, - const struct pipe_shader_buffer *buffers, - unsigned writable_bitmask) +panfrost_set_shader_buffers(struct pipe_context *pctx, + enum pipe_shader_type shader, unsigned start, + unsigned count, + const struct pipe_shader_buffer *buffers, + unsigned writable_bitmask) { - struct panfrost_context *ctx = pan_context(pctx); + struct panfrost_context *ctx = pan_context(pctx); - util_set_shader_buffers_mask(ctx->ssbo[shader], &ctx->ssbo_mask[shader], - buffers, start, count); + util_set_shader_buffers_mask(ctx->ssbo[shader], &ctx->ssbo_mask[shader], + buffers, start, count); - ctx->dirty_shader[shader] |= PAN_DIRTY_STAGE_SSBO; + ctx->dirty_shader[shader] |= PAN_DIRTY_STAGE_SSBO; } static void panfrost_set_framebuffer_state(struct pipe_context *pctx, const struct pipe_framebuffer_state *fb) { - struct panfrost_context *ctx = pan_context(pctx); + struct panfrost_context *ctx = pan_context(pctx); - util_copy_framebuffer_state(&ctx->pipe_framebuffer, fb); - ctx->batch = NULL; + util_copy_framebuffer_state(&ctx->pipe_framebuffer, fb); + ctx->batch = NULL; - /* Hot draw call path needs the mask of active render targets */ - ctx->fb_rt_mask = 0; + /* Hot draw call path needs the 
mask of active render targets */ + ctx->fb_rt_mask = 0; - for (unsigned i = 0; i < ctx->pipe_framebuffer.nr_cbufs; ++i) { - if (ctx->pipe_framebuffer.cbufs[i]) - ctx->fb_rt_mask |= BITFIELD_BIT(i); - } + for (unsigned i = 0; i < ctx->pipe_framebuffer.nr_cbufs; ++i) { + if (ctx->pipe_framebuffer.cbufs[i]) + ctx->fb_rt_mask |= BITFIELD_BIT(i); + } } static void -panfrost_bind_depth_stencil_state(struct pipe_context *pipe, - void *cso) +panfrost_bind_depth_stencil_state(struct pipe_context *pipe, void *cso) { - struct panfrost_context *ctx = pan_context(pipe); - ctx->depth_stencil = cso; - ctx->dirty |= PAN_DIRTY_ZS; + struct panfrost_context *ctx = pan_context(pipe); + ctx->depth_stencil = cso; + ctx->dirty |= PAN_DIRTY_ZS; } static void -panfrost_set_sample_mask(struct pipe_context *pipe, - unsigned sample_mask) +panfrost_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask) { - struct panfrost_context *ctx = pan_context(pipe); - ctx->sample_mask = sample_mask; - ctx->dirty |= PAN_DIRTY_MSAA; + struct panfrost_context *ctx = pan_context(pipe); + ctx->sample_mask = sample_mask; + ctx->dirty |= PAN_DIRTY_MSAA; } static void -panfrost_set_min_samples(struct pipe_context *pipe, - unsigned min_samples) +panfrost_set_min_samples(struct pipe_context *pipe, unsigned min_samples) { - struct panfrost_context *ctx = pan_context(pipe); - ctx->min_samples = min_samples; - ctx->dirty |= PAN_DIRTY_MSAA; + struct panfrost_context *ctx = pan_context(pipe); + ctx->min_samples = min_samples; + ctx->dirty |= PAN_DIRTY_MSAA; } static void panfrost_set_clip_state(struct pipe_context *pipe, const struct pipe_clip_state *clip) { - //struct panfrost_context *panfrost = pan_context(pipe); + // struct panfrost_context *panfrost = pan_context(pipe); } static void -panfrost_set_viewport_states(struct pipe_context *pipe, - unsigned start_slot, +panfrost_set_viewport_states(struct pipe_context *pipe, unsigned start_slot, unsigned num_viewports, const struct pipe_viewport_state *viewports) { - struct panfrost_context *ctx = pan_context(pipe); + struct panfrost_context *ctx = pan_context(pipe); - assert(start_slot == 0); - assert(num_viewports == 1); + assert(start_slot == 0); + assert(num_viewports == 1); - ctx->pipe_viewport = *viewports; - ctx->dirty |= PAN_DIRTY_VIEWPORT; + ctx->pipe_viewport = *viewports; + ctx->dirty |= PAN_DIRTY_VIEWPORT; } static void -panfrost_set_scissor_states(struct pipe_context *pipe, - unsigned start_slot, +panfrost_set_scissor_states(struct pipe_context *pipe, unsigned start_slot, unsigned num_scissors, const struct pipe_scissor_state *scissors) { - struct panfrost_context *ctx = pan_context(pipe); + struct panfrost_context *ctx = pan_context(pipe); - assert(start_slot == 0); - assert(num_scissors == 1); + assert(start_slot == 0); + assert(num_scissors == 1); - ctx->scissor = *scissors; - ctx->dirty |= PAN_DIRTY_SCISSOR; + ctx->scissor = *scissors; + ctx->dirty |= PAN_DIRTY_SCISSOR; } static void panfrost_set_polygon_stipple(struct pipe_context *pipe, const struct pipe_poly_stipple *stipple) { - //struct panfrost_context *panfrost = pan_context(pipe); + // struct panfrost_context *panfrost = pan_context(pipe); } static void -panfrost_set_active_query_state(struct pipe_context *pipe, - bool enable) +panfrost_set_active_query_state(struct pipe_context *pipe, bool enable) { - struct panfrost_context *ctx = pan_context(pipe); - ctx->active_queries = enable; - ctx->dirty |= PAN_DIRTY_OQ; + struct panfrost_context *ctx = pan_context(pipe); + ctx->active_queries = enable; + ctx->dirty |= 
PAN_DIRTY_OQ; } static void -panfrost_render_condition(struct pipe_context *pipe, - struct pipe_query *query, - bool condition, - enum pipe_render_cond_flag mode) +panfrost_render_condition(struct pipe_context *pipe, struct pipe_query *query, + bool condition, enum pipe_render_cond_flag mode) { - struct panfrost_context *ctx = pan_context(pipe); + struct panfrost_context *ctx = pan_context(pipe); - ctx->cond_query = (struct panfrost_query *)query; - ctx->cond_cond = condition; - ctx->cond_mode = mode; + ctx->cond_query = (struct panfrost_query *)query; + ctx->cond_cond = condition; + ctx->cond_mode = mode; } static void panfrost_destroy(struct pipe_context *pipe) { - struct panfrost_context *panfrost = pan_context(pipe); - struct panfrost_device *dev = pan_device(pipe->screen); + struct panfrost_context *panfrost = pan_context(pipe); + struct panfrost_device *dev = pan_device(pipe->screen); - _mesa_hash_table_destroy(panfrost->writers, NULL); + _mesa_hash_table_destroy(panfrost->writers, NULL); - if (panfrost->blitter) - util_blitter_destroy(panfrost->blitter); + if (panfrost->blitter) + util_blitter_destroy(panfrost->blitter); - util_unreference_framebuffer_state(&panfrost->pipe_framebuffer); - u_upload_destroy(pipe->stream_uploader); + util_unreference_framebuffer_state(&panfrost->pipe_framebuffer); + u_upload_destroy(pipe->stream_uploader); - panfrost_pool_cleanup(&panfrost->descs); - panfrost_pool_cleanup(&panfrost->shaders); + panfrost_pool_cleanup(&panfrost->descs); + panfrost_pool_cleanup(&panfrost->shaders); - drmSyncobjDestroy(dev->fd, panfrost->in_sync_obj); - if (panfrost->in_sync_fd != -1) - close(panfrost->in_sync_fd); + drmSyncobjDestroy(dev->fd, panfrost->in_sync_obj); + if (panfrost->in_sync_fd != -1) + close(panfrost->in_sync_fd); - drmSyncobjDestroy(dev->fd, panfrost->syncobj); - ralloc_free(pipe); + drmSyncobjDestroy(dev->fd, panfrost->syncobj); + ralloc_free(pipe); } static struct pipe_query * -panfrost_create_query(struct pipe_context *pipe, - unsigned type, - unsigned index) +panfrost_create_query(struct pipe_context *pipe, unsigned type, unsigned index) { - struct panfrost_query *q = rzalloc(pipe, struct panfrost_query); + struct panfrost_query *q = rzalloc(pipe, struct panfrost_query); - q->type = type; - q->index = index; + q->type = type; + q->index = index; - return (struct pipe_query *) q; + return (struct pipe_query *)q; } static void panfrost_destroy_query(struct pipe_context *pipe, struct pipe_query *q) { - struct panfrost_query *query = (struct panfrost_query *) q; + struct panfrost_query *query = (struct panfrost_query *)q; - if (query->rsrc) - pipe_resource_reference(&query->rsrc, NULL); + if (query->rsrc) + pipe_resource_reference(&query->rsrc, NULL); - ralloc_free(q); + ralloc_free(q); } static bool panfrost_begin_query(struct pipe_context *pipe, struct pipe_query *q) { - struct panfrost_context *ctx = pan_context(pipe); - struct panfrost_device *dev = pan_device(ctx->base.screen); - struct panfrost_query *query = (struct panfrost_query *) q; + struct panfrost_context *ctx = pan_context(pipe); + struct panfrost_device *dev = pan_device(ctx->base.screen); + struct panfrost_query *query = (struct panfrost_query *)q; - switch (query->type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: { - unsigned size = sizeof(uint64_t) * dev->core_id_range; + switch (query->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + case 
PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: { + unsigned size = sizeof(uint64_t) * dev->core_id_range; - /* Allocate a resource for the query results to be stored */ - if (!query->rsrc) { - query->rsrc = pipe_buffer_create(ctx->base.screen, - PIPE_BIND_QUERY_BUFFER, 0, size); - } + /* Allocate a resource for the query results to be stored */ + if (!query->rsrc) { + query->rsrc = pipe_buffer_create(ctx->base.screen, + PIPE_BIND_QUERY_BUFFER, 0, size); + } - /* Default to 0 if nothing at all drawn. */ - uint8_t *zeroes = alloca(size); - memset(zeroes, 0, size); - pipe_buffer_write(pipe, query->rsrc, 0, size, zeroes); + /* Default to 0 if nothing at all drawn. */ + uint8_t *zeroes = alloca(size); + memset(zeroes, 0, size); + pipe_buffer_write(pipe, query->rsrc, 0, size, zeroes); - query->msaa = (ctx->pipe_framebuffer.samples > 1); - ctx->occlusion_query = query; - ctx->dirty |= PAN_DIRTY_OQ; - break; - } + query->msaa = (ctx->pipe_framebuffer.samples > 1); + ctx->occlusion_query = query; + ctx->dirty |= PAN_DIRTY_OQ; + break; + } - /* Geometry statistics are computed in the driver. XXX: geom/tess - * shaders.. */ + /* Geometry statistics are computed in the driver. XXX: geom/tess + * shaders.. */ - case PIPE_QUERY_PRIMITIVES_GENERATED: - query->start = ctx->prims_generated; - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - query->start = ctx->tf_prims_generated; - break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + query->start = ctx->prims_generated; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + query->start = ctx->tf_prims_generated; + break; - case PAN_QUERY_DRAW_CALLS: - query->start = ctx->draw_calls; - break; + case PAN_QUERY_DRAW_CALLS: + query->start = ctx->draw_calls; + break; - default: - /* TODO: timestamp queries, etc? */ - break; - } + default: + /* TODO: timestamp queries, etc? 
*/ + break; + } - return true; + return true; } static bool panfrost_end_query(struct pipe_context *pipe, struct pipe_query *q) { - struct panfrost_context *ctx = pan_context(pipe); - struct panfrost_query *query = (struct panfrost_query *) q; + struct panfrost_context *ctx = pan_context(pipe); + struct panfrost_query *query = (struct panfrost_query *)q; - switch (query->type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - ctx->occlusion_query = NULL; - ctx->dirty |= PAN_DIRTY_OQ; - break; - case PIPE_QUERY_PRIMITIVES_GENERATED: - query->end = ctx->prims_generated; - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - query->end = ctx->tf_prims_generated; - break; - case PAN_QUERY_DRAW_CALLS: - query->end = ctx->draw_calls; - break; - } + switch (query->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + ctx->occlusion_query = NULL; + ctx->dirty |= PAN_DIRTY_OQ; + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + query->end = ctx->prims_generated; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + query->end = ctx->tf_prims_generated; + break; + case PAN_QUERY_DRAW_CALLS: + query->end = ctx->draw_calls; + break; + } - return true; + return true; } static bool -panfrost_get_query_result(struct pipe_context *pipe, - struct pipe_query *q, - bool wait, - union pipe_query_result *vresult) +panfrost_get_query_result(struct pipe_context *pipe, struct pipe_query *q, + bool wait, union pipe_query_result *vresult) { - struct panfrost_query *query = (struct panfrost_query *) q; - struct panfrost_context *ctx = pan_context(pipe); - struct panfrost_device *dev = pan_device(ctx->base.screen); - struct panfrost_resource *rsrc = pan_resource(query->rsrc); + struct panfrost_query *query = (struct panfrost_query *)q; + struct panfrost_context *ctx = pan_context(pipe); + struct panfrost_device *dev = pan_device(ctx->base.screen); + struct panfrost_resource *rsrc = pan_resource(query->rsrc); - switch (query->type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - panfrost_flush_writer(ctx, rsrc, "Occlusion query"); - panfrost_bo_wait(rsrc->image.data.bo, INT64_MAX, false); + switch (query->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + panfrost_flush_writer(ctx, rsrc, "Occlusion query"); + panfrost_bo_wait(rsrc->image.data.bo, INT64_MAX, false); - /* Read back the query results */ - uint64_t *result = (uint64_t *) rsrc->image.data.bo->ptr.cpu; + /* Read back the query results */ + uint64_t *result = (uint64_t *)rsrc->image.data.bo->ptr.cpu; - if (query->type == PIPE_QUERY_OCCLUSION_COUNTER) { - uint64_t passed = 0; - for (int i = 0; i < dev->core_id_range; ++i) - passed += result[i]; + if (query->type == PIPE_QUERY_OCCLUSION_COUNTER) { + uint64_t passed = 0; + for (int i = 0; i < dev->core_id_range; ++i) + passed += result[i]; - if (dev->arch <= 5 && !query->msaa) - passed /= 4; + if (dev->arch <= 5 && !query->msaa) + passed /= 4; - vresult->u64 = passed; - } else { - vresult->b = !!result[0]; - } + vresult->u64 = passed; + } else { + vresult->b = !!result[0]; + } - break; + break; - case PIPE_QUERY_PRIMITIVES_GENERATED: - case PIPE_QUERY_PRIMITIVES_EMITTED: - panfrost_flush_all_batches(ctx, "Primitive count query"); - vresult->u64 = query->end - query->start; - break; + case 
PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_PRIMITIVES_EMITTED: + panfrost_flush_all_batches(ctx, "Primitive count query"); + vresult->u64 = query->end - query->start; + break; - case PAN_QUERY_DRAW_CALLS: - vresult->u64 = query->end - query->start; - break; + case PAN_QUERY_DRAW_CALLS: + vresult->u64 = query->end - query->start; + break; - default: - /* TODO: more queries */ - break; - } + default: + /* TODO: more queries */ + break; + } - return true; + return true; } bool panfrost_render_condition_check(struct panfrost_context *ctx) { - if (!ctx->cond_query) - return true; + if (!ctx->cond_query) + return true; - perf_debug_ctx(ctx, "Implementing conditional rendering on the CPU"); + perf_debug_ctx(ctx, "Implementing conditional rendering on the CPU"); - union pipe_query_result res = { 0 }; - bool wait = - ctx->cond_mode != PIPE_RENDER_COND_NO_WAIT && - ctx->cond_mode != PIPE_RENDER_COND_BY_REGION_NO_WAIT; + union pipe_query_result res = {0}; + bool wait = ctx->cond_mode != PIPE_RENDER_COND_NO_WAIT && + ctx->cond_mode != PIPE_RENDER_COND_BY_REGION_NO_WAIT; - struct pipe_query *pq = (struct pipe_query *)ctx->cond_query; + struct pipe_query *pq = (struct pipe_query *)ctx->cond_query; - if (panfrost_get_query_result(&ctx->base, pq, wait, &res)) - return res.u64 != ctx->cond_cond; + if (panfrost_get_query_result(&ctx->base, pq, wait, &res)) + return res.u64 != ctx->cond_cond; - return true; + return true; } static struct pipe_stream_output_target * @@ -779,29 +746,29 @@ panfrost_create_stream_output_target(struct pipe_context *pctx, unsigned buffer_offset, unsigned buffer_size) { - struct pipe_stream_output_target *target; + struct pipe_stream_output_target *target; - target = &rzalloc(pctx, struct panfrost_streamout_target)->base; + target = &rzalloc(pctx, struct panfrost_streamout_target)->base; - if (!target) - return NULL; + if (!target) + return NULL; - pipe_reference_init(&target->reference, 1); - pipe_resource_reference(&target->buffer, prsc); + pipe_reference_init(&target->reference, 1); + pipe_resource_reference(&target->buffer, prsc); - target->context = pctx; - target->buffer_offset = buffer_offset; - target->buffer_size = buffer_size; + target->context = pctx; + target->buffer_offset = buffer_offset; + target->buffer_size = buffer_size; - return target; + return target; } static void panfrost_stream_output_target_destroy(struct pipe_context *pctx, struct pipe_stream_output_target *target) { - pipe_resource_reference(&target->buffer, NULL); - ralloc_free(target); + pipe_resource_reference(&target->buffer, NULL); + ralloc_free(target); } static void @@ -810,200 +777,200 @@ panfrost_set_stream_output_targets(struct pipe_context *pctx, struct pipe_stream_output_target **targets, const unsigned *offsets) { - struct panfrost_context *ctx = pan_context(pctx); - struct panfrost_streamout *so = &ctx->streamout; + struct panfrost_context *ctx = pan_context(pctx); + struct panfrost_streamout *so = &ctx->streamout; - assert(num_targets <= ARRAY_SIZE(so->targets)); + assert(num_targets <= ARRAY_SIZE(so->targets)); - for (unsigned i = 0; i < num_targets; i++) { - if (targets[i] && offsets[i] != -1) - pan_so_target(targets[i])->offset = offsets[i]; + for (unsigned i = 0; i < num_targets; i++) { + if (targets[i] && offsets[i] != -1) + pan_so_target(targets[i])->offset = offsets[i]; - pipe_so_target_reference(&so->targets[i], targets[i]); - } + pipe_so_target_reference(&so->targets[i], targets[i]); + } - for (unsigned i = num_targets; i < so->num_targets; i++) - 
pipe_so_target_reference(&so->targets[i], NULL); + for (unsigned i = num_targets; i < so->num_targets; i++) + pipe_so_target_reference(&so->targets[i], NULL); - so->num_targets = num_targets; - ctx->dirty |= PAN_DIRTY_SO; + so->num_targets = num_targets; + ctx->dirty |= PAN_DIRTY_SO; } static void -panfrost_set_global_binding(struct pipe_context *pctx, - unsigned first, unsigned count, - struct pipe_resource **resources, - uint32_t **handles) +panfrost_set_global_binding(struct pipe_context *pctx, unsigned first, + unsigned count, struct pipe_resource **resources, + uint32_t **handles) { - if (!resources) - return; + if (!resources) + return; - struct panfrost_context *ctx = pan_context(pctx); - struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); + struct panfrost_context *ctx = pan_context(pctx); + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); - for (unsigned i = first; i < first + count; ++i) { - struct panfrost_resource *rsrc = pan_resource(resources[i]); - panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_COMPUTE); + for (unsigned i = first; i < first + count; ++i) { + struct panfrost_resource *rsrc = pan_resource(resources[i]); + panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_COMPUTE); - util_range_add(&rsrc->base, &rsrc->valid_buffer_range, - 0, rsrc->base.width0); + util_range_add(&rsrc->base, &rsrc->valid_buffer_range, 0, + rsrc->base.width0); - /* The handle points to uint32_t, but space is allocated for 64 - * bits. We need to respect the offset passed in. This interface - * is so bad. - */ - mali_ptr addr = 0; - static_assert(sizeof(addr) == 8, "size out of sync"); + /* The handle points to uint32_t, but space is allocated for 64 + * bits. We need to respect the offset passed in. This interface + * is so bad. 
+ */ + mali_ptr addr = 0; + static_assert(sizeof(addr) == 8, "size out of sync"); - memcpy(&addr, handles[i], sizeof(addr)); - addr += rsrc->image.data.bo->ptr.gpu; + memcpy(&addr, handles[i], sizeof(addr)); + addr += rsrc->image.data.bo->ptr.gpu; - memcpy(handles[i], &addr, sizeof(addr)); - } + memcpy(handles[i], &addr, sizeof(addr)); + } } static void panfrost_memory_barrier(struct pipe_context *pctx, unsigned flags) { - /* TODO: Be smart and only flush the minimum needed, maybe emitting a - * cache flush job if that would help */ - panfrost_flush_all_batches(pan_context(pctx), "Memory barrier"); + /* TODO: Be smart and only flush the minimum needed, maybe emitting a + * cache flush job if that would help */ + panfrost_flush_all_batches(pan_context(pctx), "Memory barrier"); } static void panfrost_create_fence_fd(struct pipe_context *pctx, - struct pipe_fence_handle **pfence, - int fd, enum pipe_fd_type type) + struct pipe_fence_handle **pfence, int fd, + enum pipe_fd_type type) { - *pfence = panfrost_fence_from_fd(pan_context(pctx), fd, type); + *pfence = panfrost_fence_from_fd(pan_context(pctx), fd, type); } static void panfrost_fence_server_sync(struct pipe_context *pctx, struct pipe_fence_handle *f) { - struct panfrost_device *dev = pan_device(pctx->screen); - struct panfrost_context *ctx = pan_context(pctx); - int fd = -1, ret; + struct panfrost_device *dev = pan_device(pctx->screen); + struct panfrost_context *ctx = pan_context(pctx); + int fd = -1, ret; - ret = drmSyncobjExportSyncFile(dev->fd, f->syncobj, &fd); - assert(!ret); + ret = drmSyncobjExportSyncFile(dev->fd, f->syncobj, &fd); + assert(!ret); - sync_accumulate("panfrost", &ctx->in_sync_fd, fd); - close(fd); + sync_accumulate("panfrost", &ctx->in_sync_fd, fd); + close(fd); } struct pipe_context * panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags) { - struct panfrost_context *ctx = rzalloc(NULL, struct panfrost_context); - struct pipe_context *gallium = (struct pipe_context *) ctx; - struct panfrost_device *dev = pan_device(screen); + struct panfrost_context *ctx = rzalloc(NULL, struct panfrost_context); + struct pipe_context *gallium = (struct pipe_context *)ctx; + struct panfrost_device *dev = pan_device(screen); - gallium->screen = screen; + gallium->screen = screen; - gallium->destroy = panfrost_destroy; + gallium->destroy = panfrost_destroy; - gallium->set_framebuffer_state = panfrost_set_framebuffer_state; - gallium->set_debug_callback = u_default_set_debug_callback; + gallium->set_framebuffer_state = panfrost_set_framebuffer_state; + gallium->set_debug_callback = u_default_set_debug_callback; - gallium->create_fence_fd = panfrost_create_fence_fd; - gallium->fence_server_sync = panfrost_fence_server_sync; + gallium->create_fence_fd = panfrost_create_fence_fd; + gallium->fence_server_sync = panfrost_fence_server_sync; - gallium->flush = panfrost_flush; - gallium->clear = panfrost_clear; - gallium->clear_texture = util_clear_texture; - gallium->texture_barrier = panfrost_texture_barrier; - gallium->set_frontend_noop = panfrost_set_frontend_noop; + gallium->flush = panfrost_flush; + gallium->clear = panfrost_clear; + gallium->clear_texture = util_clear_texture; + gallium->texture_barrier = panfrost_texture_barrier; + gallium->set_frontend_noop = panfrost_set_frontend_noop; - gallium->set_vertex_buffers = panfrost_set_vertex_buffers; - gallium->set_constant_buffer = panfrost_set_constant_buffer; - gallium->set_shader_buffers = panfrost_set_shader_buffers; - gallium->set_shader_images = 
panfrost_set_shader_images; + gallium->set_vertex_buffers = panfrost_set_vertex_buffers; + gallium->set_constant_buffer = panfrost_set_constant_buffer; + gallium->set_shader_buffers = panfrost_set_shader_buffers; + gallium->set_shader_images = panfrost_set_shader_images; - gallium->set_stencil_ref = panfrost_set_stencil_ref; + gallium->set_stencil_ref = panfrost_set_stencil_ref; - gallium->set_sampler_views = panfrost_set_sampler_views; + gallium->set_sampler_views = panfrost_set_sampler_views; - gallium->bind_rasterizer_state = panfrost_bind_rasterizer_state; - gallium->delete_rasterizer_state = panfrost_generic_cso_delete; + gallium->bind_rasterizer_state = panfrost_bind_rasterizer_state; + gallium->delete_rasterizer_state = panfrost_generic_cso_delete; - gallium->bind_vertex_elements_state = panfrost_bind_vertex_elements_state; - gallium->delete_vertex_elements_state = panfrost_generic_cso_delete; + gallium->bind_vertex_elements_state = panfrost_bind_vertex_elements_state; + gallium->delete_vertex_elements_state = panfrost_generic_cso_delete; - gallium->delete_sampler_state = panfrost_generic_cso_delete; - gallium->bind_sampler_states = panfrost_bind_sampler_states; + gallium->delete_sampler_state = panfrost_generic_cso_delete; + gallium->bind_sampler_states = panfrost_bind_sampler_states; - gallium->bind_depth_stencil_alpha_state = panfrost_bind_depth_stencil_state; - gallium->delete_depth_stencil_alpha_state = panfrost_generic_cso_delete; + gallium->bind_depth_stencil_alpha_state = panfrost_bind_depth_stencil_state; + gallium->delete_depth_stencil_alpha_state = panfrost_generic_cso_delete; - gallium->set_sample_mask = panfrost_set_sample_mask; - gallium->set_min_samples = panfrost_set_min_samples; + gallium->set_sample_mask = panfrost_set_sample_mask; + gallium->set_min_samples = panfrost_set_min_samples; - gallium->set_clip_state = panfrost_set_clip_state; - gallium->set_viewport_states = panfrost_set_viewport_states; - gallium->set_scissor_states = panfrost_set_scissor_states; - gallium->set_polygon_stipple = panfrost_set_polygon_stipple; - gallium->set_active_query_state = panfrost_set_active_query_state; - gallium->render_condition = panfrost_render_condition; + gallium->set_clip_state = panfrost_set_clip_state; + gallium->set_viewport_states = panfrost_set_viewport_states; + gallium->set_scissor_states = panfrost_set_scissor_states; + gallium->set_polygon_stipple = panfrost_set_polygon_stipple; + gallium->set_active_query_state = panfrost_set_active_query_state; + gallium->render_condition = panfrost_render_condition; - gallium->create_query = panfrost_create_query; - gallium->destroy_query = panfrost_destroy_query; - gallium->begin_query = panfrost_begin_query; - gallium->end_query = panfrost_end_query; - gallium->get_query_result = panfrost_get_query_result; + gallium->create_query = panfrost_create_query; + gallium->destroy_query = panfrost_destroy_query; + gallium->begin_query = panfrost_begin_query; + gallium->end_query = panfrost_end_query; + gallium->get_query_result = panfrost_get_query_result; - gallium->create_stream_output_target = panfrost_create_stream_output_target; - gallium->stream_output_target_destroy = panfrost_stream_output_target_destroy; - gallium->set_stream_output_targets = panfrost_set_stream_output_targets; + gallium->create_stream_output_target = panfrost_create_stream_output_target; + gallium->stream_output_target_destroy = + panfrost_stream_output_target_destroy; + gallium->set_stream_output_targets = panfrost_set_stream_output_targets; - 
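
As a side note on the panfrost_set_global_binding hunk above: each handle is typed as uint32_t * but the caller reserves 64 bits behind it, so the driver memcpy's the value out into a mali_ptr, adds the BO's GPU address, and memcpy's the result back. A standalone sketch of that read-modify-write-through-a-narrow-pointer pattern (hypothetical names, values chosen only for the example):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Patch a 64-bit base address into storage that is only *typed* as
 * uint32_t but actually has 8 bytes reserved behind it. memcpy avoids
 * the alignment and strict-aliasing hazards a direct cast would have. */
static void patch_handle(uint32_t *handle, uint64_t gpu_base)
{
   uint64_t addr = 0;

   memcpy(&addr, handle, sizeof(addr)); /* read the caller-provided offset */
   addr += gpu_base;                    /* relocate against the BO address */
   memcpy(handle, &addr, sizeof(addr)); /* write the absolute address back */
}

int main(void)
{
   /* The caller allocates 64 bits per handle, as the interface expects. */
   uint64_t storage = 0x100; /* offset within the buffer */

   patch_handle((uint32_t *)&storage, 0xdead0000ull);
   printf("0x%llx\n", (unsigned long long)storage); /* prints 0xdead0100 */
   return 0;
}
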
gallium->bind_blend_state = panfrost_bind_blend_state; - gallium->delete_blend_state = panfrost_generic_cso_delete; + gallium->bind_blend_state = panfrost_bind_blend_state; + gallium->delete_blend_state = panfrost_generic_cso_delete; - gallium->set_blend_color = panfrost_set_blend_color; + gallium->set_blend_color = panfrost_set_blend_color; - gallium->set_global_binding = panfrost_set_global_binding; - gallium->memory_barrier = panfrost_memory_barrier; + gallium->set_global_binding = panfrost_set_global_binding; + gallium->memory_barrier = panfrost_memory_barrier; - pan_screen(screen)->vtbl.context_init(gallium); + pan_screen(screen)->vtbl.context_init(gallium); - panfrost_resource_context_init(gallium); - panfrost_shader_context_init(gallium); + panfrost_resource_context_init(gallium); + panfrost_shader_context_init(gallium); - gallium->stream_uploader = u_upload_create_default(gallium); - gallium->const_uploader = gallium->stream_uploader; + gallium->stream_uploader = u_upload_create_default(gallium); + gallium->const_uploader = gallium->stream_uploader; - panfrost_pool_init(&ctx->descs, ctx, dev, - 0, 4096, "Descriptors", true, false); + panfrost_pool_init(&ctx->descs, ctx, dev, 0, 4096, "Descriptors", true, + false); - panfrost_pool_init(&ctx->shaders, ctx, dev, - PAN_BO_EXECUTE, 4096, "Shaders", true, false); + panfrost_pool_init(&ctx->shaders, ctx, dev, PAN_BO_EXECUTE, 4096, "Shaders", + true, false); - ctx->blitter = util_blitter_create(gallium); + ctx->blitter = util_blitter_create(gallium); - ctx->writers = _mesa_hash_table_create(gallium, _mesa_hash_pointer, - _mesa_key_pointer_equal); + ctx->writers = _mesa_hash_table_create(gallium, _mesa_hash_pointer, + _mesa_key_pointer_equal); - assert(ctx->blitter); + assert(ctx->blitter); - /* Prepare for render! */ + /* Prepare for render! */ - /* By default mask everything on */ - ctx->sample_mask = ~0; - ctx->active_queries = true; + /* By default mask everything on */ + ctx->sample_mask = ~0; + ctx->active_queries = true; - int ASSERTED ret; + int ASSERTED ret; - /* Create a syncobj in a signaled state. Will be updated to point to the - * last queued job out_sync every time we submit a new job. - */ - ret = drmSyncobjCreate(dev->fd, DRM_SYNCOBJ_CREATE_SIGNALED, &ctx->syncobj); - assert(!ret && ctx->syncobj); + /* Create a syncobj in a signaled state. Will be updated to point to the + * last queued job out_sync every time we submit a new job. + */ + ret = drmSyncobjCreate(dev->fd, DRM_SYNCOBJ_CREATE_SIGNALED, &ctx->syncobj); + assert(!ret && ctx->syncobj); - /* Sync object/FD used for NATIVE_FENCE_FD. */ - ctx->in_sync_fd = -1; - ret = drmSyncobjCreate(dev->fd, 0, &ctx->in_sync_obj); - assert(!ret); + /* Sync object/FD used for NATIVE_FENCE_FD. 
*/ + ctx->in_sync_fd = -1; + ret = drmSyncobjCreate(dev->fd, 0, &ctx->in_sync_obj); + assert(!ret); - return gallium; + return gallium; } diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h index e202371c42e..4bc57521649 100644 --- a/src/gallium/drivers/panfrost/pan_context.h +++ b/src/gallium/drivers/panfrost/pan_context.h @@ -26,206 +26,207 @@ #define __BUILDER_H__ #define _LARGEFILE64_SOURCE 1 -#include #include -#include "pan_resource.h" -#include "pan_job.h" +#include #include "pan_blend_cso.h" -#include "pan_encoder.h" -#include "pan_texture.h" #include "pan_earlyzs.h" +#include "pan_encoder.h" +#include "pan_job.h" +#include "pan_resource.h" +#include "pan_texture.h" #include "pipe/p_compiler.h" -#include "util/detect.h" #include "pipe/p_context.h" #include "pipe/p_defines.h" -#include "util/format/u_formats.h" #include "pipe/p_screen.h" #include "pipe/p_state.h" -#include "util/u_blitter.h" +#include "util/detect.h" +#include "util/format/u_formats.h" #include "util/hash_table.h" #include "util/simple_mtx.h" +#include "util/u_blitter.h" -#include "midgard/midgard_compile.h" #include "compiler/shader_enums.h" +#include "midgard/midgard_compile.h" -#define SET_BIT(lval, bit, cond) \ - if (cond) \ - lval |= (bit); \ - else \ - lval &= ~(bit); +#define SET_BIT(lval, bit, cond) \ + if (cond) \ + lval |= (bit); \ + else \ + lval &= ~(bit); /* Dirty tracking flags. 3D is for general 3D state. Shader flags are * per-stage. Renderer refers to Renderer State Descriptors. Vertex refers to * vertex attributes/elements. */ enum pan_dirty_3d { - PAN_DIRTY_VIEWPORT = BITFIELD_BIT(0), - PAN_DIRTY_SCISSOR = BITFIELD_BIT(1), - PAN_DIRTY_VERTEX = BITFIELD_BIT(2), - PAN_DIRTY_PARAMS = BITFIELD_BIT(3), - PAN_DIRTY_DRAWID = BITFIELD_BIT(4), - PAN_DIRTY_TLS_SIZE = BITFIELD_BIT(5), - PAN_DIRTY_ZS = BITFIELD_BIT(6), - PAN_DIRTY_BLEND = BITFIELD_BIT(7), - PAN_DIRTY_MSAA = BITFIELD_BIT(8), - PAN_DIRTY_OQ = BITFIELD_BIT(9), - PAN_DIRTY_RASTERIZER = BITFIELD_BIT(10), - PAN_DIRTY_POINTS = BITFIELD_BIT(11), - PAN_DIRTY_SO = BITFIELD_BIT(12), + PAN_DIRTY_VIEWPORT = BITFIELD_BIT(0), + PAN_DIRTY_SCISSOR = BITFIELD_BIT(1), + PAN_DIRTY_VERTEX = BITFIELD_BIT(2), + PAN_DIRTY_PARAMS = BITFIELD_BIT(3), + PAN_DIRTY_DRAWID = BITFIELD_BIT(4), + PAN_DIRTY_TLS_SIZE = BITFIELD_BIT(5), + PAN_DIRTY_ZS = BITFIELD_BIT(6), + PAN_DIRTY_BLEND = BITFIELD_BIT(7), + PAN_DIRTY_MSAA = BITFIELD_BIT(8), + PAN_DIRTY_OQ = BITFIELD_BIT(9), + PAN_DIRTY_RASTERIZER = BITFIELD_BIT(10), + PAN_DIRTY_POINTS = BITFIELD_BIT(11), + PAN_DIRTY_SO = BITFIELD_BIT(12), }; enum pan_dirty_shader { - PAN_DIRTY_STAGE_SHADER = BITFIELD_BIT(0), - PAN_DIRTY_STAGE_TEXTURE = BITFIELD_BIT(1), - PAN_DIRTY_STAGE_SAMPLER = BITFIELD_BIT(2), - PAN_DIRTY_STAGE_IMAGE = BITFIELD_BIT(3), - PAN_DIRTY_STAGE_CONST = BITFIELD_BIT(4), - PAN_DIRTY_STAGE_SSBO = BITFIELD_BIT(5), + PAN_DIRTY_STAGE_SHADER = BITFIELD_BIT(0), + PAN_DIRTY_STAGE_TEXTURE = BITFIELD_BIT(1), + PAN_DIRTY_STAGE_SAMPLER = BITFIELD_BIT(2), + PAN_DIRTY_STAGE_IMAGE = BITFIELD_BIT(3), + PAN_DIRTY_STAGE_CONST = BITFIELD_BIT(4), + PAN_DIRTY_STAGE_SSBO = BITFIELD_BIT(5), }; struct panfrost_constant_buffer { - struct pipe_constant_buffer cb[PIPE_MAX_CONSTANT_BUFFERS]; - uint32_t enabled_mask; + struct pipe_constant_buffer cb[PIPE_MAX_CONSTANT_BUFFERS]; + uint32_t enabled_mask; }; struct panfrost_query { - /* Passthrough from Gallium */ - unsigned type; - unsigned index; + /* Passthrough from Gallium */ + unsigned type; + unsigned index; - /* For computed queries. 
64-bit to prevent overflow */ - struct { - uint64_t start; - uint64_t end; - }; + /* For computed queries. 64-bit to prevent overflow */ + struct { + uint64_t start; + uint64_t end; + }; - /* Memory for the GPU to writeback the value of the query */ - struct pipe_resource *rsrc; + /* Memory for the GPU to writeback the value of the query */ + struct pipe_resource *rsrc; - /* Whether an occlusion query is for a MSAA framebuffer */ - bool msaa; + /* Whether an occlusion query is for a MSAA framebuffer */ + bool msaa; }; struct panfrost_streamout_target { - struct pipe_stream_output_target base; - uint32_t offset; + struct pipe_stream_output_target base; + uint32_t offset; }; struct panfrost_streamout { - struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS]; - unsigned num_targets; + struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS]; + unsigned num_targets; }; struct panfrost_context { - /* Gallium context */ - struct pipe_context base; + /* Gallium context */ + struct pipe_context base; - /* Dirty global state */ - enum pan_dirty_3d dirty; + /* Dirty global state */ + enum pan_dirty_3d dirty; - /* Per shader stage dirty state */ - enum pan_dirty_shader dirty_shader[PIPE_SHADER_TYPES]; + /* Per shader stage dirty state */ + enum pan_dirty_shader dirty_shader[PIPE_SHADER_TYPES]; - /* Unowned pools, so manage yourself. */ - struct panfrost_pool descs, shaders; + /* Unowned pools, so manage yourself. */ + struct panfrost_pool descs, shaders; - /* Sync obj used to keep track of in-flight jobs. */ - uint32_t syncobj; + /* Sync obj used to keep track of in-flight jobs. */ + uint32_t syncobj; - /* Set of 32 batches. When the set is full, the LRU entry (the batch - * with the smallest seqnum) is flushed to free a slot. - */ - struct { - uint64_t seqnum; - struct panfrost_batch slots[PAN_MAX_BATCHES]; + /* Set of 32 batches. When the set is full, the LRU entry (the batch + * with the smallest seqnum) is flushed to free a slot. + */ + struct { + uint64_t seqnum; + struct panfrost_batch slots[PAN_MAX_BATCHES]; - /** Set of active batches for faster traversal */ - BITSET_DECLARE(active, PAN_MAX_BATCHES); - } batches; + /** Set of active batches for faster traversal */ + BITSET_DECLARE(active, PAN_MAX_BATCHES); + } batches; - /* Map from resources to panfrost_batches */ - struct hash_table *writers; + /* Map from resources to panfrost_batches */ + struct hash_table *writers; - /* Bound job batch */ - struct panfrost_batch *batch; + /* Bound job batch */ + struct panfrost_batch *batch; - /* Within a launch_grid call.. */ - const struct pipe_grid_info *compute_grid; + /* Within a launch_grid call.. 
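
The batch-set comment above describes flushing the LRU slot, i.e. the occupied slot with the smallest seqnum, once all PAN_MAX_BATCHES slots are in use. A compilable sketch of that selection logic, under the assumption (for the sketch only) that a zero seqnum marks a free slot; the toy_* names are hypothetical:

#include <stddef.h>
#include <stdint.h>

#define TOY_MAX_BATCHES 32

struct toy_batch {
   uint64_t seqnum; /* 0 = unused slot in this sketch */
};

/* Return a free slot if one exists, otherwise the LRU slot: the occupied
 * slot with the smallest seqnum (the oldest batch). When the returned
 * slot was occupied, the caller is expected to flush it before reuse. */
static struct toy_batch *
toy_pick_slot(struct toy_batch slots[TOY_MAX_BATCHES])
{
   struct toy_batch *lru = NULL;

   for (unsigned i = 0; i < TOY_MAX_BATCHES; ++i) {
      if (!slots[i].seqnum)
         return &slots[i]; /* free slot, use it directly */

      if (!lru || slots[i].seqnum < lru->seqnum)
         lru = &slots[i]; /* remember the oldest occupied slot */
   }

   return lru; /* all slots busy: evict the oldest */
}
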
*/ + const struct pipe_grid_info *compute_grid; - struct pipe_framebuffer_state pipe_framebuffer; - struct panfrost_streamout streamout; + struct pipe_framebuffer_state pipe_framebuffer; + struct panfrost_streamout streamout; - bool active_queries; - uint64_t prims_generated; - uint64_t tf_prims_generated; - uint64_t draw_calls; - struct panfrost_query *occlusion_query; + bool active_queries; + uint64_t prims_generated; + uint64_t tf_prims_generated; + uint64_t draw_calls; + struct panfrost_query *occlusion_query; - unsigned drawid; - unsigned vertex_count; - unsigned instance_count; - unsigned offset_start; - unsigned base_vertex; - unsigned base_instance; - enum pipe_prim_type active_prim; + unsigned drawid; + unsigned vertex_count; + unsigned instance_count; + unsigned offset_start; + unsigned base_vertex; + unsigned base_instance; + enum pipe_prim_type active_prim; - /* If instancing is enabled, vertex count padded for instance; if - * it is disabled, just equal to plain vertex count */ - unsigned padded_count; + /* If instancing is enabled, vertex count padded for instance; if + * it is disabled, just equal to plain vertex count */ + unsigned padded_count; - struct panfrost_constant_buffer constant_buffer[PIPE_SHADER_TYPES]; - struct panfrost_rasterizer *rasterizer; - struct panfrost_vertex_state *vertex; + struct panfrost_constant_buffer constant_buffer[PIPE_SHADER_TYPES]; + struct panfrost_rasterizer *rasterizer; + struct panfrost_vertex_state *vertex; - struct panfrost_uncompiled_shader *uncompiled[PIPE_SHADER_TYPES]; - struct panfrost_compiled_shader *prog[PIPE_SHADER_TYPES]; + struct panfrost_uncompiled_shader *uncompiled[PIPE_SHADER_TYPES]; + struct panfrost_compiled_shader *prog[PIPE_SHADER_TYPES]; - struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS]; - uint32_t vb_mask; + struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS]; + uint32_t vb_mask; - struct pipe_shader_buffer ssbo[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_BUFFERS]; - uint32_t ssbo_mask[PIPE_SHADER_TYPES]; + struct pipe_shader_buffer ssbo[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_BUFFERS]; + uint32_t ssbo_mask[PIPE_SHADER_TYPES]; - struct pipe_image_view images[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_IMAGES]; - uint32_t image_mask[PIPE_SHADER_TYPES]; + struct pipe_image_view images[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_IMAGES]; + uint32_t image_mask[PIPE_SHADER_TYPES]; - struct panfrost_sampler_state *samplers[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS]; - unsigned sampler_count[PIPE_SHADER_TYPES]; - uint32_t valid_samplers[PIPE_SHADER_TYPES]; + struct panfrost_sampler_state *samplers[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS]; + unsigned sampler_count[PIPE_SHADER_TYPES]; + uint32_t valid_samplers[PIPE_SHADER_TYPES]; - struct panfrost_sampler_view *sampler_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS]; - unsigned sampler_view_count[PIPE_SHADER_TYPES]; + struct panfrost_sampler_view + *sampler_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS]; + unsigned sampler_view_count[PIPE_SHADER_TYPES]; - struct blitter_context *blitter; + struct blitter_context *blitter; - struct panfrost_blend_state *blend; + struct panfrost_blend_state *blend; - /* On Valhall, does the current blend state use a blend shader for any - * output? We need this information in a hot path to decide if - * per-sample shading should be enabled. - */ - bool valhall_has_blend_shader; + /* On Valhall, does the current blend state use a blend shader for any + * output? 
We need this information in a hot path to decide if + * per-sample shading should be enabled. + */ + bool valhall_has_blend_shader; - struct pipe_viewport_state pipe_viewport; - struct pipe_scissor_state scissor; - struct pipe_blend_color blend_color; - struct panfrost_zsa_state *depth_stencil; - struct pipe_stencil_ref stencil_ref; - uint16_t sample_mask; - unsigned min_samples; + struct pipe_viewport_state pipe_viewport; + struct pipe_scissor_state scissor; + struct pipe_blend_color blend_color; + struct panfrost_zsa_state *depth_stencil; + struct pipe_stencil_ref stencil_ref; + uint16_t sample_mask; + unsigned min_samples; - struct panfrost_query *cond_query; - bool cond_cond; - enum pipe_render_cond_flag cond_mode; + struct panfrost_query *cond_query; + bool cond_cond; + enum pipe_render_cond_flag cond_mode; - bool is_noop; + bool is_noop; - /* Mask of active render targets */ - uint8_t fb_rt_mask; + /* Mask of active render targets */ + uint8_t fb_rt_mask; - int in_sync_fd; - uint32_t in_sync_obj; + int in_sync_fd; + uint32_t in_sync_obj; }; /* Corresponds to the CSO */ @@ -234,19 +235,19 @@ struct panfrost_rasterizer; /* Linked varyings */ struct pan_linkage { - /* If the upload is owned by the CSO instead - * of the pool, the referenced BO. Else, - * NULL. */ - struct panfrost_bo *bo; + /* If the upload is owned by the CSO instead + * of the pool, the referenced BO. Else, + * NULL. */ + struct panfrost_bo *bo; - /* Uploaded attribute descriptors */ - mali_ptr producer, consumer; + /* Uploaded attribute descriptors */ + mali_ptr producer, consumer; - /* Varyings buffers required */ - uint32_t present; + /* Varyings buffers required */ + uint32_t present; - /* Per-vertex stride for general varying buffer */ - uint32_t stride; + /* Per-vertex stride for general varying buffer */ + uint32_t stride; }; #define RSD_WORDS 16 @@ -255,89 +256,89 @@ struct pan_linkage { * shaders with varying emulated features baked in */ struct panfrost_fs_key { - /* Number of colour buffers if gl_FragColor is written */ - unsigned nr_cbufs_for_fragcolor; + /* Number of colour buffers if gl_FragColor is written */ + unsigned nr_cbufs_for_fragcolor; - /* On Valhall, fixed_varying_mask of the linked vertex shader */ - uint32_t fixed_varying_mask; + /* On Valhall, fixed_varying_mask of the linked vertex shader */ + uint32_t fixed_varying_mask; - /* Midgard shaders that read the tilebuffer must be keyed for - * non-blendable formats - */ - enum pipe_format rt_formats[8]; + /* Midgard shaders that read the tilebuffer must be keyed for + * non-blendable formats + */ + enum pipe_format rt_formats[8]; - /* From rasterize state, to lower point sprites */ - uint16_t sprite_coord_enable; + /* From rasterize state, to lower point sprites */ + uint16_t sprite_coord_enable; - /* User clip plane lowering */ - uint8_t clip_plane_enable; + /* User clip plane lowering */ + uint8_t clip_plane_enable; }; struct panfrost_shader_key { - union { - /* Vertex shaders do not use shader keys. However, we have a - * special "transform feedback" vertex program derived from a - * vertex shader. If vs_is_xfb is set on a vertex shader, this - * is a transform feedback shader, else it is a regular - * (unkeyed) vertex shader. - */ - bool vs_is_xfb; + union { + /* Vertex shaders do not use shader keys. However, we have a + * special "transform feedback" vertex program derived from a + * vertex shader. If vs_is_xfb is set on a vertex shader, this + * is a transform feedback shader, else it is a regular + * (unkeyed) vertex shader. 
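
The panfrost_shader_key union above keys fragment variants (and the special transform-feedback vertex program) off an uncompiled shader CSO, with compiled variants collected in a per-CSO array. As a generic illustration of how such a key is typically consumed, here is a hedged sketch of a lookup-or-compile variant cache; it is not the driver's actual lookup, and all toy_* names are hypothetical:

#include <stdlib.h>
#include <string.h>

/* Illustrative only: a tiny variant cache keyed by memcmp over a
 * plain-old-data key struct (zero-initialized so padding compares equal). */
struct toy_key {
   unsigned nr_cbufs;
   unsigned sprite_coord_enable;
};

struct toy_variant {
   struct toy_key key;
   void *binary; /* compiled code, elided in this sketch */
};

struct toy_shader {
   struct toy_variant *variants;
   unsigned variant_count;
};

static struct toy_variant *
toy_get_variant(struct toy_shader *s, const struct toy_key *key)
{
   for (unsigned i = 0; i < s->variant_count; ++i) {
      if (!memcmp(&s->variants[i].key, key, sizeof(*key)))
         return &s->variants[i]; /* cache hit: reuse compiled variant */
   }

   /* Miss: "compile" and append a new variant (compilation and realloc
    * error handling elided for brevity). */
   s->variants = realloc(s->variants,
                         (s->variant_count + 1) * sizeof(*s->variants));
   struct toy_variant *v = &s->variants[s->variant_count++];
   memset(v, 0, sizeof(*v));
   v->key = *key;
   return v;
}
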
+ */ + bool vs_is_xfb; - /* Fragment shaders use regular shader keys */ - struct panfrost_fs_key fs; - }; + /* Fragment shaders use regular shader keys */ + struct panfrost_fs_key fs; + }; }; struct panfrost_compiled_shader { - /* Respectively, shader binary and Renderer State Descriptor */ - struct panfrost_pool_ref bin, state; + /* Respectively, shader binary and Renderer State Descriptor */ + struct panfrost_pool_ref bin, state; - /* For fragment shaders, a prepared (but not uploaded RSD) */ - uint32_t partial_rsd[RSD_WORDS]; + /* For fragment shaders, a prepared (but not uploaded RSD) */ + uint32_t partial_rsd[RSD_WORDS]; - struct pan_shader_info info; + struct pan_shader_info info; - struct pan_earlyzs_lut earlyzs; + struct pan_earlyzs_lut earlyzs; - /* Linked varyings, for non-separable programs */ - struct pan_linkage linkage; + /* Linked varyings, for non-separable programs */ + struct pan_linkage linkage; - struct pipe_stream_output_info stream_output; + struct pipe_stream_output_info stream_output; - struct panfrost_shader_key key; + struct panfrost_shader_key key; - /* Mask of state that dirties the sysvals */ - unsigned dirty_3d, dirty_shader; + /* Mask of state that dirties the sysvals */ + unsigned dirty_3d, dirty_shader; }; /* Shader CSO */ struct panfrost_uncompiled_shader { - /* NIR for the shader. For graphics, this will be non-NULL even for - * TGSI. For compute, this will be NULL after the shader is compiled, - * as we don't need any compute variants. - */ - const nir_shader *nir; + /* NIR for the shader. For graphics, this will be non-NULL even for + * TGSI. For compute, this will be NULL after the shader is compiled, + * as we don't need any compute variants. + */ + const nir_shader *nir; - /* A SHA1 of the serialized NIR for the disk cache. */ - unsigned char nir_sha1[20]; + /* A SHA1 of the serialized NIR for the disk cache. */ + unsigned char nir_sha1[20]; - /* Stream output information */ - struct pipe_stream_output_info stream_output; + /* Stream output information */ + struct pipe_stream_output_info stream_output; - /** Lock for the variants array */ - simple_mtx_t lock; + /** Lock for the variants array */ + simple_mtx_t lock; - /* Array of panfrost_compiled_shader */ - struct util_dynarray variants; + /* Array of panfrost_compiled_shader */ + struct util_dynarray variants; - /* Compiled transform feedback program, if one is required */ - struct panfrost_compiled_shader *xfb; + /* Compiled transform feedback program, if one is required */ + struct panfrost_compiled_shader *xfb; - /* On vertex shaders, bit mask of special desktop-only varyings to link - * with the fragment shader. Used on Valhall to implement separable - * shaders for desktop GL. - */ - uint32_t fixed_varying_mask; + /* On vertex shaders, bit mask of special desktop-only varyings to link + * with the fragment shader. Used on Valhall to implement separable + * shaders for desktop GL. + */ + uint32_t fixed_varying_mask; }; /* The binary artefacts of compiling a shader. This differs from @@ -347,11 +348,11 @@ struct panfrost_uncompiled_shader { * This structure is serialized for the shader disk cache. 
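
The structure defined just below is what the pan_disk_cache.c hunk further down writes to the cache: a u32 binary size, then the binary bytes, then the shader info, using Mesa's util/blob.h helpers. A compressed sketch of that round trip using the same blob calls that appear in the hunk; it assumes it is built inside the Mesa tree, and toy_info stands in for struct pan_shader_info:

#include <stdbool.h>
#include <stdint.h>
#include "util/blob.h"

struct toy_info {
   uint32_t work_reg_count; /* placeholder for real shader info fields */
};

static void
toy_serialize(struct blob *blob, const void *binary, uint32_t binary_size,
              const struct toy_info *info)
{
   blob_write_uint32(blob, binary_size);        /* 1. size of program binary */
   blob_write_bytes(blob, binary, binary_size); /* 2. program binary */
   blob_write_bytes(blob, info, sizeof(*info)); /* 3. shader info */
}

static bool
toy_deserialize(struct blob_reader *blob, void *binary_out, uint32_t max_size,
                struct toy_info *info_out)
{
   uint32_t binary_size = blob_read_uint32(blob);
   if (binary_size > max_size)
      return false;

   blob_copy_bytes(blob, binary_out, binary_size);
   blob_copy_bytes(blob, info_out, sizeof(*info_out));
   return true; /* overrun handling elided in this sketch */
}
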
*/ struct panfrost_shader_binary { - /* Collected information about the compiled shader */ - struct pan_shader_info info; + /* Collected information about the compiled shader */ + struct pan_shader_info info; - /* The binary itself */ - struct util_dynarray binary; + /* The binary itself */ + struct util_dynarray binary; }; void @@ -360,28 +361,25 @@ panfrost_disk_cache_store(struct disk_cache *cache, const struct panfrost_shader_key *key, const struct panfrost_shader_binary *binary); -bool -panfrost_disk_cache_retrieve(struct disk_cache *cache, - const struct panfrost_uncompiled_shader *uncompiled, - const struct panfrost_shader_key *key, - struct panfrost_shader_binary *binary); +bool panfrost_disk_cache_retrieve( + struct disk_cache *cache, + const struct panfrost_uncompiled_shader *uncompiled, + const struct panfrost_shader_key *key, + struct panfrost_shader_binary *binary); -void -panfrost_disk_cache_init(struct panfrost_screen *screen); +void panfrost_disk_cache_init(struct panfrost_screen *screen); /** (Vertex buffer index, divisor) tuple that will become an Attribute Buffer * Descriptor at draw-time on Midgard */ struct pan_vertex_buffer { - unsigned vbi; - unsigned divisor; + unsigned vbi; + unsigned divisor; }; -unsigned -pan_assign_vertex_buffer(struct pan_vertex_buffer *buffers, - unsigned *nr_bufs, - unsigned vbi, - unsigned divisor); +unsigned pan_assign_vertex_buffer(struct pan_vertex_buffer *buffers, + unsigned *nr_bufs, unsigned vbi, + unsigned divisor); struct panfrost_zsa_state; struct panfrost_sampler_state; @@ -391,39 +389,32 @@ struct panfrost_vertex_state; static inline struct panfrost_context * pan_context(struct pipe_context *pcontext) { - return (struct panfrost_context *) pcontext; + return (struct panfrost_context *)pcontext; } static inline struct panfrost_streamout_target * pan_so_target(struct pipe_stream_output_target *target) { - return (struct panfrost_streamout_target *)target; + return (struct panfrost_streamout_target *)target; } -struct pipe_context * -panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags); +struct pipe_context *panfrost_create_context(struct pipe_screen *screen, + void *priv, unsigned flags); -bool -panfrost_writes_point_size(struct panfrost_context *ctx); +bool panfrost_writes_point_size(struct panfrost_context *ctx); -struct panfrost_ptr -panfrost_vertex_tiler_job(struct panfrost_context *ctx, bool is_tiler); +struct panfrost_ptr panfrost_vertex_tiler_job(struct panfrost_context *ctx, + bool is_tiler); -void -panfrost_flush( - struct pipe_context *pipe, - struct pipe_fence_handle **fence, - unsigned flags); +void panfrost_flush(struct pipe_context *pipe, struct pipe_fence_handle **fence, + unsigned flags); -bool -panfrost_render_condition_check(struct panfrost_context *ctx); +bool panfrost_render_condition_check(struct panfrost_context *ctx); -void -panfrost_update_shader_variant(struct panfrost_context *ctx, - enum pipe_shader_type type); +void panfrost_update_shader_variant(struct panfrost_context *ctx, + enum pipe_shader_type type); -void -panfrost_analyze_sysvals(struct panfrost_compiled_shader *ss); +void panfrost_analyze_sysvals(struct panfrost_compiled_shader *ss); mali_ptr panfrost_get_index_buffer(struct panfrost_batch *batch, @@ -438,41 +429,37 @@ panfrost_get_index_buffer_bounded(struct panfrost_batch *batch, /* Instancing */ -mali_ptr -panfrost_vertex_buffer_address(struct panfrost_context *ctx, unsigned i); +mali_ptr panfrost_vertex_buffer_address(struct panfrost_context *ctx, + unsigned i); 
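
pan_assign_vertex_buffer, declared just above and implemented in the pan_helpers.c hunk below, deduplicates (vertex buffer index, divisor) pairs so that each unique pair occupies exactly one attribute buffer slot. A standalone sketch of that look-up-or-append pattern, with hypothetical toy_* names; the caller is assumed to size the array generously:

#include <stdio.h>

struct toy_vertex_buffer {
   unsigned vbi;     /* vertex buffer index */
   unsigned divisor; /* instance divisor */
};

/* Return the slot for (vbi, divisor), creating it if it does not exist
 * yet. Mirrors the shape of pan_assign_vertex_buffer in the hunk below. */
static unsigned
toy_assign_vertex_buffer(struct toy_vertex_buffer *buffers, unsigned *nr_bufs,
                         unsigned vbi, unsigned divisor)
{
   for (unsigned i = 0; i < *nr_bufs; ++i) {
      if (buffers[i].vbi == vbi && buffers[i].divisor == divisor)
         return i; /* this pair already has a slot */
   }

   unsigned idx = (*nr_bufs)++;
   buffers[idx].vbi = vbi;
   buffers[idx].divisor = divisor;
   return idx;
}

int main(void)
{
   struct toy_vertex_buffer bufs[8];
   unsigned nr = 0;

   unsigned a = toy_assign_vertex_buffer(bufs, &nr, 0, 0);
   unsigned b = toy_assign_vertex_buffer(bufs, &nr, 0, 4); /* new: divisor differs */
   unsigned c = toy_assign_vertex_buffer(bufs, &nr, 0, 0); /* reuses slot a */

   printf("%u %u %u (nr=%u)\n", a, b, c, nr); /* prints: 0 1 0 (nr=2) */
   return 0;
}
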
-void -panfrost_shader_context_init(struct pipe_context *pctx); +void panfrost_shader_context_init(struct pipe_context *pctx); static inline void panfrost_dirty_state_all(struct panfrost_context *ctx) { - ctx->dirty = ~0; + ctx->dirty = ~0; - for (unsigned i = 0; i < PIPE_SHADER_TYPES; ++i) - ctx->dirty_shader[i] = ~0; + for (unsigned i = 0; i < PIPE_SHADER_TYPES; ++i) + ctx->dirty_shader[i] = ~0; } static inline void panfrost_clean_state_3d(struct panfrost_context *ctx) { - ctx->dirty = 0; + ctx->dirty = 0; - for (unsigned i = 0; i < PIPE_SHADER_TYPES; ++i) { - if (i != PIPE_SHADER_COMPUTE) - ctx->dirty_shader[i] = 0; - } + for (unsigned i = 0; i < PIPE_SHADER_TYPES; ++i) { + if (i != PIPE_SHADER_COMPUTE) + ctx->dirty_shader[i] = 0; + } } -void -panfrost_set_batch_masks_blend(struct panfrost_batch *batch); +void panfrost_set_batch_masks_blend(struct panfrost_batch *batch); -void -panfrost_set_batch_masks_zs(struct panfrost_batch *batch); +void panfrost_set_batch_masks_zs(struct panfrost_batch *batch); -void -panfrost_track_image_access(struct panfrost_batch *batch, - enum pipe_shader_type stage, - struct pipe_image_view *image); +void panfrost_track_image_access(struct panfrost_batch *batch, + enum pipe_shader_type stage, + struct pipe_image_view *image); #endif diff --git a/src/gallium/drivers/panfrost/pan_disk_cache.c b/src/gallium/drivers/panfrost/pan_disk_cache.c index 056825dc7d6..dfe57dd6bab 100644 --- a/src/gallium/drivers/panfrost/pan_disk_cache.c +++ b/src/gallium/drivers/panfrost/pan_disk_cache.c @@ -21,9 +21,9 @@ * DEALINGS IN THE SOFTWARE. */ -#include -#include #include +#include +#include #include #include "compiler/nir/nir.h" @@ -43,17 +43,17 @@ extern int bifrost_debug; * Compute a disk cache key for the given uncompiled shader and shader key. */ static void -panfrost_disk_cache_compute_key(struct disk_cache *cache, - const struct panfrost_uncompiled_shader *uncompiled, - const struct panfrost_shader_key *shader_key, - cache_key cache_key) +panfrost_disk_cache_compute_key( + struct disk_cache *cache, + const struct panfrost_uncompiled_shader *uncompiled, + const struct panfrost_shader_key *shader_key, cache_key cache_key) { - uint8_t data[sizeof(uncompiled->nir_sha1) + sizeof(*shader_key)]; + uint8_t data[sizeof(uncompiled->nir_sha1) + sizeof(*shader_key)]; - memcpy(data, uncompiled->nir_sha1, sizeof(uncompiled->nir_sha1)); - memcpy(data + sizeof(uncompiled->nir_sha1), shader_key, sizeof(*shader_key)); + memcpy(data, uncompiled->nir_sha1, sizeof(uncompiled->nir_sha1)); + memcpy(data + sizeof(uncompiled->nir_sha1), shader_key, sizeof(*shader_key)); - disk_cache_compute_key(cache, data, sizeof(data), cache_key); + disk_cache_compute_key(cache, data, sizeof(data), cache_key); } /** @@ -69,33 +69,33 @@ panfrost_disk_cache_store(struct disk_cache *cache, const struct panfrost_shader_binary *binary) { #ifdef ENABLE_SHADER_CACHE - if (!cache) - return; + if (!cache) + return; - cache_key cache_key; - panfrost_disk_cache_compute_key(cache, uncompiled, key, cache_key); + cache_key cache_key; + panfrost_disk_cache_compute_key(cache, uncompiled, key, cache_key); - if (debug) { - char sha1[41]; - _mesa_sha1_format(sha1, cache_key); - fprintf(stderr, "[mesa disk cache] storing %s\n", sha1); - } + if (debug) { + char sha1[41]; + _mesa_sha1_format(sha1, cache_key); + fprintf(stderr, "[mesa disk cache] storing %s\n", sha1); + } - struct blob blob; - blob_init(&blob); + struct blob blob; + blob_init(&blob); - /* We write the following data to the cache blob: - * - * 1. 
Size of program binary - * 2. Program binary - * 3. Shader info - */ - blob_write_uint32(&blob, binary->binary.size); - blob_write_bytes(&blob, binary->binary.data, binary->binary.size); - blob_write_bytes(&blob, &binary->info, sizeof(binary->info)); + /* We write the following data to the cache blob: + * + * 1. Size of program binary + * 2. Program binary + * 3. Shader info + */ + blob_write_uint32(&blob, binary->binary.size); + blob_write_bytes(&blob, binary->binary.data, binary->binary.size); + blob_write_bytes(&blob, &binary->info, sizeof(binary->info)); - disk_cache_put(cache, cache_key, blob.data, blob.size, NULL); - blob_finish(&blob); + disk_cache_put(cache, cache_key, blob.data, blob.size, NULL); + blob_finish(&blob); #endif } @@ -109,43 +109,43 @@ panfrost_disk_cache_retrieve(struct disk_cache *cache, struct panfrost_shader_binary *binary) { #ifdef ENABLE_SHADER_CACHE - if (!cache) - return false; + if (!cache) + return false; - cache_key cache_key; - panfrost_disk_cache_compute_key(cache, uncompiled, key, cache_key); + cache_key cache_key; + panfrost_disk_cache_compute_key(cache, uncompiled, key, cache_key); - if (debug) { - char sha1[41]; - _mesa_sha1_format(sha1, cache_key); - fprintf(stderr, "[mesa disk cache] retrieving %s: ", sha1); - } + if (debug) { + char sha1[41]; + _mesa_sha1_format(sha1, cache_key); + fprintf(stderr, "[mesa disk cache] retrieving %s: ", sha1); + } - size_t size; - void *buffer = disk_cache_get(cache, cache_key, &size); + size_t size; + void *buffer = disk_cache_get(cache, cache_key, &size); - if (debug) - fprintf(stderr, "%s\n", buffer ? "found" : "missing"); + if (debug) + fprintf(stderr, "%s\n", buffer ? "found" : "missing"); - if (!buffer) - return false; + if (!buffer) + return false; - struct blob_reader blob; - blob_reader_init(&blob, buffer, size); + struct blob_reader blob; + blob_reader_init(&blob, buffer, size); - util_dynarray_init(&binary->binary, NULL); + util_dynarray_init(&binary->binary, NULL); - uint32_t binary_size = blob_read_uint32(&blob); - void *ptr = util_dynarray_resize_bytes(&binary->binary, binary_size, 1); + uint32_t binary_size = blob_read_uint32(&blob); + void *ptr = util_dynarray_resize_bytes(&binary->binary, binary_size, 1); - blob_copy_bytes(&blob, ptr, binary_size); - blob_copy_bytes(&blob, &binary->info, sizeof(binary->info)); + blob_copy_bytes(&blob, ptr, binary_size); + blob_copy_bytes(&blob, &binary->info, sizeof(binary->info)); - free(buffer); + free(buffer); - return true; + return true; #else - return false; + return false; #endif } @@ -156,22 +156,22 @@ void panfrost_disk_cache_init(struct panfrost_screen *screen) { #ifdef ENABLE_SHADER_CACHE - const char *renderer = screen->base.get_name(&screen->base); + const char *renderer = screen->base.get_name(&screen->base); - const struct build_id_note *note = - build_id_find_nhdr_for_addr(panfrost_disk_cache_init); - assert(note && build_id_length(note) == 20); /* sha1 */ + const struct build_id_note *note = + build_id_find_nhdr_for_addr(panfrost_disk_cache_init); + assert(note && build_id_length(note) == 20); /* sha1 */ - const uint8_t *id_sha1 = build_id_data(note); - assert(id_sha1); + const uint8_t *id_sha1 = build_id_data(note); + assert(id_sha1); - char timestamp[41]; - _mesa_sha1_format(timestamp, id_sha1); + char timestamp[41]; + _mesa_sha1_format(timestamp, id_sha1); - /* Consider any flags affecting the compile when caching */ - uint64_t driver_flags = screen->dev.debug; - driver_flags |= ((uint64_t) (midgard_debug | bifrost_debug) << 32); + /* Consider any 
flags affecting the compile when caching */ + uint64_t driver_flags = screen->dev.debug; + driver_flags |= ((uint64_t)(midgard_debug | bifrost_debug) << 32); - screen->disk_cache = disk_cache_create(renderer, timestamp, driver_flags); + screen->disk_cache = disk_cache_create(renderer, timestamp, driver_flags); #endif } diff --git a/src/gallium/drivers/panfrost/pan_fence.c b/src/gallium/drivers/panfrost/pan_fence.c index 655644ec495..792550371f8 100644 --- a/src/gallium/drivers/panfrost/pan_fence.c +++ b/src/gallium/drivers/panfrost/pan_fence.c @@ -26,8 +26,8 @@ * SOFTWARE. */ -#include "pan_context.h" #include "pan_fence.h" +#include "pan_context.h" #include "pan_screen.h" #include "util/os_time.h" @@ -38,117 +38,112 @@ panfrost_fence_reference(struct pipe_screen *pscreen, struct pipe_fence_handle **ptr, struct pipe_fence_handle *fence) { - struct panfrost_device *dev = pan_device(pscreen); - struct pipe_fence_handle *old = *ptr; + struct panfrost_device *dev = pan_device(pscreen); + struct pipe_fence_handle *old = *ptr; - if (pipe_reference(&old->reference, &fence->reference)) { - drmSyncobjDestroy(dev->fd, old->syncobj); - free(old); - } + if (pipe_reference(&old->reference, &fence->reference)) { + drmSyncobjDestroy(dev->fd, old->syncobj); + free(old); + } - *ptr = fence; + *ptr = fence; } bool -panfrost_fence_finish(struct pipe_screen *pscreen, - struct pipe_context *ctx, - struct pipe_fence_handle *fence, - uint64_t timeout) +panfrost_fence_finish(struct pipe_screen *pscreen, struct pipe_context *ctx, + struct pipe_fence_handle *fence, uint64_t timeout) { - struct panfrost_device *dev = pan_device(pscreen); - int ret; + struct panfrost_device *dev = pan_device(pscreen); + int ret; - if (fence->signaled) - return true; + if (fence->signaled) + return true; - uint64_t abs_timeout = os_time_get_absolute_timeout(timeout); - if (abs_timeout == OS_TIMEOUT_INFINITE) - abs_timeout = INT64_MAX; + uint64_t abs_timeout = os_time_get_absolute_timeout(timeout); + if (abs_timeout == OS_TIMEOUT_INFINITE) + abs_timeout = INT64_MAX; - ret = drmSyncobjWait(dev->fd, &fence->syncobj, - 1, - abs_timeout, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, - NULL); + ret = drmSyncobjWait(dev->fd, &fence->syncobj, 1, abs_timeout, + DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL); - fence->signaled = (ret >= 0); - return fence->signaled; + fence->signaled = (ret >= 0); + return fence->signaled; } int -panfrost_fence_get_fd(struct pipe_screen *screen, - struct pipe_fence_handle *f) +panfrost_fence_get_fd(struct pipe_screen *screen, struct pipe_fence_handle *f) { - struct panfrost_device *dev = pan_device(screen); - int fd = -1; + struct panfrost_device *dev = pan_device(screen); + int fd = -1; - drmSyncobjExportSyncFile(dev->fd, f->syncobj, &fd); - return fd; + drmSyncobjExportSyncFile(dev->fd, f->syncobj, &fd); + return fd; } struct pipe_fence_handle * panfrost_fence_from_fd(struct panfrost_context *ctx, int fd, enum pipe_fd_type type) { - struct panfrost_device *dev = pan_device(ctx->base.screen); - int ret; + struct panfrost_device *dev = pan_device(ctx->base.screen); + int ret; - struct pipe_fence_handle *f = calloc(1, sizeof(*f)); - if (!f) - return NULL; + struct pipe_fence_handle *f = calloc(1, sizeof(*f)); + if (!f) + return NULL; - if (type == PIPE_FD_TYPE_NATIVE_SYNC) { - ret = drmSyncobjCreate(dev->fd, 0, &f->syncobj); - if (ret) { - fprintf(stderr, "create syncobj failed\n"); - goto err_free_fence; - } + if (type == PIPE_FD_TYPE_NATIVE_SYNC) { + ret = drmSyncobjCreate(dev->fd, 0, &f->syncobj); + if (ret) { + fprintf(stderr, 
"create syncobj failed\n"); + goto err_free_fence; + } - ret = drmSyncobjImportSyncFile(dev->fd, f->syncobj, fd); - if (ret) { - fprintf(stderr, "import syncfile failed\n"); - goto err_destroy_syncobj; - } - } else { - assert(type == PIPE_FD_TYPE_SYNCOBJ); - ret = drmSyncobjFDToHandle(dev->fd, fd, &f->syncobj); - if (ret) { - fprintf(stderr, "import syncobj FD failed\n"); - goto err_free_fence; - } - } + ret = drmSyncobjImportSyncFile(dev->fd, f->syncobj, fd); + if (ret) { + fprintf(stderr, "import syncfile failed\n"); + goto err_destroy_syncobj; + } + } else { + assert(type == PIPE_FD_TYPE_SYNCOBJ); + ret = drmSyncobjFDToHandle(dev->fd, fd, &f->syncobj); + if (ret) { + fprintf(stderr, "import syncobj FD failed\n"); + goto err_free_fence; + } + } - pipe_reference_init(&f->reference, 1); + pipe_reference_init(&f->reference, 1); - return f; + return f; err_destroy_syncobj: - drmSyncobjDestroy(dev->fd, f->syncobj); + drmSyncobjDestroy(dev->fd, f->syncobj); err_free_fence: - free(f); - return NULL; + free(f); + return NULL; } struct pipe_fence_handle * panfrost_fence_create(struct panfrost_context *ctx) { - struct panfrost_device *dev = pan_device(ctx->base.screen); - int fd = -1, ret; + struct panfrost_device *dev = pan_device(ctx->base.screen); + int fd = -1, ret; - /* Snapshot the last rendering out fence. We'd rather have another - * syncobj instead of a sync file, but this is all we get. - * (HandleToFD/FDToHandle just gives you another syncobj ID for the - * same syncobj). - */ - ret = drmSyncobjExportSyncFile(dev->fd, ctx->syncobj, &fd); - if (ret || fd == -1) { - fprintf(stderr, "export failed\n"); - return NULL; - } + /* Snapshot the last rendering out fence. We'd rather have another + * syncobj instead of a sync file, but this is all we get. + * (HandleToFD/FDToHandle just gives you another syncobj ID for the + * same syncobj). 
+ */ + ret = drmSyncobjExportSyncFile(dev->fd, ctx->syncobj, &fd); + if (ret || fd == -1) { + fprintf(stderr, "export failed\n"); + return NULL; + } - struct pipe_fence_handle *f = - panfrost_fence_from_fd(ctx, fd, PIPE_FD_TYPE_NATIVE_SYNC); + struct pipe_fence_handle *f = + panfrost_fence_from_fd(ctx, fd, PIPE_FD_TYPE_NATIVE_SYNC); - close(fd); + close(fd); - return f; + return f; } diff --git a/src/gallium/drivers/panfrost/pan_fence.h b/src/gallium/drivers/panfrost/pan_fence.h index 350f3682343..6a8cc74dc95 100644 --- a/src/gallium/drivers/panfrost/pan_fence.h +++ b/src/gallium/drivers/panfrost/pan_fence.h @@ -30,29 +30,24 @@ struct panfrost_context; struct pipe_fence_handle { - struct pipe_reference reference; - uint32_t syncobj; - bool signaled; + struct pipe_reference reference; + uint32_t syncobj; + bool signaled; }; -void -panfrost_fence_reference(struct pipe_screen *pscreen, - struct pipe_fence_handle **ptr, - struct pipe_fence_handle *fence); +void panfrost_fence_reference(struct pipe_screen *pscreen, + struct pipe_fence_handle **ptr, + struct pipe_fence_handle *fence); -bool -panfrost_fence_finish(struct pipe_screen *pscreen, - struct pipe_context *ctx, - struct pipe_fence_handle *fence, - uint64_t timeout); +bool panfrost_fence_finish(struct pipe_screen *pscreen, + struct pipe_context *ctx, + struct pipe_fence_handle *fence, uint64_t timeout); -int -panfrost_fence_get_fd(struct pipe_screen *screen, - struct pipe_fence_handle *f); +int panfrost_fence_get_fd(struct pipe_screen *screen, + struct pipe_fence_handle *f); -struct pipe_fence_handle * -panfrost_fence_from_fd(struct panfrost_context *ctx, int fd, - enum pipe_fd_type type); +struct pipe_fence_handle *panfrost_fence_from_fd(struct panfrost_context *ctx, + int fd, + enum pipe_fd_type type); -struct pipe_fence_handle * -panfrost_fence_create(struct panfrost_context *ctx); +struct pipe_fence_handle *panfrost_fence_create(struct panfrost_context *ctx); diff --git a/src/gallium/drivers/panfrost/pan_helpers.c b/src/gallium/drivers/panfrost/pan_helpers.c index 2e2b9a6189e..fb27e102fc5 100644 --- a/src/gallium/drivers/panfrost/pan_helpers.c +++ b/src/gallium/drivers/panfrost/pan_helpers.c @@ -21,66 +21,66 @@ * SOFTWARE. 
*/ -#include "pan_context.h" #include "util/u_vbuf.h" +#include "pan_context.h" void panfrost_analyze_sysvals(struct panfrost_compiled_shader *ss) { - unsigned dirty = 0; - unsigned dirty_shader = PAN_DIRTY_STAGE_SHADER | PAN_DIRTY_STAGE_CONST; + unsigned dirty = 0; + unsigned dirty_shader = PAN_DIRTY_STAGE_SHADER | PAN_DIRTY_STAGE_CONST; - for (unsigned i = 0; i < ss->info.sysvals.sysval_count; ++i) { - switch (PAN_SYSVAL_TYPE(ss->info.sysvals.sysvals[i])) { - case PAN_SYSVAL_VIEWPORT_SCALE: - case PAN_SYSVAL_VIEWPORT_OFFSET: - dirty |= PAN_DIRTY_VIEWPORT; - break; + for (unsigned i = 0; i < ss->info.sysvals.sysval_count; ++i) { + switch (PAN_SYSVAL_TYPE(ss->info.sysvals.sysvals[i])) { + case PAN_SYSVAL_VIEWPORT_SCALE: + case PAN_SYSVAL_VIEWPORT_OFFSET: + dirty |= PAN_DIRTY_VIEWPORT; + break; - case PAN_SYSVAL_TEXTURE_SIZE: - dirty_shader |= PAN_DIRTY_STAGE_TEXTURE; - break; + case PAN_SYSVAL_TEXTURE_SIZE: + dirty_shader |= PAN_DIRTY_STAGE_TEXTURE; + break; - case PAN_SYSVAL_SSBO: - dirty_shader |= PAN_DIRTY_STAGE_SSBO; - break; + case PAN_SYSVAL_SSBO: + dirty_shader |= PAN_DIRTY_STAGE_SSBO; + break; - case PAN_SYSVAL_XFB: - dirty |= PAN_DIRTY_SO; - break; + case PAN_SYSVAL_XFB: + dirty |= PAN_DIRTY_SO; + break; - case PAN_SYSVAL_SAMPLER: - dirty_shader |= PAN_DIRTY_STAGE_SAMPLER; - break; + case PAN_SYSVAL_SAMPLER: + dirty_shader |= PAN_DIRTY_STAGE_SAMPLER; + break; - case PAN_SYSVAL_IMAGE_SIZE: - dirty_shader |= PAN_DIRTY_STAGE_IMAGE; - break; + case PAN_SYSVAL_IMAGE_SIZE: + dirty_shader |= PAN_DIRTY_STAGE_IMAGE; + break; - case PAN_SYSVAL_NUM_WORK_GROUPS: - case PAN_SYSVAL_LOCAL_GROUP_SIZE: - case PAN_SYSVAL_WORK_DIM: - case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS: - case PAN_SYSVAL_NUM_VERTICES: - dirty |= PAN_DIRTY_PARAMS; - break; + case PAN_SYSVAL_NUM_WORK_GROUPS: + case PAN_SYSVAL_LOCAL_GROUP_SIZE: + case PAN_SYSVAL_WORK_DIM: + case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS: + case PAN_SYSVAL_NUM_VERTICES: + dirty |= PAN_DIRTY_PARAMS; + break; - case PAN_SYSVAL_DRAWID: - dirty |= PAN_DIRTY_DRAWID; - break; + case PAN_SYSVAL_DRAWID: + dirty |= PAN_DIRTY_DRAWID; + break; - case PAN_SYSVAL_SAMPLE_POSITIONS: - case PAN_SYSVAL_MULTISAMPLED: - case PAN_SYSVAL_RT_CONVERSION: - /* Nothing beyond the batch itself */ - break; - default: - unreachable("Invalid sysval"); - } - } + case PAN_SYSVAL_SAMPLE_POSITIONS: + case PAN_SYSVAL_MULTISAMPLED: + case PAN_SYSVAL_RT_CONVERSION: + /* Nothing beyond the batch itself */ + break; + default: + unreachable("Invalid sysval"); + } + } - ss->dirty_3d = dirty; - ss->dirty_shader = dirty_shader; + ss->dirty_3d = dirty; + ss->dirty_shader = dirty_shader; } /* @@ -93,25 +93,22 @@ panfrost_get_index_buffer(struct panfrost_batch *batch, const struct pipe_draw_info *info, const struct pipe_draw_start_count_bias *draw) { - struct panfrost_resource *rsrc = pan_resource(info->index.resource); - off_t offset = draw->start * info->index_size; + struct panfrost_resource *rsrc = pan_resource(info->index.resource); + off_t offset = draw->start * info->index_size; - if (!info->has_user_indices) { - /* Only resources can be directly mapped */ - panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX); - return rsrc->image.data.bo->ptr.gpu + offset; - } else { - /* Otherwise, we need to upload to transient memory */ - const uint8_t *ibuf8 = (const uint8_t *) info->index.user; - struct panfrost_ptr T = - pan_pool_alloc_aligned(&batch->pool.base, - draw->count * - info->index_size, - info->index_size); + if (!info->has_user_indices) { + /* Only resources can be directly mapped */ + 
panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX); + return rsrc->image.data.bo->ptr.gpu + offset; + } else { + /* Otherwise, we need to upload to transient memory */ + const uint8_t *ibuf8 = (const uint8_t *)info->index.user; + struct panfrost_ptr T = pan_pool_alloc_aligned( + &batch->pool.base, draw->count * info->index_size, info->index_size); - memcpy(T.cpu, ibuf8 + offset, draw->count * info->index_size); - return T.gpu; - } + memcpy(T.cpu, ibuf8 + offset, draw->count * info->index_size); + return T.gpu; + } } /* Gets a GPU address for the associated index buffer. Only gauranteed to be @@ -126,34 +123,30 @@ panfrost_get_index_buffer_bounded(struct panfrost_batch *batch, const struct pipe_draw_start_count_bias *draw, unsigned *min_index, unsigned *max_index) { - struct panfrost_resource *rsrc = pan_resource(info->index.resource); - struct panfrost_context *ctx = batch->ctx; - bool needs_indices = true; + struct panfrost_resource *rsrc = pan_resource(info->index.resource); + struct panfrost_context *ctx = batch->ctx; + bool needs_indices = true; - if (info->index_bounds_valid) { - *min_index = info->min_index; - *max_index = info->max_index; - needs_indices = false; - } else if (!info->has_user_indices) { - /* Check the cache */ - needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache, - draw->start, - draw->count, - min_index, - max_index); - } + if (info->index_bounds_valid) { + *min_index = info->min_index; + *max_index = info->max_index; + needs_indices = false; + } else if (!info->has_user_indices) { + /* Check the cache */ + needs_indices = !panfrost_minmax_cache_get( + rsrc->index_cache, draw->start, draw->count, min_index, max_index); + } - if (needs_indices) { - /* Fallback */ - u_vbuf_get_minmax_index(&ctx->base, info, draw, min_index, max_index); + if (needs_indices) { + /* Fallback */ + u_vbuf_get_minmax_index(&ctx->base, info, draw, min_index, max_index); - if (!info->has_user_indices) - panfrost_minmax_cache_add(rsrc->index_cache, - draw->start, draw->count, - *min_index, *max_index); - } + if (!info->has_user_indices) + panfrost_minmax_cache_add(rsrc->index_cache, draw->start, draw->count, + *min_index, *max_index); + } - return panfrost_get_index_buffer(batch, info, draw); + return panfrost_get_index_buffer(batch, info, draw); } /** @@ -163,26 +156,24 @@ panfrost_get_index_buffer_bounded(struct panfrost_batch *batch, * elements CSO create time, not at draw time. 
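
panfrost_get_index_buffer_bounded, in the hunk just above, first consults a min/max cache keyed by the (start, count) range and only falls back to u_vbuf_get_minmax_index on a miss, filling the cache afterwards. A standalone sketch of that check-cache / compute / fill-cache flow with a trivial one-entry cache; the real panfrost_minmax_cache is more elaborate, and the toy_* names are hypothetical:

#include <stdbool.h>
#include <stdint.h>

/* One-entry memo of previously computed index bounds, keyed by the
 * (start, count) range that was scanned. */
struct toy_minmax_cache {
   bool valid;
   unsigned start, count;
   unsigned min_index, max_index;
};

static void
toy_get_bounds(struct toy_minmax_cache *cache, const uint16_t *indices,
               unsigned start, unsigned count, unsigned *min_out,
               unsigned *max_out)
{
   if (cache->valid && cache->start == start && cache->count == count) {
      *min_out = cache->min_index; /* cache hit: skip the scan entirely */
      *max_out = cache->max_index;
      return;
   }

   /* Miss: scan the range (the driver delegates to u_vbuf_get_minmax_index
    * here), then remember the result for the next identical draw. */
   unsigned lo = ~0u, hi = 0;
   for (unsigned i = 0; i < count; ++i) {
      unsigned v = indices[start + i];
      if (v < lo)
         lo = v;
      if (v > hi)
         hi = v;
   }

   *min_out = cache->min_index = lo;
   *max_out = cache->max_index = hi;
   cache->start = start;
   cache->count = count;
   cache->valid = true;
}
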
*/ unsigned -pan_assign_vertex_buffer(struct pan_vertex_buffer *buffers, - unsigned *nr_bufs, - unsigned vbi, - unsigned divisor) +pan_assign_vertex_buffer(struct pan_vertex_buffer *buffers, unsigned *nr_bufs, + unsigned vbi, unsigned divisor) { - /* Look up the buffer */ - for (unsigned i = 0; i < (*nr_bufs); ++i) { - if (buffers[i].vbi == vbi && buffers[i].divisor == divisor) - return i; - } + /* Look up the buffer */ + for (unsigned i = 0; i < (*nr_bufs); ++i) { + if (buffers[i].vbi == vbi && buffers[i].divisor == divisor) + return i; + } - /* Else, create a new buffer */ - unsigned idx = (*nr_bufs)++; + /* Else, create a new buffer */ + unsigned idx = (*nr_bufs)++; - buffers[idx] = (struct pan_vertex_buffer) { - .vbi = vbi, - .divisor = divisor, - }; + buffers[idx] = (struct pan_vertex_buffer){ + .vbi = vbi, + .divisor = divisor, + }; - return idx; + return idx; } /* @@ -194,8 +185,8 @@ pan_assign_vertex_buffer(struct pan_vertex_buffer *buffers, static void panfrost_draw_target(struct panfrost_batch *batch, unsigned target) { - batch->draws |= target; - batch->resolve |= target; + batch->draws |= target; + batch->resolve |= target; } /* @@ -206,34 +197,34 @@ panfrost_draw_target(struct panfrost_batch *batch, unsigned target) void panfrost_set_batch_masks_blend(struct panfrost_batch *batch) { - struct panfrost_context *ctx = batch->ctx; - struct panfrost_blend_state *blend = ctx->blend; + struct panfrost_context *ctx = batch->ctx; + struct panfrost_blend_state *blend = ctx->blend; - for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) { - if (blend->info[i].enabled && batch->key.cbufs[i]) - panfrost_draw_target(batch, PIPE_CLEAR_COLOR0 << i); - } + for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) { + if (blend->info[i].enabled && batch->key.cbufs[i]) + panfrost_draw_target(batch, PIPE_CLEAR_COLOR0 << i); + } } void panfrost_set_batch_masks_zs(struct panfrost_batch *batch) { - struct panfrost_context *ctx = batch->ctx; - struct pipe_depth_stencil_alpha_state *zsa = (void *) ctx->depth_stencil; + struct panfrost_context *ctx = batch->ctx; + struct pipe_depth_stencil_alpha_state *zsa = (void *)ctx->depth_stencil; - /* Assume depth is read (TODO: perf) */ - if (zsa->depth_enabled) - batch->read |= PIPE_CLEAR_DEPTH; + /* Assume depth is read (TODO: perf) */ + if (zsa->depth_enabled) + batch->read |= PIPE_CLEAR_DEPTH; - if (zsa->depth_writemask) - panfrost_draw_target(batch, PIPE_CLEAR_DEPTH); + if (zsa->depth_writemask) + panfrost_draw_target(batch, PIPE_CLEAR_DEPTH); - if (zsa->stencil[0].enabled) { - panfrost_draw_target(batch, PIPE_CLEAR_STENCIL); + if (zsa->stencil[0].enabled) { + panfrost_draw_target(batch, PIPE_CLEAR_STENCIL); - /* Assume stencil is read (TODO: perf) */ - batch->read |= PIPE_CLEAR_STENCIL; - } + /* Assume stencil is read (TODO: perf) */ + batch->read |= PIPE_CLEAR_STENCIL; + } } void @@ -241,21 +232,20 @@ panfrost_track_image_access(struct panfrost_batch *batch, enum pipe_shader_type stage, struct pipe_image_view *image) { - struct panfrost_resource *rsrc = pan_resource(image->resource); + struct panfrost_resource *rsrc = pan_resource(image->resource); - if (image->shader_access & PIPE_IMAGE_ACCESS_WRITE) { - panfrost_batch_write_rsrc(batch, rsrc, stage); + if (image->shader_access & PIPE_IMAGE_ACCESS_WRITE) { + panfrost_batch_write_rsrc(batch, rsrc, stage); - bool is_buffer = rsrc->base.target == PIPE_BUFFER; - unsigned level = is_buffer ? 
0 : image->u.tex.level; - BITSET_SET(rsrc->valid.data, level); + bool is_buffer = rsrc->base.target == PIPE_BUFFER; + unsigned level = is_buffer ? 0 : image->u.tex.level; + BITSET_SET(rsrc->valid.data, level); - if (is_buffer) { - util_range_add(&rsrc->base, &rsrc->valid_buffer_range, - 0, rsrc->base.width0); - } - } else { - panfrost_batch_read_rsrc(batch, rsrc, stage); - } + if (is_buffer) { + util_range_add(&rsrc->base, &rsrc->valid_buffer_range, 0, + rsrc->base.width0); + } + } else { + panfrost_batch_read_rsrc(batch, rsrc, stage); + } } - diff --git a/src/gallium/drivers/panfrost/pan_job.c b/src/gallium/drivers/panfrost/pan_job.c index 0736ea41492..16516f11d28 100644 --- a/src/gallium/drivers/panfrost/pan_job.c +++ b/src/gallium/drivers/panfrost/pan_job.c @@ -28,35 +28,36 @@ #include "drm-uapi/panfrost_drm.h" -#include "pan_bo.h" -#include "pan_context.h" +#include "util/format/u_format.h" #include "util/hash_table.h" #include "util/ralloc.h" -#include "util/format/u_format.h" -#include "util/u_pack_color.h" #include "util/rounding.h" #include "util/u_framebuffer.h" -#include "pan_util.h" +#include "util/u_pack_color.h" #include "decode.h" +#include "pan_bo.h" +#include "pan_context.h" +#include "pan_util.h" -#define foreach_batch(ctx, idx) \ - BITSET_FOREACH_SET(idx, ctx->batches.active, PAN_MAX_BATCHES) +#define foreach_batch(ctx, idx) \ + BITSET_FOREACH_SET(idx, ctx->batches.active, PAN_MAX_BATCHES) static unsigned panfrost_batch_idx(struct panfrost_batch *batch) { - return batch - batch->ctx->batches.slots; + return batch - batch->ctx->batches.slots; } /* Adds the BO backing surface to a batch if the surface is non-null */ static void -panfrost_batch_add_surface(struct panfrost_batch *batch, struct pipe_surface *surf) +panfrost_batch_add_surface(struct panfrost_batch *batch, + struct pipe_surface *surf) { - if (surf) { - struct panfrost_resource *rsrc = pan_resource(surf->texture); - panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_FRAGMENT); - } + if (surf) { + struct panfrost_resource *rsrc = pan_resource(surf->texture); + panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_FRAGMENT); + } } static void @@ -64,115 +65,116 @@ panfrost_batch_init(struct panfrost_context *ctx, const struct pipe_framebuffer_state *key, struct panfrost_batch *batch) { - struct pipe_screen *pscreen = ctx->base.screen; - struct panfrost_screen *screen = pan_screen(pscreen); - struct panfrost_device *dev = &screen->dev; + struct pipe_screen *pscreen = ctx->base.screen; + struct panfrost_screen *screen = pan_screen(pscreen); + struct panfrost_device *dev = &screen->dev; - batch->ctx = ctx; + batch->ctx = ctx; - batch->seqnum = ++ctx->batches.seqnum; + batch->seqnum = ++ctx->batches.seqnum; - util_dynarray_init(&batch->bos, NULL); + util_dynarray_init(&batch->bos, NULL); - batch->minx = batch->miny = ~0; - batch->maxx = batch->maxy = 0; + batch->minx = batch->miny = ~0; + batch->maxx = batch->maxy = 0; - util_copy_framebuffer_state(&batch->key, key); + util_copy_framebuffer_state(&batch->key, key); - /* Preallocate the main pool, since every batch has at least one job - * structure so it will be used */ - panfrost_pool_init(&batch->pool, NULL, dev, 0, 65536, "Batch pool", true, true); + /* Preallocate the main pool, since every batch has at least one job + * structure so it will be used */ + panfrost_pool_init(&batch->pool, NULL, dev, 0, 65536, "Batch pool", true, + true); - /* Don't preallocate the invisible pool, since not every batch will use - * the pre-allocation, particularly if the varyings are 
larger than the - * preallocation and a reallocation is needed after anyway. */ - panfrost_pool_init(&batch->invisible_pool, NULL, dev, - PAN_BO_INVISIBLE, 65536, "Varyings", false, true); + /* Don't preallocate the invisible pool, since not every batch will use + * the pre-allocation, particularly if the varyings are larger than the + * preallocation and a reallocation is needed after anyway. */ + panfrost_pool_init(&batch->invisible_pool, NULL, dev, PAN_BO_INVISIBLE, + 65536, "Varyings", false, true); - for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) - panfrost_batch_add_surface(batch, batch->key.cbufs[i]); + for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) + panfrost_batch_add_surface(batch, batch->key.cbufs[i]); - panfrost_batch_add_surface(batch, batch->key.zsbuf); + panfrost_batch_add_surface(batch, batch->key.zsbuf); - screen->vtbl.init_batch(batch); + screen->vtbl.init_batch(batch); } static void -panfrost_batch_cleanup(struct panfrost_context *ctx, struct panfrost_batch *batch) +panfrost_batch_cleanup(struct panfrost_context *ctx, + struct panfrost_batch *batch) { - struct panfrost_device *dev = pan_device(ctx->base.screen); + struct panfrost_device *dev = pan_device(ctx->base.screen); - assert(batch->seqnum); + assert(batch->seqnum); - if (ctx->batch == batch) - ctx->batch = NULL; + if (ctx->batch == batch) + ctx->batch = NULL; - unsigned batch_idx = panfrost_batch_idx(batch); + unsigned batch_idx = panfrost_batch_idx(batch); - pan_bo_access *flags = util_dynarray_begin(&batch->bos); - unsigned end_bo = util_dynarray_num_elements(&batch->bos, pan_bo_access); + pan_bo_access *flags = util_dynarray_begin(&batch->bos); + unsigned end_bo = util_dynarray_num_elements(&batch->bos, pan_bo_access); - for (int i = 0; i < end_bo; ++i) { - if (!flags[i]) - continue; + for (int i = 0; i < end_bo; ++i) { + if (!flags[i]) + continue; - struct panfrost_bo *bo = pan_lookup_bo(dev, i); - panfrost_bo_unreference(bo); - } + struct panfrost_bo *bo = pan_lookup_bo(dev, i); + panfrost_bo_unreference(bo); + } - /* There is no more writer for anything we wrote */ - hash_table_foreach(ctx->writers, ent) { - if (ent->data == batch) - _mesa_hash_table_remove(ctx->writers, ent); - } + /* There is no more writer for anything we wrote */ + hash_table_foreach(ctx->writers, ent) { + if (ent->data == batch) + _mesa_hash_table_remove(ctx->writers, ent); + } - panfrost_pool_cleanup(&batch->pool); - panfrost_pool_cleanup(&batch->invisible_pool); + panfrost_pool_cleanup(&batch->pool); + panfrost_pool_cleanup(&batch->invisible_pool); - util_unreference_framebuffer_state(&batch->key); + util_unreference_framebuffer_state(&batch->key); - util_dynarray_fini(&batch->bos); + util_dynarray_fini(&batch->bos); - memset(batch, 0, sizeof(*batch)); - BITSET_CLEAR(ctx->batches.active, batch_idx); + memset(batch, 0, sizeof(*batch)); + BITSET_CLEAR(ctx->batches.active, batch_idx); } -static void -panfrost_batch_submit(struct panfrost_context *ctx, - struct panfrost_batch *batch); +static void panfrost_batch_submit(struct panfrost_context *ctx, + struct panfrost_batch *batch); static struct panfrost_batch * panfrost_get_batch(struct panfrost_context *ctx, const struct pipe_framebuffer_state *key) { - struct panfrost_batch *batch = NULL; + struct panfrost_batch *batch = NULL; - for (unsigned i = 0; i < PAN_MAX_BATCHES; i++) { - if (ctx->batches.slots[i].seqnum && - util_framebuffer_state_equal(&ctx->batches.slots[i].key, key)) { - /* We found a match, increase the seqnum for the LRU - * eviction logic. 
- */ - ctx->batches.slots[i].seqnum = ++ctx->batches.seqnum; - return &ctx->batches.slots[i]; - } + for (unsigned i = 0; i < PAN_MAX_BATCHES; i++) { + if (ctx->batches.slots[i].seqnum && + util_framebuffer_state_equal(&ctx->batches.slots[i].key, key)) { + /* We found a match, increase the seqnum for the LRU + * eviction logic. + */ + ctx->batches.slots[i].seqnum = ++ctx->batches.seqnum; + return &ctx->batches.slots[i]; + } - if (!batch || batch->seqnum > ctx->batches.slots[i].seqnum) - batch = &ctx->batches.slots[i]; - } + if (!batch || batch->seqnum > ctx->batches.slots[i].seqnum) + batch = &ctx->batches.slots[i]; + } - assert(batch); + assert(batch); - /* The selected slot is used, we need to flush the batch */ - if (batch->seqnum) - panfrost_batch_submit(ctx, batch); + /* The selected slot is used, we need to flush the batch */ + if (batch->seqnum) + panfrost_batch_submit(ctx, batch); - panfrost_batch_init(ctx, key, batch); + panfrost_batch_init(ctx, key, batch); - unsigned batch_idx = panfrost_batch_idx(batch); - BITSET_SET(ctx->batches.active, batch_idx); + unsigned batch_idx = panfrost_batch_idx(batch); + BITSET_SET(ctx->batches.active, batch_idx); - return batch; + return batch; } /* Get the job corresponding to the FBO we're currently rendering into */ @@ -180,152 +182,151 @@ panfrost_get_batch(struct panfrost_context *ctx, struct panfrost_batch * panfrost_get_batch_for_fbo(struct panfrost_context *ctx) { - /* If we already began rendering, use that */ + /* If we already began rendering, use that */ - if (ctx->batch) { - assert(util_framebuffer_state_equal(&ctx->batch->key, - &ctx->pipe_framebuffer)); - return ctx->batch; - } + if (ctx->batch) { + assert(util_framebuffer_state_equal(&ctx->batch->key, + &ctx->pipe_framebuffer)); + return ctx->batch; + } - /* If not, look up the job */ - struct panfrost_batch *batch = panfrost_get_batch(ctx, - &ctx->pipe_framebuffer); + /* If not, look up the job */ + struct panfrost_batch *batch = + panfrost_get_batch(ctx, &ctx->pipe_framebuffer); - /* Set this job as the current FBO job. Will be reset when updating the - * FB state and when submitting or releasing a job. - */ - ctx->batch = batch; - panfrost_dirty_state_all(ctx); - return batch; + /* Set this job as the current FBO job. Will be reset when updating the + * FB state and when submitting or releasing a job. + */ + ctx->batch = batch; + panfrost_dirty_state_all(ctx); + return batch; } struct panfrost_batch * -panfrost_get_fresh_batch_for_fbo(struct panfrost_context *ctx, const char *reason) +panfrost_get_fresh_batch_for_fbo(struct panfrost_context *ctx, + const char *reason) { - struct panfrost_batch *batch; + struct panfrost_batch *batch; - batch = panfrost_get_batch(ctx, &ctx->pipe_framebuffer); - panfrost_dirty_state_all(ctx); + batch = panfrost_get_batch(ctx, &ctx->pipe_framebuffer); + panfrost_dirty_state_all(ctx); - /* We only need to submit and get a fresh batch if there is no - * draw/clear queued. Otherwise we may reuse the batch. */ + /* We only need to submit and get a fresh batch if there is no + * draw/clear queued. Otherwise we may reuse the batch. 
*/ - if (batch->scoreboard.first_job) { - perf_debug_ctx(ctx, "Flushing the current FBO due to: %s", reason); - panfrost_batch_submit(ctx, batch); - batch = panfrost_get_batch(ctx, &ctx->pipe_framebuffer); - } + if (batch->scoreboard.first_job) { + perf_debug_ctx(ctx, "Flushing the current FBO due to: %s", reason); + panfrost_batch_submit(ctx, batch); + batch = panfrost_get_batch(ctx, &ctx->pipe_framebuffer); + } - ctx->batch = batch; - return batch; + ctx->batch = batch; + return batch; } -static bool -panfrost_batch_uses_resource(struct panfrost_batch *batch, - struct panfrost_resource *rsrc); +static bool panfrost_batch_uses_resource(struct panfrost_batch *batch, + struct panfrost_resource *rsrc); static void panfrost_batch_update_access(struct panfrost_batch *batch, struct panfrost_resource *rsrc, bool writes) { - struct panfrost_context *ctx = batch->ctx; - uint32_t batch_idx = panfrost_batch_idx(batch); - struct hash_entry *entry = _mesa_hash_table_search(ctx->writers, rsrc); - struct panfrost_batch *writer = entry ? entry->data : NULL; + struct panfrost_context *ctx = batch->ctx; + uint32_t batch_idx = panfrost_batch_idx(batch); + struct hash_entry *entry = _mesa_hash_table_search(ctx->writers, rsrc); + struct panfrost_batch *writer = entry ? entry->data : NULL; - /* Both reads and writes flush the existing writer */ - if (writer != NULL && writer != batch) - panfrost_batch_submit(ctx, writer); + /* Both reads and writes flush the existing writer */ + if (writer != NULL && writer != batch) + panfrost_batch_submit(ctx, writer); - /* Writes (only) flush readers too */ - if (writes) { - unsigned i; - foreach_batch(ctx, i) { - struct panfrost_batch *batch = &ctx->batches.slots[i]; + /* Writes (only) flush readers too */ + if (writes) { + unsigned i; + foreach_batch(ctx, i) { + struct panfrost_batch *batch = &ctx->batches.slots[i]; - /* Skip the entry if this our batch. */ - if (i == batch_idx) - continue; + /* Skip the entry if this our batch. 
*/ + if (i == batch_idx) + continue; - /* Submit if it's a user */ - if (panfrost_batch_uses_resource(batch, rsrc)) - panfrost_batch_submit(ctx, batch); - } - } + /* Submit if it's a user */ + if (panfrost_batch_uses_resource(batch, rsrc)) + panfrost_batch_submit(ctx, batch); + } + } - if (writes) { - _mesa_hash_table_insert(ctx->writers, rsrc, batch); - } + if (writes) { + _mesa_hash_table_insert(ctx->writers, rsrc, batch); + } } static pan_bo_access * panfrost_batch_get_bo_access(struct panfrost_batch *batch, unsigned handle) { - unsigned size = util_dynarray_num_elements(&batch->bos, pan_bo_access); + unsigned size = util_dynarray_num_elements(&batch->bos, pan_bo_access); - if (handle >= size) { - unsigned grow = handle + 1 - size; + if (handle >= size) { + unsigned grow = handle + 1 - size; - memset(util_dynarray_grow(&batch->bos, pan_bo_access, grow), - 0, grow * sizeof(pan_bo_access)); - } + memset(util_dynarray_grow(&batch->bos, pan_bo_access, grow), 0, + grow * sizeof(pan_bo_access)); + } - return util_dynarray_element(&batch->bos, pan_bo_access, handle); + return util_dynarray_element(&batch->bos, pan_bo_access, handle); } static bool panfrost_batch_uses_resource(struct panfrost_batch *batch, struct panfrost_resource *rsrc) { - /* A resource is used iff its current BO is used */ - uint32_t handle = rsrc->image.data.bo->gem_handle; - unsigned size = util_dynarray_num_elements(&batch->bos, pan_bo_access); + /* A resource is used iff its current BO is used */ + uint32_t handle = rsrc->image.data.bo->gem_handle; + unsigned size = util_dynarray_num_elements(&batch->bos, pan_bo_access); - /* If out of bounds, certainly not used */ - if (handle >= size) - return false; + /* If out of bounds, certainly not used */ + if (handle >= size) + return false; - /* Otherwise check if nonzero access */ - return !!(*util_dynarray_element(&batch->bos, pan_bo_access, handle)); + /* Otherwise check if nonzero access */ + return !!(*util_dynarray_element(&batch->bos, pan_bo_access, handle)); } static void -panfrost_batch_add_bo_old(struct panfrost_batch *batch, - struct panfrost_bo *bo, uint32_t flags) +panfrost_batch_add_bo_old(struct panfrost_batch *batch, struct panfrost_bo *bo, + uint32_t flags) { - if (!bo) - return; + if (!bo) + return; - pan_bo_access *entry = - panfrost_batch_get_bo_access(batch, bo->gem_handle); - pan_bo_access old_flags = *entry; + pan_bo_access *entry = panfrost_batch_get_bo_access(batch, bo->gem_handle); + pan_bo_access old_flags = *entry; - if (!old_flags) { - batch->num_bos++; - panfrost_bo_reference(bo); - } + if (!old_flags) { + batch->num_bos++; + panfrost_bo_reference(bo); + } - if (old_flags == flags) - return; + if (old_flags == flags) + return; - flags |= old_flags; - *entry = flags; + flags |= old_flags; + *entry = flags; } static uint32_t panfrost_access_for_stage(enum pipe_shader_type stage) { - return (stage == PIPE_SHADER_FRAGMENT) ? - PAN_BO_ACCESS_FRAGMENT : PAN_BO_ACCESS_VERTEX_TILER; + return (stage == PIPE_SHADER_FRAGMENT) ? 
PAN_BO_ACCESS_FRAGMENT + : PAN_BO_ACCESS_VERTEX_TILER; } void -panfrost_batch_add_bo(struct panfrost_batch *batch, - struct panfrost_bo *bo, enum pipe_shader_type stage) +panfrost_batch_add_bo(struct panfrost_batch *batch, struct panfrost_bo *bo, + enum pipe_shader_type stage) { - panfrost_batch_add_bo_old(batch, bo, PAN_BO_ACCESS_READ | - panfrost_access_for_stage(stage)); + panfrost_batch_add_bo_old( + batch, bo, PAN_BO_ACCESS_READ | panfrost_access_for_stage(stage)); } void @@ -333,31 +334,31 @@ panfrost_batch_read_rsrc(struct panfrost_batch *batch, struct panfrost_resource *rsrc, enum pipe_shader_type stage) { - uint32_t access = PAN_BO_ACCESS_READ | - panfrost_access_for_stage(stage); + uint32_t access = PAN_BO_ACCESS_READ | panfrost_access_for_stage(stage); - panfrost_batch_add_bo_old(batch, rsrc->image.data.bo, access); + panfrost_batch_add_bo_old(batch, rsrc->image.data.bo, access); - if (rsrc->separate_stencil) - panfrost_batch_add_bo_old(batch, rsrc->separate_stencil->image.data.bo, access); + if (rsrc->separate_stencil) + panfrost_batch_add_bo_old(batch, rsrc->separate_stencil->image.data.bo, + access); - panfrost_batch_update_access(batch, rsrc, false); + panfrost_batch_update_access(batch, rsrc, false); } void panfrost_batch_write_rsrc(struct panfrost_batch *batch, - struct panfrost_resource *rsrc, - enum pipe_shader_type stage) + struct panfrost_resource *rsrc, + enum pipe_shader_type stage) { - uint32_t access = PAN_BO_ACCESS_WRITE | - panfrost_access_for_stage(stage); + uint32_t access = PAN_BO_ACCESS_WRITE | panfrost_access_for_stage(stage); - panfrost_batch_add_bo_old(batch, rsrc->image.data.bo, access); + panfrost_batch_add_bo_old(batch, rsrc->image.data.bo, access); - if (rsrc->separate_stencil) - panfrost_batch_add_bo_old(batch, rsrc->separate_stencil->image.data.bo, access); + if (rsrc->separate_stencil) + panfrost_batch_add_bo_old(batch, rsrc->separate_stencil->image.data.bo, + access); - panfrost_batch_update_access(batch, rsrc, true); + panfrost_batch_update_access(batch, rsrc, true); } struct panfrost_bo * @@ -365,324 +366,321 @@ panfrost_batch_create_bo(struct panfrost_batch *batch, size_t size, uint32_t create_flags, enum pipe_shader_type stage, const char *label) { - struct panfrost_bo *bo; + struct panfrost_bo *bo; - bo = panfrost_bo_create(pan_device(batch->ctx->base.screen), size, - create_flags, label); - panfrost_batch_add_bo(batch, bo, stage); + bo = panfrost_bo_create(pan_device(batch->ctx->base.screen), size, + create_flags, label); + panfrost_batch_add_bo(batch, bo, stage); - /* panfrost_batch_add_bo() has retained a reference and - * panfrost_bo_create() initialize the refcnt to 1, so let's - * unreference the BO here so it gets released when the batch is - * destroyed (unless it's retained by someone else in the meantime). - */ - panfrost_bo_unreference(bo); - return bo; + /* panfrost_batch_add_bo() has retained a reference and + * panfrost_bo_create() initialize the refcnt to 1, so let's + * unreference the BO here so it gets released when the batch is + * destroyed (unless it's retained by someone else in the meantime). 
+ */ + panfrost_bo_unreference(bo); + return bo; } struct panfrost_bo * panfrost_batch_get_scratchpad(struct panfrost_batch *batch, - unsigned size_per_thread, - unsigned thread_tls_alloc, - unsigned core_id_range) + unsigned size_per_thread, + unsigned thread_tls_alloc, unsigned core_id_range) { - unsigned size = panfrost_get_total_stack_size(size_per_thread, - thread_tls_alloc, - core_id_range); + unsigned size = panfrost_get_total_stack_size( + size_per_thread, thread_tls_alloc, core_id_range); - if (batch->scratchpad) { - assert(batch->scratchpad->size >= size); - } else { - batch->scratchpad = panfrost_batch_create_bo(batch, size, - PAN_BO_INVISIBLE, - PIPE_SHADER_VERTEX, - "Thread local storage"); + if (batch->scratchpad) { + assert(batch->scratchpad->size >= size); + } else { + batch->scratchpad = + panfrost_batch_create_bo(batch, size, PAN_BO_INVISIBLE, + PIPE_SHADER_VERTEX, "Thread local storage"); - panfrost_batch_add_bo(batch, batch->scratchpad, - PIPE_SHADER_FRAGMENT); - } + panfrost_batch_add_bo(batch, batch->scratchpad, PIPE_SHADER_FRAGMENT); + } - return batch->scratchpad; + return batch->scratchpad; } struct panfrost_bo * -panfrost_batch_get_shared_memory(struct panfrost_batch *batch, - unsigned size, - unsigned workgroup_count) +panfrost_batch_get_shared_memory(struct panfrost_batch *batch, unsigned size, + unsigned workgroup_count) { - if (batch->shared_memory) { - assert(batch->shared_memory->size >= size); - } else { - batch->shared_memory = panfrost_batch_create_bo(batch, size, - PAN_BO_INVISIBLE, - PIPE_SHADER_VERTEX, - "Workgroup shared memory"); - } + if (batch->shared_memory) { + assert(batch->shared_memory->size >= size); + } else { + batch->shared_memory = panfrost_batch_create_bo( + batch, size, PAN_BO_INVISIBLE, PIPE_SHADER_VERTEX, + "Workgroup shared memory"); + } - return batch->shared_memory; + return batch->shared_memory; } static void panfrost_batch_to_fb_info(const struct panfrost_batch *batch, - struct pan_fb_info *fb, - struct pan_image_view *rts, - struct pan_image_view *zs, - struct pan_image_view *s, + struct pan_fb_info *fb, struct pan_image_view *rts, + struct pan_image_view *zs, struct pan_image_view *s, bool reserve) { - memset(fb, 0, sizeof(*fb)); - memset(rts, 0, sizeof(*rts) * 8); - memset(zs, 0, sizeof(*zs)); - memset(s, 0, sizeof(*s)); + memset(fb, 0, sizeof(*fb)); + memset(rts, 0, sizeof(*rts) * 8); + memset(zs, 0, sizeof(*zs)); + memset(s, 0, sizeof(*s)); - fb->width = batch->key.width; - fb->height = batch->key.height; - fb->extent.minx = batch->minx; - fb->extent.miny = batch->miny; - fb->extent.maxx = batch->maxx - 1; - fb->extent.maxy = batch->maxy - 1; - fb->nr_samples = util_framebuffer_get_num_samples(&batch->key); - fb->rt_count = batch->key.nr_cbufs; - fb->sprite_coord_origin = pan_tristate_get(batch->sprite_coord_origin); - fb->first_provoking_vertex = pan_tristate_get(batch->first_provoking_vertex); + fb->width = batch->key.width; + fb->height = batch->key.height; + fb->extent.minx = batch->minx; + fb->extent.miny = batch->miny; + fb->extent.maxx = batch->maxx - 1; + fb->extent.maxy = batch->maxy - 1; + fb->nr_samples = util_framebuffer_get_num_samples(&batch->key); + fb->rt_count = batch->key.nr_cbufs; + fb->sprite_coord_origin = pan_tristate_get(batch->sprite_coord_origin); + fb->first_provoking_vertex = pan_tristate_get(batch->first_provoking_vertex); - static const unsigned char id_swz[] = { - PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W, - }; + static const unsigned char id_swz[] = { + PIPE_SWIZZLE_X, + 
PIPE_SWIZZLE_Y, + PIPE_SWIZZLE_Z, + PIPE_SWIZZLE_W, + }; - for (unsigned i = 0; i < fb->rt_count; i++) { - struct pipe_surface *surf = batch->key.cbufs[i]; + for (unsigned i = 0; i < fb->rt_count; i++) { + struct pipe_surface *surf = batch->key.cbufs[i]; - if (!surf) - continue; + if (!surf) + continue; - struct panfrost_resource *prsrc = pan_resource(surf->texture); - unsigned mask = PIPE_CLEAR_COLOR0 << i; + struct panfrost_resource *prsrc = pan_resource(surf->texture); + unsigned mask = PIPE_CLEAR_COLOR0 << i; - if (batch->clear & mask) { - fb->rts[i].clear = true; - memcpy(fb->rts[i].clear_value, batch->clear_color[i], - sizeof((fb->rts[i].clear_value))); - } + if (batch->clear & mask) { + fb->rts[i].clear = true; + memcpy(fb->rts[i].clear_value, batch->clear_color[i], + sizeof((fb->rts[i].clear_value))); + } - fb->rts[i].discard = !reserve && !(batch->resolve & mask); + fb->rts[i].discard = !reserve && !(batch->resolve & mask); - rts[i].format = surf->format; - rts[i].dim = MALI_TEXTURE_DIMENSION_2D; - rts[i].last_level = rts[i].first_level = surf->u.tex.level; - rts[i].first_layer = surf->u.tex.first_layer; - rts[i].last_layer = surf->u.tex.last_layer; - rts[i].image = &prsrc->image; - rts[i].nr_samples = surf->nr_samples ? : MAX2(surf->texture->nr_samples, 1); - memcpy(rts[i].swizzle, id_swz, sizeof(rts[i].swizzle)); - fb->rts[i].crc_valid = &prsrc->valid.crc; - fb->rts[i].view = &rts[i]; + rts[i].format = surf->format; + rts[i].dim = MALI_TEXTURE_DIMENSION_2D; + rts[i].last_level = rts[i].first_level = surf->u.tex.level; + rts[i].first_layer = surf->u.tex.first_layer; + rts[i].last_layer = surf->u.tex.last_layer; + rts[i].image = &prsrc->image; + rts[i].nr_samples = + surf->nr_samples ?: MAX2(surf->texture->nr_samples, 1); + memcpy(rts[i].swizzle, id_swz, sizeof(rts[i].swizzle)); + fb->rts[i].crc_valid = &prsrc->valid.crc; + fb->rts[i].view = &rts[i]; - /* Preload if the RT is read or updated */ - if (!(batch->clear & mask) && - ((batch->read & mask) || - ((batch->draws & mask) && - BITSET_TEST(prsrc->valid.data, fb->rts[i].view->first_level)))) - fb->rts[i].preload = true; + /* Preload if the RT is read or updated */ + if (!(batch->clear & mask) && + ((batch->read & mask) || + ((batch->draws & mask) && + BITSET_TEST(prsrc->valid.data, fb->rts[i].view->first_level)))) + fb->rts[i].preload = true; + } - } + const struct pan_image_view *s_view = NULL, *z_view = NULL; + struct panfrost_resource *z_rsrc = NULL, *s_rsrc = NULL; - const struct pan_image_view *s_view = NULL, *z_view = NULL; - struct panfrost_resource *z_rsrc = NULL, *s_rsrc = NULL; + if (batch->key.zsbuf) { + struct pipe_surface *surf = batch->key.zsbuf; + z_rsrc = pan_resource(surf->texture); - if (batch->key.zsbuf) { - struct pipe_surface *surf = batch->key.zsbuf; - z_rsrc = pan_resource(surf->texture); + zs->format = surf->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT + ? PIPE_FORMAT_Z32_FLOAT + : surf->format; + zs->dim = MALI_TEXTURE_DIMENSION_2D; + zs->last_level = zs->first_level = surf->u.tex.level; + zs->first_layer = surf->u.tex.first_layer; + zs->last_layer = surf->u.tex.last_layer; + zs->image = &z_rsrc->image; + zs->nr_samples = surf->nr_samples ?: MAX2(surf->texture->nr_samples, 1); + memcpy(zs->swizzle, id_swz, sizeof(zs->swizzle)); + fb->zs.view.zs = zs; + z_view = zs; + if (util_format_is_depth_and_stencil(zs->format)) { + s_view = zs; + s_rsrc = z_rsrc; + } - zs->format = surf->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT ? 
- PIPE_FORMAT_Z32_FLOAT : surf->format; - zs->dim = MALI_TEXTURE_DIMENSION_2D; - zs->last_level = zs->first_level = surf->u.tex.level; - zs->first_layer = surf->u.tex.first_layer; - zs->last_layer = surf->u.tex.last_layer; - zs->image = &z_rsrc->image; - zs->nr_samples = surf->nr_samples ? : MAX2(surf->texture->nr_samples, 1); - memcpy(zs->swizzle, id_swz, sizeof(zs->swizzle)); - fb->zs.view.zs = zs; - z_view = zs; - if (util_format_is_depth_and_stencil(zs->format)) { - s_view = zs; - s_rsrc = z_rsrc; - } + if (z_rsrc->separate_stencil) { + s_rsrc = z_rsrc->separate_stencil; + s->format = PIPE_FORMAT_S8_UINT; + s->dim = MALI_TEXTURE_DIMENSION_2D; + s->last_level = s->first_level = surf->u.tex.level; + s->first_layer = surf->u.tex.first_layer; + s->last_layer = surf->u.tex.last_layer; + s->image = &s_rsrc->image; + s->nr_samples = surf->nr_samples ?: MAX2(surf->texture->nr_samples, 1); + memcpy(s->swizzle, id_swz, sizeof(s->swizzle)); + fb->zs.view.s = s; + s_view = s; + } + } - if (z_rsrc->separate_stencil) { - s_rsrc = z_rsrc->separate_stencil; - s->format = PIPE_FORMAT_S8_UINT; - s->dim = MALI_TEXTURE_DIMENSION_2D; - s->last_level = s->first_level = surf->u.tex.level; - s->first_layer = surf->u.tex.first_layer; - s->last_layer = surf->u.tex.last_layer; - s->image = &s_rsrc->image; - s->nr_samples = surf->nr_samples ? : MAX2(surf->texture->nr_samples, 1); - memcpy(s->swizzle, id_swz, sizeof(s->swizzle)); - fb->zs.view.s = s; - s_view = s; - } - } + if (batch->clear & PIPE_CLEAR_DEPTH) { + fb->zs.clear.z = true; + fb->zs.clear_value.depth = batch->clear_depth; + } - if (batch->clear & PIPE_CLEAR_DEPTH) { - fb->zs.clear.z = true; - fb->zs.clear_value.depth = batch->clear_depth; - } + if (batch->clear & PIPE_CLEAR_STENCIL) { + fb->zs.clear.s = true; + fb->zs.clear_value.stencil = batch->clear_stencil; + } - if (batch->clear & PIPE_CLEAR_STENCIL) { - fb->zs.clear.s = true; - fb->zs.clear_value.stencil = batch->clear_stencil; - } + fb->zs.discard.z = !reserve && !(batch->resolve & PIPE_CLEAR_DEPTH); + fb->zs.discard.s = !reserve && !(batch->resolve & PIPE_CLEAR_STENCIL); - fb->zs.discard.z = !reserve && !(batch->resolve & PIPE_CLEAR_DEPTH); - fb->zs.discard.s = !reserve && !(batch->resolve & PIPE_CLEAR_STENCIL); + if (!fb->zs.clear.z && z_rsrc && + ((batch->read & PIPE_CLEAR_DEPTH) || + ((batch->draws & PIPE_CLEAR_DEPTH) && + BITSET_TEST(z_rsrc->valid.data, z_view->first_level)))) + fb->zs.preload.z = true; - if (!fb->zs.clear.z && z_rsrc && - ((batch->read & PIPE_CLEAR_DEPTH) || - ((batch->draws & PIPE_CLEAR_DEPTH) && - BITSET_TEST(z_rsrc->valid.data, z_view->first_level)))) - fb->zs.preload.z = true; + if (!fb->zs.clear.s && s_rsrc && + ((batch->read & PIPE_CLEAR_STENCIL) || + ((batch->draws & PIPE_CLEAR_STENCIL) && + BITSET_TEST(s_rsrc->valid.data, s_view->first_level)))) + fb->zs.preload.s = true; - if (!fb->zs.clear.s && s_rsrc && - ((batch->read & PIPE_CLEAR_STENCIL) || - ((batch->draws & PIPE_CLEAR_STENCIL) && - BITSET_TEST(s_rsrc->valid.data, s_view->first_level)))) - fb->zs.preload.s = true; + /* Preserve both component if we have a combined ZS view and + * one component needs to be preserved. + */ + if (z_view && z_view == s_view && fb->zs.discard.z != fb->zs.discard.s) { + bool valid = BITSET_TEST(z_rsrc->valid.data, z_view->first_level); - /* Preserve both component if we have a combined ZS view and - * one component needs to be preserved. 
- */ - if (z_view && z_view == s_view && fb->zs.discard.z != fb->zs.discard.s) { - bool valid = BITSET_TEST(z_rsrc->valid.data, z_view->first_level); - - fb->zs.discard.z = false; - fb->zs.discard.s = false; - fb->zs.preload.z = !fb->zs.clear.z && valid; - fb->zs.preload.s = !fb->zs.clear.s && valid; - } + fb->zs.discard.z = false; + fb->zs.discard.s = false; + fb->zs.preload.z = !fb->zs.clear.z && valid; + fb->zs.preload.s = !fb->zs.clear.s && valid; + } } static int panfrost_batch_submit_ioctl(struct panfrost_batch *batch, - mali_ptr first_job_desc, - uint32_t reqs, - uint32_t in_sync, - uint32_t out_sync) + mali_ptr first_job_desc, uint32_t reqs, + uint32_t in_sync, uint32_t out_sync) { - struct panfrost_context *ctx = batch->ctx; - struct pipe_context *gallium = (struct pipe_context *) ctx; - struct panfrost_device *dev = pan_device(gallium->screen); - struct drm_panfrost_submit submit = {0,}; - uint32_t in_syncs[2]; - uint32_t *bo_handles; - int ret; + struct panfrost_context *ctx = batch->ctx; + struct pipe_context *gallium = (struct pipe_context *)ctx; + struct panfrost_device *dev = pan_device(gallium->screen); + struct drm_panfrost_submit submit = { + 0, + }; + uint32_t in_syncs[2]; + uint32_t *bo_handles; + int ret; - /* If we trace, we always need a syncobj, so make one of our own if we - * weren't given one to use. Remember that we did so, so we can free it - * after we're done but preventing double-frees if we were given a - * syncobj */ + /* If we trace, we always need a syncobj, so make one of our own if we + * weren't given one to use. Remember that we did so, so we can free it + * after we're done but preventing double-frees if we were given a + * syncobj */ - if (!out_sync && dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) - out_sync = ctx->syncobj; + if (!out_sync && dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) + out_sync = ctx->syncobj; - submit.out_sync = out_sync; - submit.jc = first_job_desc; - submit.requirements = reqs; + submit.out_sync = out_sync; + submit.jc = first_job_desc; + submit.requirements = reqs; - if (in_sync) - in_syncs[submit.in_sync_count++] = in_sync; + if (in_sync) + in_syncs[submit.in_sync_count++] = in_sync; - if (ctx->in_sync_fd >= 0) { - ret = drmSyncobjImportSyncFile(dev->fd, ctx->in_sync_obj, - ctx->in_sync_fd); - assert(!ret); + if (ctx->in_sync_fd >= 0) { + ret = + drmSyncobjImportSyncFile(dev->fd, ctx->in_sync_obj, ctx->in_sync_fd); + assert(!ret); - in_syncs[submit.in_sync_count++] = ctx->in_sync_obj; - close(ctx->in_sync_fd); - ctx->in_sync_fd = -1; - } + in_syncs[submit.in_sync_count++] = ctx->in_sync_obj; + close(ctx->in_sync_fd); + ctx->in_sync_fd = -1; + } - if (submit.in_sync_count) - submit.in_syncs = (uintptr_t)in_syncs; + if (submit.in_sync_count) + submit.in_syncs = (uintptr_t)in_syncs; - bo_handles = calloc(panfrost_pool_num_bos(&batch->pool) + - panfrost_pool_num_bos(&batch->invisible_pool) + - batch->num_bos + 2, - sizeof(*bo_handles)); - assert(bo_handles); + bo_handles = calloc(panfrost_pool_num_bos(&batch->pool) + + panfrost_pool_num_bos(&batch->invisible_pool) + + batch->num_bos + 2, + sizeof(*bo_handles)); + assert(bo_handles); - pan_bo_access *flags = util_dynarray_begin(&batch->bos); - unsigned end_bo = util_dynarray_num_elements(&batch->bos, pan_bo_access); + pan_bo_access *flags = util_dynarray_begin(&batch->bos); + unsigned end_bo = util_dynarray_num_elements(&batch->bos, pan_bo_access); - for (int i = 0; i < end_bo; ++i) { - if (!flags[i]) - continue; + for (int i = 0; i < end_bo; ++i) { + if (!flags[i]) + continue; 
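/* Editorial sketch (not part of the patch): the hunk above reflows the loop in
 * panfrost_batch_submit_ioctl() that walks batch->bos.  That util_dynarray is
 * indexed directly by GEM handle and holds pan_bo_access flags, so "this batch
 * uses the BO" is simply "its slot is non-zero", and the submit path compacts
 * those sparse slots into the dense handle list the kernel expects.  The
 * stand-alone C sketch below illustrates that compaction with simplified,
 * hypothetical types (access_flags, collect_bo_handles); it is an assumption
 * for illustration only, not the driver's actual helper.
 */
#include <stdint.h>
#include <stdlib.h>

typedef uint32_t access_flags; /* simplified stand-in for pan_bo_access */

/* Turn a sparse per-handle flag array into a dense handle list, suitable in
 * spirit for something like drm_panfrost_submit::bo_handles.  A zero entry
 * means the batch never touched that BO, so it is skipped.  Returns the
 * number of handles written; the caller owns and frees *out. */
static unsigned
collect_bo_handles(const access_flags *flags, unsigned count, uint32_t **out)
{
   uint32_t *handles = calloc(count, sizeof(*handles));
   unsigned n = 0;

   for (unsigned handle = 0; handle < count; ++handle) {
      if (!flags[handle])
         continue; /* unused slot: no access recorded for this handle */

      handles[n++] = handle; /* the array index itself is the GEM handle */
   }

   *out = handles;
   return n;
}
/* End of editorial sketch. */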
- assert(submit.bo_handle_count < batch->num_bos); - bo_handles[submit.bo_handle_count++] = i; + assert(submit.bo_handle_count < batch->num_bos); + bo_handles[submit.bo_handle_count++] = i; - /* Update the BO access flags so that panfrost_bo_wait() knows - * about all pending accesses. - * We only keep the READ/WRITE info since this is all the BO - * wait logic cares about. - * We also preserve existing flags as this batch might not - * be the first one to access the BO. - */ - struct panfrost_bo *bo = pan_lookup_bo(dev, i); + /* Update the BO access flags so that panfrost_bo_wait() knows + * about all pending accesses. + * We only keep the READ/WRITE info since this is all the BO + * wait logic cares about. + * We also preserve existing flags as this batch might not + * be the first one to access the BO. + */ + struct panfrost_bo *bo = pan_lookup_bo(dev, i); - bo->gpu_access |= flags[i] & (PAN_BO_ACCESS_RW); - } + bo->gpu_access |= flags[i] & (PAN_BO_ACCESS_RW); + } - panfrost_pool_get_bo_handles(&batch->pool, bo_handles + submit.bo_handle_count); - submit.bo_handle_count += panfrost_pool_num_bos(&batch->pool); - panfrost_pool_get_bo_handles(&batch->invisible_pool, bo_handles + submit.bo_handle_count); - submit.bo_handle_count += panfrost_pool_num_bos(&batch->invisible_pool); + panfrost_pool_get_bo_handles(&batch->pool, + bo_handles + submit.bo_handle_count); + submit.bo_handle_count += panfrost_pool_num_bos(&batch->pool); + panfrost_pool_get_bo_handles(&batch->invisible_pool, + bo_handles + submit.bo_handle_count); + submit.bo_handle_count += panfrost_pool_num_bos(&batch->invisible_pool); - /* Add the tiler heap to the list of accessed BOs if the batch has at - * least one tiler job. Tiler heap is written by tiler jobs and read - * by fragment jobs (the polygon list is coming from this heap). - */ - if (batch->scoreboard.first_tiler) - bo_handles[submit.bo_handle_count++] = dev->tiler_heap->gem_handle; + /* Add the tiler heap to the list of accessed BOs if the batch has at + * least one tiler job. Tiler heap is written by tiler jobs and read + * by fragment jobs (the polygon list is coming from this heap). 
+ */ + if (batch->scoreboard.first_tiler) + bo_handles[submit.bo_handle_count++] = dev->tiler_heap->gem_handle; - /* Always used on Bifrost, occassionally used on Midgard */ - bo_handles[submit.bo_handle_count++] = dev->sample_positions->gem_handle; + /* Always used on Bifrost, occassionally used on Midgard */ + bo_handles[submit.bo_handle_count++] = dev->sample_positions->gem_handle; - submit.bo_handles = (u64) (uintptr_t) bo_handles; - if (ctx->is_noop) - ret = 0; - else - ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_SUBMIT, &submit); - free(bo_handles); + submit.bo_handles = (u64)(uintptr_t)bo_handles; + if (ctx->is_noop) + ret = 0; + else + ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_SUBMIT, &submit); + free(bo_handles); - if (ret) - return errno; + if (ret) + return errno; - /* Trace the job if we're doing that */ - if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) { - /* Wait so we can get errors reported back */ - drmSyncobjWait(dev->fd, &out_sync, 1, - INT64_MAX, 0, NULL); + /* Trace the job if we're doing that */ + if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) { + /* Wait so we can get errors reported back */ + drmSyncobjWait(dev->fd, &out_sync, 1, INT64_MAX, 0, NULL); - if (dev->debug & PAN_DBG_TRACE) - pandecode_jc(submit.jc, dev->gpu_id); + if (dev->debug & PAN_DBG_TRACE) + pandecode_jc(submit.jc, dev->gpu_id); - if (dev->debug & PAN_DBG_DUMP) - pandecode_dump_mappings(); + if (dev->debug & PAN_DBG_DUMP) + pandecode_dump_mappings(); - /* Jobs won't be complete if blackhole rendering, that's ok */ - if (!ctx->is_noop && dev->debug & PAN_DBG_SYNC) - pandecode_abort_on_fault(submit.jc, dev->gpu_id); - } + /* Jobs won't be complete if blackhole rendering, that's ok */ + if (!ctx->is_noop && dev->debug & PAN_DBG_SYNC) + pandecode_abort_on_fault(submit.jc, dev->gpu_id); + } - return 0; + return 0; } static bool panfrost_has_fragment_job(struct panfrost_batch *batch) { - return batch->scoreboard.first_tiler || batch->clear; + return batch->scoreboard.first_tiler || batch->clear; } /* Submit both vertex/tiler and fragment jobs for a batch, possibly with an @@ -691,141 +689,137 @@ panfrost_has_fragment_job(struct panfrost_batch *batch) static int panfrost_batch_submit_jobs(struct panfrost_batch *batch, - const struct pan_fb_info *fb, - uint32_t in_sync, uint32_t out_sync) + const struct pan_fb_info *fb, uint32_t in_sync, + uint32_t out_sync) { - struct pipe_screen *pscreen = batch->ctx->base.screen; - struct panfrost_screen *screen = pan_screen(pscreen); - struct panfrost_device *dev = pan_device(pscreen); - bool has_draws = batch->scoreboard.first_job; - bool has_tiler = batch->scoreboard.first_tiler; - bool has_frag = panfrost_has_fragment_job(batch); - int ret = 0; + struct pipe_screen *pscreen = batch->ctx->base.screen; + struct panfrost_screen *screen = pan_screen(pscreen); + struct panfrost_device *dev = pan_device(pscreen); + bool has_draws = batch->scoreboard.first_job; + bool has_tiler = batch->scoreboard.first_tiler; + bool has_frag = panfrost_has_fragment_job(batch); + int ret = 0; - /* Take the submit lock to make sure no tiler jobs from other context - * are inserted between our tiler and fragment jobs, failing to do that - * might result in tiler heap corruption. - */ - if (has_tiler) - pthread_mutex_lock(&dev->submit_lock); + /* Take the submit lock to make sure no tiler jobs from other context + * are inserted between our tiler and fragment jobs, failing to do that + * might result in tiler heap corruption. 
+ */ + if (has_tiler) + pthread_mutex_lock(&dev->submit_lock); - if (has_draws) { - ret = panfrost_batch_submit_ioctl(batch, batch->scoreboard.first_job, - 0, in_sync, has_frag ? 0 : out_sync); + if (has_draws) { + ret = panfrost_batch_submit_ioctl(batch, batch->scoreboard.first_job, 0, + in_sync, has_frag ? 0 : out_sync); - if (ret) - goto done; - } + if (ret) + goto done; + } - if (has_frag) { - mali_ptr fragjob = screen->vtbl.emit_fragment_job(batch, fb); - ret = panfrost_batch_submit_ioctl(batch, fragjob, - PANFROST_JD_REQ_FS, 0, - out_sync); - if (ret) - goto done; - } + if (has_frag) { + mali_ptr fragjob = screen->vtbl.emit_fragment_job(batch, fb); + ret = panfrost_batch_submit_ioctl(batch, fragjob, PANFROST_JD_REQ_FS, 0, + out_sync); + if (ret) + goto done; + } done: - if (has_tiler) - pthread_mutex_unlock(&dev->submit_lock); + if (has_tiler) + pthread_mutex_unlock(&dev->submit_lock); - return ret; + return ret; } static void panfrost_emit_tile_map(struct panfrost_batch *batch, struct pan_fb_info *fb) { - if (batch->key.nr_cbufs < 1 || !batch->key.cbufs[0]) - return; + if (batch->key.nr_cbufs < 1 || !batch->key.cbufs[0]) + return; - struct pipe_surface *surf = batch->key.cbufs[0]; - struct panfrost_resource *pres = surf ? pan_resource(surf->texture) : NULL; + struct pipe_surface *surf = batch->key.cbufs[0]; + struct panfrost_resource *pres = surf ? pan_resource(surf->texture) : NULL; - if (pres && pres->damage.tile_map.enable) { - fb->tile_map.base = - pan_pool_upload_aligned(&batch->pool.base, - pres->damage.tile_map.data, - pres->damage.tile_map.size, - 64); - fb->tile_map.stride = pres->damage.tile_map.stride; - } + if (pres && pres->damage.tile_map.enable) { + fb->tile_map.base = + pan_pool_upload_aligned(&batch->pool.base, pres->damage.tile_map.data, + pres->damage.tile_map.size, 64); + fb->tile_map.stride = pres->damage.tile_map.stride; + } } static void panfrost_batch_submit(struct panfrost_context *ctx, struct panfrost_batch *batch) { - struct pipe_screen *pscreen = ctx->base.screen; - struct panfrost_screen *screen = pan_screen(pscreen); - int ret; + struct pipe_screen *pscreen = ctx->base.screen; + struct panfrost_screen *screen = pan_screen(pscreen); + int ret; - /* Nothing to do! */ - if (!batch->scoreboard.first_job && !batch->clear) - goto out; + /* Nothing to do! */ + if (!batch->scoreboard.first_job && !batch->clear) + goto out; - if (batch->key.zsbuf && panfrost_has_fragment_job(batch)) { - struct pipe_surface *surf = batch->key.zsbuf; - struct panfrost_resource *z_rsrc = pan_resource(surf->texture); + if (batch->key.zsbuf && panfrost_has_fragment_job(batch)) { + struct pipe_surface *surf = batch->key.zsbuf; + struct panfrost_resource *z_rsrc = pan_resource(surf->texture); - /* Shared depth/stencil resources are not supported, and would - * break this optimisation. */ - assert(!(z_rsrc->base.bind & PAN_BIND_SHARED_MASK)); + /* Shared depth/stencil resources are not supported, and would + * break this optimisation. 
*/ + assert(!(z_rsrc->base.bind & PAN_BIND_SHARED_MASK)); - if (batch->clear & PIPE_CLEAR_STENCIL) { - z_rsrc->stencil_value = batch->clear_stencil; - z_rsrc->constant_stencil = true; - } else if (z_rsrc->constant_stencil) { - batch->clear_stencil = z_rsrc->stencil_value; - batch->clear |= PIPE_CLEAR_STENCIL; - } + if (batch->clear & PIPE_CLEAR_STENCIL) { + z_rsrc->stencil_value = batch->clear_stencil; + z_rsrc->constant_stencil = true; + } else if (z_rsrc->constant_stencil) { + batch->clear_stencil = z_rsrc->stencil_value; + batch->clear |= PIPE_CLEAR_STENCIL; + } - if (batch->draws & PIPE_CLEAR_STENCIL) - z_rsrc->constant_stencil = false; - } + if (batch->draws & PIPE_CLEAR_STENCIL) + z_rsrc->constant_stencil = false; + } - struct pan_fb_info fb; - struct pan_image_view rts[8], zs, s; + struct pan_fb_info fb; + struct pan_image_view rts[8], zs, s; - panfrost_batch_to_fb_info(batch, &fb, rts, &zs, &s, false); + panfrost_batch_to_fb_info(batch, &fb, rts, &zs, &s, false); - screen->vtbl.preload(batch, &fb); - screen->vtbl.init_polygon_list(batch); + screen->vtbl.preload(batch, &fb); + screen->vtbl.init_polygon_list(batch); - /* Now that all draws are in, we can finally prepare the - * FBD for the batch (if there is one). */ + /* Now that all draws are in, we can finally prepare the + * FBD for the batch (if there is one). */ - screen->vtbl.emit_tls(batch); - panfrost_emit_tile_map(batch, &fb); + screen->vtbl.emit_tls(batch); + panfrost_emit_tile_map(batch, &fb); - if (batch->scoreboard.first_tiler || batch->clear) - screen->vtbl.emit_fbd(batch, &fb); + if (batch->scoreboard.first_tiler || batch->clear) + screen->vtbl.emit_fbd(batch, &fb); - ret = panfrost_batch_submit_jobs(batch, &fb, 0, ctx->syncobj); + ret = panfrost_batch_submit_jobs(batch, &fb, 0, ctx->syncobj); - if (ret) - fprintf(stderr, "panfrost_batch_submit failed: %d\n", ret); + if (ret) + fprintf(stderr, "panfrost_batch_submit failed: %d\n", ret); - /* We must reset the damage info of our render targets here even - * though a damage reset normally happens when the DRI layer swaps - * buffers. That's because there can be implicit flushes the GL - * app is not aware of, and those might impact the damage region: if - * part of the damaged portion is drawn during those implicit flushes, - * you have to reload those areas before next draws are pushed, and - * since the driver can't easily know what's been modified by the draws - * it flushed, the easiest solution is to reload everything. - */ - for (unsigned i = 0; i < batch->key.nr_cbufs; i++) { - if (!batch->key.cbufs[i]) - continue; + /* We must reset the damage info of our render targets here even + * though a damage reset normally happens when the DRI layer swaps + * buffers. That's because there can be implicit flushes the GL + * app is not aware of, and those might impact the damage region: if + * part of the damaged portion is drawn during those implicit flushes, + * you have to reload those areas before next draws are pushed, and + * since the driver can't easily know what's been modified by the draws + * it flushed, the easiest solution is to reload everything. 
+ */ + for (unsigned i = 0; i < batch->key.nr_cbufs; i++) { + if (!batch->key.cbufs[i]) + continue; - panfrost_resource_set_damage_region(ctx->base.screen, - batch->key.cbufs[i]->texture, - 0, NULL); - } + panfrost_resource_set_damage_region( + ctx->base.screen, batch->key.cbufs[i]->texture, 0, NULL); + } out: - panfrost_batch_cleanup(ctx, batch); + panfrost_batch_cleanup(ctx, batch); } /* Submit all batches */ @@ -833,30 +827,29 @@ out: void panfrost_flush_all_batches(struct panfrost_context *ctx, const char *reason) { - struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); - panfrost_batch_submit(ctx, batch); + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); + panfrost_batch_submit(ctx, batch); - for (unsigned i = 0; i < PAN_MAX_BATCHES; i++) { - if (ctx->batches.slots[i].seqnum) { - if (reason) - perf_debug_ctx(ctx, "Flushing everything due to: %s", reason); + for (unsigned i = 0; i < PAN_MAX_BATCHES; i++) { + if (ctx->batches.slots[i].seqnum) { + if (reason) + perf_debug_ctx(ctx, "Flushing everything due to: %s", reason); - panfrost_batch_submit(ctx, &ctx->batches.slots[i]); - } - } + panfrost_batch_submit(ctx, &ctx->batches.slots[i]); + } + } } void panfrost_flush_writer(struct panfrost_context *ctx, - struct panfrost_resource *rsrc, - const char *reason) + struct panfrost_resource *rsrc, const char *reason) { - struct hash_entry *entry = _mesa_hash_table_search(ctx->writers, rsrc); + struct hash_entry *entry = _mesa_hash_table_search(ctx->writers, rsrc); - if (entry) { - perf_debug_ctx(ctx, "Flushing writer due to: %s", reason); - panfrost_batch_submit(ctx, entry->data); - } + if (entry) { + perf_debug_ctx(ctx, "Flushing writer due to: %s", reason); + panfrost_batch_submit(ctx, entry->data); + } } void @@ -864,106 +857,103 @@ panfrost_flush_batches_accessing_rsrc(struct panfrost_context *ctx, struct panfrost_resource *rsrc, const char *reason) { - unsigned i; - foreach_batch(ctx, i) { - struct panfrost_batch *batch = &ctx->batches.slots[i]; + unsigned i; + foreach_batch(ctx, i) { + struct panfrost_batch *batch = &ctx->batches.slots[i]; - if (!panfrost_batch_uses_resource(batch, rsrc)) - continue; + if (!panfrost_batch_uses_resource(batch, rsrc)) + continue; - perf_debug_ctx(ctx, "Flushing user due to: %s", reason); - panfrost_batch_submit(ctx, batch); - } + perf_debug_ctx(ctx, "Flushing user due to: %s", reason); + panfrost_batch_submit(ctx, batch); + } } bool panfrost_any_batch_reads_rsrc(struct panfrost_context *ctx, struct panfrost_resource *rsrc) { - unsigned i; - foreach_batch(ctx, i) { - struct panfrost_batch *batch = &ctx->batches.slots[i]; + unsigned i; + foreach_batch(ctx, i) { + struct panfrost_batch *batch = &ctx->batches.slots[i]; - if (panfrost_batch_uses_resource(batch, rsrc)) - return true; - } + if (panfrost_batch_uses_resource(batch, rsrc)) + return true; + } - return false; + return false; } bool panfrost_any_batch_writes_rsrc(struct panfrost_context *ctx, struct panfrost_resource *rsrc) { - return _mesa_hash_table_search(ctx->writers, rsrc) != NULL; + return _mesa_hash_table_search(ctx->writers, rsrc) != NULL; } void panfrost_batch_adjust_stack_size(struct panfrost_batch *batch) { - struct panfrost_context *ctx = batch->ctx; + struct panfrost_context *ctx = batch->ctx; - for (unsigned i = 0; i < PIPE_SHADER_TYPES; ++i) { - struct panfrost_compiled_shader *ss = ctx->prog[i]; + for (unsigned i = 0; i < PIPE_SHADER_TYPES; ++i) { + struct panfrost_compiled_shader *ss = ctx->prog[i]; - if (!ss) - continue; + if (!ss) + continue; - 
batch->stack_size = MAX2(batch->stack_size, ss->info.tls_size); - } + batch->stack_size = MAX2(batch->stack_size, ss->info.tls_size); + } } void -panfrost_batch_clear(struct panfrost_batch *batch, - unsigned buffers, - const union pipe_color_union *color, - double depth, unsigned stencil) +panfrost_batch_clear(struct panfrost_batch *batch, unsigned buffers, + const union pipe_color_union *color, double depth, + unsigned stencil) { - struct panfrost_context *ctx = batch->ctx; + struct panfrost_context *ctx = batch->ctx; - if (buffers & PIPE_CLEAR_COLOR) { - for (unsigned i = 0; i < ctx->pipe_framebuffer.nr_cbufs; ++i) { - if (!(buffers & (PIPE_CLEAR_COLOR0 << i))) - continue; + if (buffers & PIPE_CLEAR_COLOR) { + for (unsigned i = 0; i < ctx->pipe_framebuffer.nr_cbufs; ++i) { + if (!(buffers & (PIPE_CLEAR_COLOR0 << i))) + continue; - enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format; - pan_pack_color(batch->clear_color[i], color, format, false); - } - } + enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format; + pan_pack_color(batch->clear_color[i], color, format, false); + } + } - if (buffers & PIPE_CLEAR_DEPTH) { - batch->clear_depth = depth; - } + if (buffers & PIPE_CLEAR_DEPTH) { + batch->clear_depth = depth; + } - if (buffers & PIPE_CLEAR_STENCIL) { - batch->clear_stencil = stencil; - } + if (buffers & PIPE_CLEAR_STENCIL) { + batch->clear_stencil = stencil; + } - batch->clear |= buffers; - batch->resolve |= buffers; + batch->clear |= buffers; + batch->resolve |= buffers; - /* Clearing affects the entire framebuffer (by definition -- this is - * the Gallium clear callback, which clears the whole framebuffer. If - * the scissor test were enabled from the GL side, the gallium frontend - * would emit a quad instead and we wouldn't go down this code path) */ + /* Clearing affects the entire framebuffer (by definition -- this is + * the Gallium clear callback, which clears the whole framebuffer. 
If + * the scissor test were enabled from the GL side, the gallium frontend + * would emit a quad instead and we wouldn't go down this code path) */ - panfrost_batch_union_scissor(batch, 0, 0, - ctx->pipe_framebuffer.width, - ctx->pipe_framebuffer.height); + panfrost_batch_union_scissor(batch, 0, 0, ctx->pipe_framebuffer.width, + ctx->pipe_framebuffer.height); } /* Given a new bounding rectangle (scissor), let the job cover the union of the * new and old bounding rectangles */ void -panfrost_batch_union_scissor(struct panfrost_batch *batch, - unsigned minx, unsigned miny, - unsigned maxx, unsigned maxy) +panfrost_batch_union_scissor(struct panfrost_batch *batch, unsigned minx, + unsigned miny, unsigned maxx, unsigned maxy) { - batch->minx = MIN2(batch->minx, minx); - batch->miny = MIN2(batch->miny, miny); - batch->maxx = MAX2(batch->maxx, maxx); - batch->maxy = MAX2(batch->maxy, maxy); + batch->minx = MIN2(batch->minx, minx); + batch->miny = MIN2(batch->miny, miny); + batch->maxx = MAX2(batch->maxx, maxx); + batch->maxy = MAX2(batch->maxy, maxy); } /** @@ -976,10 +966,9 @@ panfrost_batch_union_scissor(struct panfrost_batch *batch, bool panfrost_batch_skip_rasterization(struct panfrost_batch *batch) { - struct panfrost_context *ctx = batch->ctx; - struct pipe_rasterizer_state *rast = (void *) ctx->rasterizer; + struct panfrost_context *ctx = batch->ctx; + struct pipe_rasterizer_state *rast = (void *)ctx->rasterizer; - return (rast->rasterizer_discard || - batch->scissor_culls_everything || - !batch->rsd[PIPE_SHADER_VERTEX]); + return (rast->rasterizer_discard || batch->scissor_culls_everything || + !batch->rsd[PIPE_SHADER_VERTEX]); } diff --git a/src/gallium/drivers/panfrost/pan_job.h b/src/gallium/drivers/panfrost/pan_job.h index ed7d970a9b3..49953221f3d 100644 --- a/src/gallium/drivers/panfrost/pan_job.h +++ b/src/gallium/drivers/panfrost/pan_job.h @@ -26,8 +26,8 @@ #ifndef __PAN_JOB_H__ #define __PAN_JOB_H__ -#include "util/u_dynarray.h" #include "pipe/p_state.h" +#include "util/u_dynarray.h" #include "pan_cs.h" #include "pan_mempool.h" #include "pan_resource.h" @@ -39,11 +39,11 @@ * error. The getter needs to be used instead. */ struct pan_tristate { - enum { - PAN_TRISTATE_DONTCARE, - PAN_TRISTATE_FALSE, - PAN_TRISTATE_TRUE, - } v; + enum { + PAN_TRISTATE_DONTCARE, + PAN_TRISTATE_FALSE, + PAN_TRISTATE_TRUE, + } v; }; /* @@ -53,20 +53,20 @@ struct pan_tristate { static bool pan_tristate_set(struct pan_tristate *state, bool value) { - switch (state->v) { - case PAN_TRISTATE_DONTCARE: - state->v = value ? PAN_TRISTATE_TRUE : PAN_TRISTATE_FALSE; - return true; + switch (state->v) { + case PAN_TRISTATE_DONTCARE: + state->v = value ? PAN_TRISTATE_TRUE : PAN_TRISTATE_FALSE; + return true; - case PAN_TRISTATE_FALSE: - return (value == false); + case PAN_TRISTATE_FALSE: + return (value == false); - case PAN_TRISTATE_TRUE: - return (value == true); + case PAN_TRISTATE_TRUE: + return (value == true); - default: - unreachable("Invalid tristate value"); - } + default: + unreachable("Invalid tristate value"); + } } /* @@ -76,189 +76,179 @@ pan_tristate_set(struct pan_tristate *state, bool value) static bool pan_tristate_get(struct pan_tristate state) { - return (state.v == PAN_TRISTATE_TRUE); + return (state.v == PAN_TRISTATE_TRUE); } /* A panfrost_batch corresponds to a bound FBO we're rendering to, * collecting over multiple draws. 
*/ struct panfrost_batch { - struct panfrost_context *ctx; - struct pipe_framebuffer_state key; + struct panfrost_context *ctx; + struct pipe_framebuffer_state key; - /* Sequence number used to implement LRU eviction when all batch slots are used */ - uint64_t seqnum; + /* Sequence number used to implement LRU eviction when all batch slots are + * used */ + uint64_t seqnum; - /* Buffers cleared (PIPE_CLEAR_* bitmask) */ - unsigned clear; + /* Buffers cleared (PIPE_CLEAR_* bitmask) */ + unsigned clear; - /* Buffers drawn */ - unsigned draws; + /* Buffers drawn */ + unsigned draws; - /* Buffers read */ - unsigned read; + /* Buffers read */ + unsigned read; - /* Buffers needing resolve to memory */ - unsigned resolve; + /* Buffers needing resolve to memory */ + unsigned resolve; - /* Packed clear values, indexed by both render target as well as word. - * Essentially, a single pixel is packed, with some padding to bring it - * up to a 32-bit interval; that pixel is then duplicated over to fill - * all 16-bytes */ + /* Packed clear values, indexed by both render target as well as word. + * Essentially, a single pixel is packed, with some padding to bring it + * up to a 32-bit interval; that pixel is then duplicated over to fill + * all 16-bytes */ - uint32_t clear_color[PIPE_MAX_COLOR_BUFS][4]; - float clear_depth; - unsigned clear_stencil; + uint32_t clear_color[PIPE_MAX_COLOR_BUFS][4]; + float clear_depth; + unsigned clear_stencil; - /* Amount of thread local storage required per thread */ - unsigned stack_size; + /* Amount of thread local storage required per thread */ + unsigned stack_size; - /* Amount of shared memory needed per workgroup (for compute) */ - unsigned shared_size; + /* Amount of shared memory needed per workgroup (for compute) */ + unsigned shared_size; - /* The bounding box covered by this job, taking scissors into account. - * Basically, the bounding box we have to run fragment shaders for */ + /* The bounding box covered by this job, taking scissors into account. 
+ * Basically, the bounding box we have to run fragment shaders for */ - unsigned minx, miny; - unsigned maxx, maxy; + unsigned minx, miny; + unsigned maxx, maxy; - /* Acts as a rasterizer discard */ - bool scissor_culls_everything; + /* Acts as a rasterizer discard */ + bool scissor_culls_everything; - /* BOs referenced not in the pool */ - unsigned num_bos; - struct util_dynarray bos; + /* BOs referenced not in the pool */ + unsigned num_bos; + struct util_dynarray bos; - /* Pool owned by this batch (released when the batch is released) used for temporary descriptors */ - struct panfrost_pool pool; + /* Pool owned by this batch (released when the batch is released) used for + * temporary descriptors */ + struct panfrost_pool pool; - /* Pool also owned by this batch that is not CPU mapped (created as - * INVISIBLE) used for private GPU-internal structures, particularly - * varyings */ - struct panfrost_pool invisible_pool; + /* Pool also owned by this batch that is not CPU mapped (created as + * INVISIBLE) used for private GPU-internal structures, particularly + * varyings */ + struct panfrost_pool invisible_pool; - /* Job scoreboarding state */ - struct pan_scoreboard scoreboard; + /* Job scoreboarding state */ + struct pan_scoreboard scoreboard; - /* Polygon list bound to the batch, or NULL if none bound yet */ - struct panfrost_bo *polygon_list; + /* Polygon list bound to the batch, or NULL if none bound yet */ + struct panfrost_bo *polygon_list; - /* Scratchpad BO bound to the batch, or NULL if none bound yet */ - struct panfrost_bo *scratchpad; + /* Scratchpad BO bound to the batch, or NULL if none bound yet */ + struct panfrost_bo *scratchpad; - /* Shared memory BO bound to the batch, or NULL if none bound yet */ - struct panfrost_bo *shared_memory; + /* Shared memory BO bound to the batch, or NULL if none bound yet */ + struct panfrost_bo *shared_memory; - /* Framebuffer descriptor. */ - struct panfrost_ptr framebuffer; + /* Framebuffer descriptor. */ + struct panfrost_ptr framebuffer; - /* Thread local storage descriptor. */ - struct panfrost_ptr tls; + /* Thread local storage descriptor. */ + struct panfrost_ptr tls; - /* Tiler context */ - struct pan_tiler_context tiler_ctx; + /* Tiler context */ + struct pan_tiler_context tiler_ctx; - /* Keep the num_work_groups sysval around for indirect dispatch */ - mali_ptr num_wg_sysval[3]; + /* Keep the num_work_groups sysval around for indirect dispatch */ + mali_ptr num_wg_sysval[3]; - /* Cached descriptors */ - mali_ptr viewport; - mali_ptr rsd[PIPE_SHADER_TYPES]; - mali_ptr textures[PIPE_SHADER_TYPES]; - mali_ptr samplers[PIPE_SHADER_TYPES]; - mali_ptr attribs[PIPE_SHADER_TYPES]; - mali_ptr attrib_bufs[PIPE_SHADER_TYPES]; - mali_ptr uniform_buffers[PIPE_SHADER_TYPES]; - mali_ptr push_uniforms[PIPE_SHADER_TYPES]; - mali_ptr depth_stencil; - mali_ptr blend; + /* Cached descriptors */ + mali_ptr viewport; + mali_ptr rsd[PIPE_SHADER_TYPES]; + mali_ptr textures[PIPE_SHADER_TYPES]; + mali_ptr samplers[PIPE_SHADER_TYPES]; + mali_ptr attribs[PIPE_SHADER_TYPES]; + mali_ptr attrib_bufs[PIPE_SHADER_TYPES]; + mali_ptr uniform_buffers[PIPE_SHADER_TYPES]; + mali_ptr push_uniforms[PIPE_SHADER_TYPES]; + mali_ptr depth_stencil; + mali_ptr blend; - /* Valhall: struct mali_scissor_packed */ - unsigned scissor[2]; - float minimum_z, maximum_z; + /* Valhall: struct mali_scissor_packed */ + unsigned scissor[2]; + float minimum_z, maximum_z; - /* Used on Valhall only. Midgard includes attributes in-band with - * attributes, wildly enough. 
- */ - mali_ptr images[PIPE_SHADER_TYPES]; + /* Used on Valhall only. Midgard includes attributes in-band with + * attributes, wildly enough. + */ + mali_ptr images[PIPE_SHADER_TYPES]; - /* On Valhall, these are properties of the batch. On Bifrost, they are - * per draw. - */ - struct pan_tristate sprite_coord_origin; - struct pan_tristate first_provoking_vertex; + /* On Valhall, these are properties of the batch. On Bifrost, they are + * per draw. + */ + struct pan_tristate sprite_coord_origin; + struct pan_tristate first_provoking_vertex; }; /* Functions for managing the above */ -struct panfrost_batch * -panfrost_get_batch_for_fbo(struct panfrost_context *ctx); +struct panfrost_batch *panfrost_get_batch_for_fbo(struct panfrost_context *ctx); struct panfrost_batch * -panfrost_get_fresh_batch_for_fbo(struct panfrost_context *ctx, const char *reason); +panfrost_get_fresh_batch_for_fbo(struct panfrost_context *ctx, + const char *reason); -void -panfrost_batch_add_bo(struct panfrost_batch *batch, - struct panfrost_bo *bo, - enum pipe_shader_type stage); +void panfrost_batch_add_bo(struct panfrost_batch *batch, struct panfrost_bo *bo, + enum pipe_shader_type stage); -void -panfrost_batch_read_rsrc(struct panfrost_batch *batch, - struct panfrost_resource *rsrc, - enum pipe_shader_type stage); +void panfrost_batch_read_rsrc(struct panfrost_batch *batch, + struct panfrost_resource *rsrc, + enum pipe_shader_type stage); -void -panfrost_batch_write_rsrc(struct panfrost_batch *batch, - struct panfrost_resource *rsrc, - enum pipe_shader_type stage); +void panfrost_batch_write_rsrc(struct panfrost_batch *batch, + struct panfrost_resource *rsrc, + enum pipe_shader_type stage); -bool -panfrost_any_batch_reads_rsrc(struct panfrost_context *ctx, - struct panfrost_resource *rsrc); +bool panfrost_any_batch_reads_rsrc(struct panfrost_context *ctx, + struct panfrost_resource *rsrc); -bool -panfrost_any_batch_writes_rsrc(struct panfrost_context *ctx, - struct panfrost_resource *rsrc); +bool panfrost_any_batch_writes_rsrc(struct panfrost_context *ctx, + struct panfrost_resource *rsrc); + +struct panfrost_bo *panfrost_batch_create_bo(struct panfrost_batch *batch, + size_t size, uint32_t create_flags, + enum pipe_shader_type stage, + const char *label); + +void panfrost_flush_all_batches(struct panfrost_context *ctx, + const char *reason); + +void panfrost_flush_batches_accessing_rsrc(struct panfrost_context *ctx, + struct panfrost_resource *rsrc, + const char *reason); + +void panfrost_flush_writer(struct panfrost_context *ctx, + struct panfrost_resource *rsrc, const char *reason); + +void panfrost_batch_adjust_stack_size(struct panfrost_batch *batch); + +struct panfrost_bo *panfrost_batch_get_scratchpad(struct panfrost_batch *batch, + unsigned size, + unsigned thread_tls_alloc, + unsigned core_id_range); struct panfrost_bo * -panfrost_batch_create_bo(struct panfrost_batch *batch, size_t size, - uint32_t create_flags, enum pipe_shader_type stage, - const char *label); +panfrost_batch_get_shared_memory(struct panfrost_batch *batch, unsigned size, + unsigned workgroup_count); -void -panfrost_flush_all_batches(struct panfrost_context *ctx, const char *reason); +void panfrost_batch_clear(struct panfrost_batch *batch, unsigned buffers, + const union pipe_color_union *color, double depth, + unsigned stencil); -void -panfrost_flush_batches_accessing_rsrc(struct panfrost_context *ctx, - struct panfrost_resource *rsrc, - const char *reason); +void panfrost_batch_union_scissor(struct panfrost_batch *batch, unsigned minx, 
+ unsigned miny, unsigned maxx, unsigned maxy); -void -panfrost_flush_writer(struct panfrost_context *ctx, - struct panfrost_resource *rsrc, - const char *reason); - -void -panfrost_batch_adjust_stack_size(struct panfrost_batch *batch); - -struct panfrost_bo * -panfrost_batch_get_scratchpad(struct panfrost_batch *batch, unsigned size, unsigned thread_tls_alloc, unsigned core_id_range); - -struct panfrost_bo * -panfrost_batch_get_shared_memory(struct panfrost_batch *batch, unsigned size, unsigned workgroup_count); - -void -panfrost_batch_clear(struct panfrost_batch *batch, - unsigned buffers, - const union pipe_color_union *color, - double depth, unsigned stencil); - -void -panfrost_batch_union_scissor(struct panfrost_batch *batch, - unsigned minx, unsigned miny, - unsigned maxx, unsigned maxy); - -bool -panfrost_batch_skip_rasterization(struct panfrost_batch *batch); +bool panfrost_batch_skip_rasterization(struct panfrost_batch *batch); #endif diff --git a/src/gallium/drivers/panfrost/pan_mempool.c b/src/gallium/drivers/panfrost/pan_mempool.c index 1757e99b87c..89797cc3935 100644 --- a/src/gallium/drivers/panfrost/pan_mempool.c +++ b/src/gallium/drivers/panfrost/pan_mempool.c @@ -46,124 +46,124 @@ static struct panfrost_bo * panfrost_pool_alloc_backing(struct panfrost_pool *pool, size_t bo_sz) { - /* We don't know what the BO will be used for, so let's flag it - * RW and attach it to both the fragment and vertex/tiler jobs. - * TODO: if we want fine grained BO assignment we should pass - * flags to this function and keep the read/write, - * fragment/vertex+tiler pools separate. - */ - struct panfrost_bo *bo = panfrost_bo_create(pool->base.dev, bo_sz, - pool->base.create_flags, pool->base.label); + /* We don't know what the BO will be used for, so let's flag it + * RW and attach it to both the fragment and vertex/tiler jobs. + * TODO: if we want fine grained BO assignment we should pass + * flags to this function and keep the read/write, + * fragment/vertex+tiler pools separate. 
+ */ + struct panfrost_bo *bo = panfrost_bo_create( + pool->base.dev, bo_sz, pool->base.create_flags, pool->base.label); - if (pool->owned) - util_dynarray_append(&pool->bos, struct panfrost_bo *, bo); - else - panfrost_bo_unreference(pool->transient_bo); + if (pool->owned) + util_dynarray_append(&pool->bos, struct panfrost_bo *, bo); + else + panfrost_bo_unreference(pool->transient_bo); - pool->transient_bo = bo; - pool->transient_offset = 0; + pool->transient_bo = bo; + pool->transient_offset = 0; - return bo; + return bo; } void panfrost_pool_init(struct panfrost_pool *pool, void *memctx, - struct panfrost_device *dev, - unsigned create_flags, size_t slab_size, const char *label, - bool prealloc, bool owned) + struct panfrost_device *dev, unsigned create_flags, + size_t slab_size, const char *label, bool prealloc, + bool owned) { - memset(pool, 0, sizeof(*pool)); - pan_pool_init(&pool->base, dev, create_flags, slab_size, label); - pool->owned = owned; + memset(pool, 0, sizeof(*pool)); + pan_pool_init(&pool->base, dev, create_flags, slab_size, label); + pool->owned = owned; - if (owned) - util_dynarray_init(&pool->bos, memctx); + if (owned) + util_dynarray_init(&pool->bos, memctx); - if (prealloc) - panfrost_pool_alloc_backing(pool, pool->base.slab_size); + if (prealloc) + panfrost_pool_alloc_backing(pool, pool->base.slab_size); } void panfrost_pool_cleanup(struct panfrost_pool *pool) { - if (!pool->owned) { - panfrost_bo_unreference(pool->transient_bo); - return; - } + if (!pool->owned) { + panfrost_bo_unreference(pool->transient_bo); + return; + } - util_dynarray_foreach(&pool->bos, struct panfrost_bo *, bo) - panfrost_bo_unreference(*bo); + util_dynarray_foreach(&pool->bos, struct panfrost_bo *, bo) + panfrost_bo_unreference(*bo); - util_dynarray_fini(&pool->bos); + util_dynarray_fini(&pool->bos); } void panfrost_pool_get_bo_handles(struct panfrost_pool *pool, uint32_t *handles) { - assert(pool->owned && "pool does not track BOs in unowned mode"); + assert(pool->owned && "pool does not track BOs in unowned mode"); - unsigned idx = 0; - util_dynarray_foreach(&pool->bos, struct panfrost_bo *, bo) { - assert((*bo)->gem_handle > 0); - handles[idx++] = (*bo)->gem_handle; + unsigned idx = 0; + util_dynarray_foreach(&pool->bos, struct panfrost_bo *, bo) { + assert((*bo)->gem_handle > 0); + handles[idx++] = (*bo)->gem_handle; - /* Update the BO access flags so that panfrost_bo_wait() knows - * about all pending accesses. - * We only keep the READ/WRITE info since this is all the BO - * wait logic cares about. - * We also preserve existing flags as this batch might not - * be the first one to access the BO. - */ - (*bo)->gpu_access |= PAN_BO_ACCESS_RW; - } + /* Update the BO access flags so that panfrost_bo_wait() knows + * about all pending accesses. + * We only keep the READ/WRITE info since this is all the BO + * wait logic cares about. + * We also preserve existing flags as this batch might not + * be the first one to access the BO. 
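/* Minimal sketch of the bump sub-allocation scheme implemented by
 * panfrost_pool_alloc_aligned() further down: round the running offset up to
 * a power-of-two alignment and, when the request no longer fits in the
 * current slab, start a new page-aligned backing of at least slab_size
 * bytes.  malloc() stands in here for the BO creation the driver performs;
 * the offset arithmetic mirrors the code below. */
#include <stdint.h>
#include <stdlib.h>

struct bump_pool_sketch {
   uint8_t *base;    /* current backing slab (a mapped BO in the driver) */
   size_t offset;    /* bytes already handed out from it */
   size_t slab_size; /* default backing size */
};

static uint8_t *
bump_alloc_sketch(struct bump_pool_sketch *p, size_t sz, size_t alignment)
{
   /* Alignment must be a power of two, as asserted in the driver. */
   size_t offset = (p->offset + alignment - 1) & ~(alignment - 1);

   if (p->base == NULL || offset + sz >= p->slab_size) {
      size_t backing = sz > p->slab_size ? sz : p->slab_size;
      backing = (backing + 4095) & ~(size_t)4095;

      /* Older slabs stay alive with whoever still references them. */
      p->base = malloc(backing);
      offset = 0;
   }

   p->offset = offset + sz;
   return p->base + offset;
}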
+ */ + (*bo)->gpu_access |= PAN_BO_ACCESS_RW; + } } #define PAN_GUARD_SIZE 4096 static struct panfrost_ptr -panfrost_pool_alloc_aligned(struct panfrost_pool *pool, size_t sz, unsigned alignment) +panfrost_pool_alloc_aligned(struct panfrost_pool *pool, size_t sz, + unsigned alignment) { - assert(alignment == util_next_power_of_two(alignment)); + assert(alignment == util_next_power_of_two(alignment)); - /* Find or create a suitable BO */ - struct panfrost_bo *bo = pool->transient_bo; - unsigned offset = ALIGN_POT(pool->transient_offset, alignment); + /* Find or create a suitable BO */ + struct panfrost_bo *bo = pool->transient_bo; + unsigned offset = ALIGN_POT(pool->transient_offset, alignment); #ifdef PAN_DBG_OVERFLOW - if (unlikely(pool->base.dev->debug & PAN_DBG_OVERFLOW) && - !(pool->base.create_flags & PAN_BO_INVISIBLE)) { - unsigned aligned = ALIGN_POT(sz, sysconf(_SC_PAGESIZE)); - unsigned bo_size = aligned + PAN_GUARD_SIZE; + if (unlikely(pool->base.dev->debug & PAN_DBG_OVERFLOW) && + !(pool->base.create_flags & PAN_BO_INVISIBLE)) { + unsigned aligned = ALIGN_POT(sz, sysconf(_SC_PAGESIZE)); + unsigned bo_size = aligned + PAN_GUARD_SIZE; - bo = panfrost_pool_alloc_backing(pool, bo_size); - memset(bo->ptr.cpu, 0xbb, bo_size); + bo = panfrost_pool_alloc_backing(pool, bo_size); + memset(bo->ptr.cpu, 0xbb, bo_size); - /* Place the object as close as possible to the protected - * region at the end of the buffer while keeping alignment. */ - offset = ROUND_DOWN_TO(aligned - sz, alignment); + /* Place the object as close as possible to the protected + * region at the end of the buffer while keeping alignment. */ + offset = ROUND_DOWN_TO(aligned - sz, alignment); - if (mprotect(bo->ptr.cpu + aligned, - PAN_GUARD_SIZE, PROT_NONE) == -1) - perror("mprotect"); + if (mprotect(bo->ptr.cpu + aligned, PAN_GUARD_SIZE, PROT_NONE) == -1) + perror("mprotect"); - pool->transient_bo = NULL; - } + pool->transient_bo = NULL; + } #endif - /* If we don't fit, allocate a new backing */ - if (unlikely(bo == NULL || (offset + sz) >= pool->base.slab_size)) { - bo = panfrost_pool_alloc_backing(pool, - ALIGN_POT(MAX2(pool->base.slab_size, sz), 4096)); - offset = 0; - } + /* If we don't fit, allocate a new backing */ + if (unlikely(bo == NULL || (offset + sz) >= pool->base.slab_size)) { + bo = panfrost_pool_alloc_backing( + pool, ALIGN_POT(MAX2(pool->base.slab_size, sz), 4096)); + offset = 0; + } - pool->transient_offset = offset + sz; + pool->transient_offset = offset + sz; - struct panfrost_ptr ret = { - .cpu = bo->ptr.cpu + offset, - .gpu = bo->ptr.gpu + offset, - }; + struct panfrost_ptr ret = { + .cpu = bo->ptr.cpu + offset, + .gpu = bo->ptr.gpu + offset, + }; - return ret; + return ret; } PAN_POOL_ALLOCATOR(struct panfrost_pool, panfrost_pool_alloc_aligned) diff --git a/src/gallium/drivers/panfrost/pan_mempool.h b/src/gallium/drivers/panfrost/pan_mempool.h index 5b75a744515..e864176f373 100644 --- a/src/gallium/drivers/panfrost/pan_mempool.h +++ b/src/gallium/drivers/panfrost/pan_mempool.h @@ -31,37 +31,37 @@ be unowned for persistent uploads. */ struct panfrost_pool { - /* Inherit from pan_pool */ - struct pan_pool base; + /* Inherit from pan_pool */ + struct pan_pool base; - /* BOs allocated by this pool */ - struct util_dynarray bos; + /* BOs allocated by this pool */ + struct util_dynarray bos; - /* Current transient BO */ - struct panfrost_bo *transient_bo; + /* Current transient BO */ + struct panfrost_bo *transient_bo; - /* Within the topmost transient BO, how much has been used? 
*/ - unsigned transient_offset; + /* Within the topmost transient BO, how much has been used? */ + unsigned transient_offset; - /* Mode of the pool. BO management is in the pool for owned mode, but - * the consumed for unowned mode. */ - bool owned; + /* Mode of the pool. BO management is in the pool for owned mode, but + * the consumed for unowned mode. */ + bool owned; }; static inline struct panfrost_pool * to_panfrost_pool(struct pan_pool *pool) { - return container_of(pool, struct panfrost_pool, base); + return container_of(pool, struct panfrost_pool, base); } /* Reference to pool allocated memory for an unowned pool */ struct panfrost_pool_ref { - /* Owning BO */ - struct panfrost_bo *bo; + /* Owning BO */ + struct panfrost_bo *bo; - /* Mapped GPU VA */ - mali_ptr gpu; + /* Mapped GPU VA */ + mali_ptr gpu; }; /* Take a reference to an allocation pool. Call directly after allocating from @@ -70,32 +70,30 @@ struct panfrost_pool_ref { static inline struct panfrost_pool_ref panfrost_pool_take_ref(struct panfrost_pool *pool, mali_ptr ptr) { - if (!pool->owned) - panfrost_bo_reference(pool->transient_bo); + if (!pool->owned) + panfrost_bo_reference(pool->transient_bo); - return (struct panfrost_pool_ref) { - .bo = pool->transient_bo, - .gpu = ptr, - }; + return (struct panfrost_pool_ref){ + .bo = pool->transient_bo, + .gpu = ptr, + }; } -void -panfrost_pool_init(struct panfrost_pool *pool, void *memctx, - struct panfrost_device *dev, unsigned create_flags, - size_t slab_size, const char *label, bool prealloc, bool - owned); +void panfrost_pool_init(struct panfrost_pool *pool, void *memctx, + struct panfrost_device *dev, unsigned create_flags, + size_t slab_size, const char *label, bool prealloc, + bool owned); -void -panfrost_pool_cleanup(struct panfrost_pool *pool); +void panfrost_pool_cleanup(struct panfrost_pool *pool); static inline unsigned panfrost_pool_num_bos(struct panfrost_pool *pool) { - assert(pool->owned && "pool does not track BOs in unowned mode"); - return util_dynarray_num_elements(&pool->bos, struct panfrost_bo *); + assert(pool->owned && "pool does not track BOs in unowned mode"); + return util_dynarray_num_elements(&pool->bos, struct panfrost_bo *); } -void -panfrost_pool_get_bo_handles(struct panfrost_pool *pool, uint32_t *handles); +void panfrost_pool_get_bo_handles(struct panfrost_pool *pool, + uint32_t *handles); #endif diff --git a/src/gallium/drivers/panfrost/pan_public.h b/src/gallium/drivers/panfrost/pan_public.h index c7e72f94246..ed21ccdda60 100644 --- a/src/gallium/drivers/panfrost/pan_public.h +++ b/src/gallium/drivers/panfrost/pan_public.h @@ -31,8 +31,7 @@ extern "C" { struct pipe_screen; struct renderonly; -struct pipe_screen * -panfrost_create_screen(int fd, struct renderonly *ro); +struct pipe_screen *panfrost_create_screen(int fd, struct renderonly *ro); #ifdef __cplusplus } diff --git a/src/gallium/drivers/panfrost/pan_resource.c b/src/gallium/drivers/panfrost/pan_resource.c index 52d44fc62bf..6e87fc95e48 100644 --- a/src/gallium/drivers/panfrost/pan_resource.c +++ b/src/gallium/drivers/panfrost/pan_resource.c @@ -30,286 +30,279 @@ * */ -#include #include +#include #include "drm-uapi/drm_fourcc.h" #include "frontend/winsys_handle.h" #include "util/format/u_format.h" +#include "util/u_drm.h" +#include "util/u_gen_mipmap.h" #include "util/u_memory.h" #include "util/u_surface.h" #include "util/u_transfer.h" #include "util/u_transfer_helper.h" -#include "util/u_gen_mipmap.h" -#include "util/u_drm.h" +#include "decode.h" #include "pan_bo.h" #include 
"pan_context.h" -#include "pan_screen.h" #include "pan_resource.h" -#include "pan_util.h" +#include "pan_screen.h" #include "pan_tiling.h" -#include "decode.h" +#include "pan_util.h" static void panfrost_clear_depth_stencil(struct pipe_context *pipe, - struct pipe_surface *dst, - unsigned clear_flags, - double depth, - unsigned stencil, - unsigned dstx, unsigned dsty, - unsigned width, unsigned height, + struct pipe_surface *dst, unsigned clear_flags, + double depth, unsigned stencil, unsigned dstx, + unsigned dsty, unsigned width, unsigned height, bool render_condition_enabled) { - struct panfrost_context *ctx = pan_context(pipe); + struct panfrost_context *ctx = pan_context(pipe); - if (render_condition_enabled && - !panfrost_render_condition_check(ctx)) - return; + if (render_condition_enabled && !panfrost_render_condition_check(ctx)) + return; - panfrost_blitter_save(ctx, render_condition_enabled); - util_blitter_clear_depth_stencil(ctx->blitter, dst, - clear_flags, depth, stencil, - dstx, dsty, width, height); + panfrost_blitter_save(ctx, render_condition_enabled); + util_blitter_clear_depth_stencil(ctx->blitter, dst, clear_flags, depth, + stencil, dstx, dsty, width, height); } static void panfrost_clear_render_target(struct pipe_context *pipe, struct pipe_surface *dst, - const union pipe_color_union *color, - unsigned dstx, unsigned dsty, - unsigned width, unsigned height, + const union pipe_color_union *color, unsigned dstx, + unsigned dsty, unsigned width, unsigned height, bool render_condition_enabled) { - struct panfrost_context *ctx = pan_context(pipe); + struct panfrost_context *ctx = pan_context(pipe); - if (render_condition_enabled && - !panfrost_render_condition_check(ctx)) - return; + if (render_condition_enabled && !panfrost_render_condition_check(ctx)) + return; - panfrost_blitter_save(ctx, render_condition_enabled); - util_blitter_clear_render_target(ctx->blitter, dst, color, - dstx, dsty, width, height); + panfrost_blitter_save(ctx, render_condition_enabled); + util_blitter_clear_render_target(ctx->blitter, dst, color, dstx, dsty, width, + height); } static struct pipe_resource * panfrost_resource_from_handle(struct pipe_screen *pscreen, const struct pipe_resource *templat, - struct winsys_handle *whandle, - unsigned usage) + struct winsys_handle *whandle, unsigned usage) { - struct panfrost_device *dev = pan_device(pscreen); - struct panfrost_resource *rsc; - struct pipe_resource *prsc; + struct panfrost_device *dev = pan_device(pscreen); + struct panfrost_resource *rsc; + struct pipe_resource *prsc; - assert(whandle->type == WINSYS_HANDLE_TYPE_FD); + assert(whandle->type == WINSYS_HANDLE_TYPE_FD); - rsc = CALLOC_STRUCT(panfrost_resource); - if (!rsc) - return NULL; + rsc = CALLOC_STRUCT(panfrost_resource); + if (!rsc) + return NULL; - prsc = &rsc->base; + prsc = &rsc->base; - *prsc = *templat; + *prsc = *templat; - pipe_reference_init(&prsc->reference, 1); - prsc->screen = pscreen; + pipe_reference_init(&prsc->reference, 1); + prsc->screen = pscreen; - uint64_t mod = whandle->modifier == DRM_FORMAT_MOD_INVALID ? - DRM_FORMAT_MOD_LINEAR : whandle->modifier; - enum mali_texture_dimension dim = - panfrost_translate_texture_dimension(templat->target); - struct pan_image_explicit_layout explicit_layout = { - .offset = whandle->offset, - .row_stride = panfrost_from_legacy_stride(whandle->stride, templat->format, mod), - }; + uint64_t mod = whandle->modifier == DRM_FORMAT_MOD_INVALID + ? 
DRM_FORMAT_MOD_LINEAR + : whandle->modifier; + enum mali_texture_dimension dim = + panfrost_translate_texture_dimension(templat->target); + struct pan_image_explicit_layout explicit_layout = { + .offset = whandle->offset, + .row_stride = + panfrost_from_legacy_stride(whandle->stride, templat->format, mod), + }; - rsc->image.layout = (struct pan_image_layout) { - .modifier = mod, - .format = templat->format, - .dim = dim, - .width = prsc->width0, - .height = prsc->height0, - .depth = prsc->depth0, - .array_size = prsc->array_size, - .nr_samples = MAX2(prsc->nr_samples, 1), - .nr_slices = 1, - }; + rsc->image.layout = (struct pan_image_layout){ + .modifier = mod, + .format = templat->format, + .dim = dim, + .width = prsc->width0, + .height = prsc->height0, + .depth = prsc->depth0, + .array_size = prsc->array_size, + .nr_samples = MAX2(prsc->nr_samples, 1), + .nr_slices = 1, + }; - bool valid = pan_image_layout_init(&rsc->image.layout, &explicit_layout); + bool valid = pan_image_layout_init(&rsc->image.layout, &explicit_layout); - if (!valid) { - FREE(rsc); - return NULL; - } + if (!valid) { + FREE(rsc); + return NULL; + } - rsc->image.data.bo = panfrost_bo_import(dev, whandle->handle); - /* Sometimes an import can fail e.g. on an invalid buffer fd, out of - * memory space to mmap it etc. - */ - if (!rsc->image.data.bo) { - FREE(rsc); - return NULL; - } + rsc->image.data.bo = panfrost_bo_import(dev, whandle->handle); + /* Sometimes an import can fail e.g. on an invalid buffer fd, out of + * memory space to mmap it etc. + */ + if (!rsc->image.data.bo) { + FREE(rsc); + return NULL; + } - rsc->modifier_constant = true; + rsc->modifier_constant = true; - BITSET_SET(rsc->valid.data, 0); - panfrost_resource_set_damage_region(pscreen, &rsc->base, 0, NULL); + BITSET_SET(rsc->valid.data, 0); + panfrost_resource_set_damage_region(pscreen, &rsc->base, 0, NULL); - if (dev->ro) { - rsc->scanout = - renderonly_create_gpu_import_for_resource(prsc, dev->ro, NULL); - /* failure is expected in some cases.. */ - } + if (dev->ro) { + rsc->scanout = + renderonly_create_gpu_import_for_resource(prsc, dev->ro, NULL); + /* failure is expected in some cases.. */ + } - return prsc; + return prsc; } static bool panfrost_resource_get_handle(struct pipe_screen *pscreen, - struct pipe_context *ctx, - struct pipe_resource *pt, - struct winsys_handle *handle, - unsigned usage) + struct pipe_context *ctx, struct pipe_resource *pt, + struct winsys_handle *handle, unsigned usage) { - struct panfrost_device *dev = pan_device(pscreen); - struct panfrost_resource *rsrc; - struct renderonly_scanout *scanout; - struct pipe_resource *cur = pt; + struct panfrost_device *dev = pan_device(pscreen); + struct panfrost_resource *rsrc; + struct renderonly_scanout *scanout; + struct pipe_resource *cur = pt; - /* Even though panfrost doesn't support multi-planar formats, we - * can get here through GBM, which does. Walk the list of planes - * to find the right one. - */ - for (int i = 0; i < handle->plane; i++) { - cur = cur->next; - if (!cur) - return false; - } - rsrc = pan_resource(cur); - scanout = rsrc->scanout; + /* Even though panfrost doesn't support multi-planar formats, we + * can get here through GBM, which does. Walk the list of planes + * to find the right one. 
+ */ + for (int i = 0; i < handle->plane; i++) { + cur = cur->next; + if (!cur) + return false; + } + rsrc = pan_resource(cur); + scanout = rsrc->scanout; - handle->modifier = rsrc->image.layout.modifier; - rsrc->modifier_constant = true; + handle->modifier = rsrc->image.layout.modifier; + rsrc->modifier_constant = true; - if (handle->type == WINSYS_HANDLE_TYPE_KMS && dev->ro) { - return renderonly_get_handle(scanout, handle); - } else if (handle->type == WINSYS_HANDLE_TYPE_KMS) { - handle->handle = rsrc->image.data.bo->gem_handle; - } else if (handle->type == WINSYS_HANDLE_TYPE_FD) { - int fd = panfrost_bo_export(rsrc->image.data.bo); + if (handle->type == WINSYS_HANDLE_TYPE_KMS && dev->ro) { + return renderonly_get_handle(scanout, handle); + } else if (handle->type == WINSYS_HANDLE_TYPE_KMS) { + handle->handle = rsrc->image.data.bo->gem_handle; + } else if (handle->type == WINSYS_HANDLE_TYPE_FD) { + int fd = panfrost_bo_export(rsrc->image.data.bo); - if (fd < 0) - return false; + if (fd < 0) + return false; - handle->handle = fd; - } else { - /* Other handle types not supported */ - return false; - } + handle->handle = fd; + } else { + /* Other handle types not supported */ + return false; + } - handle->stride = panfrost_get_legacy_stride(&rsrc->image.layout, 0); - handle->offset = rsrc->image.layout.slices[0].offset; - return true; + handle->stride = panfrost_get_legacy_stride(&rsrc->image.layout, 0); + handle->offset = rsrc->image.layout.slices[0].offset; + return true; } static bool panfrost_resource_get_param(struct pipe_screen *pscreen, - struct pipe_context *pctx, struct pipe_resource *prsc, - unsigned plane, unsigned layer, unsigned level, - enum pipe_resource_param param, - unsigned usage, uint64_t *value) + struct pipe_context *pctx, + struct pipe_resource *prsc, unsigned plane, + unsigned layer, unsigned level, + enum pipe_resource_param param, unsigned usage, + uint64_t *value) { - struct panfrost_resource *rsrc = (struct panfrost_resource *) prsc; - struct pipe_resource *cur; - unsigned count; + struct panfrost_resource *rsrc = (struct panfrost_resource *)prsc; + struct pipe_resource *cur; + unsigned count; - switch (param) { - case PIPE_RESOURCE_PARAM_STRIDE: - *value = panfrost_get_legacy_stride(&rsrc->image.layout, level); - return true; - case PIPE_RESOURCE_PARAM_OFFSET: - *value = rsrc->image.layout.slices[level].offset; - return true; - case PIPE_RESOURCE_PARAM_MODIFIER: - *value = rsrc->image.layout.modifier; - return true; - case PIPE_RESOURCE_PARAM_NPLANES: - /* Panfrost doesn't directly support multi-planar formats, - * but we should still handle this case for gbm users - * that might want to use resources shared with panfrost - * on video processing hardware that does. - */ - for (count = 0, cur = prsc; cur; cur = cur->next) - count++; - *value = count; - return true; - default: - return false; - } + switch (param) { + case PIPE_RESOURCE_PARAM_STRIDE: + *value = panfrost_get_legacy_stride(&rsrc->image.layout, level); + return true; + case PIPE_RESOURCE_PARAM_OFFSET: + *value = rsrc->image.layout.slices[level].offset; + return true; + case PIPE_RESOURCE_PARAM_MODIFIER: + *value = rsrc->image.layout.modifier; + return true; + case PIPE_RESOURCE_PARAM_NPLANES: + /* Panfrost doesn't directly support multi-planar formats, + * but we should still handle this case for gbm users + * that might want to use resources shared with panfrost + * on video processing hardware that does. 
+ */ + for (count = 0, cur = prsc; cur; cur = cur->next) + count++; + *value = count; + return true; + default: + return false; + } } static void panfrost_flush_resource(struct pipe_context *pctx, struct pipe_resource *prsc) { - /* TODO */ + /* TODO */ } static struct pipe_surface * -panfrost_create_surface(struct pipe_context *pipe, - struct pipe_resource *pt, +panfrost_create_surface(struct pipe_context *pipe, struct pipe_resource *pt, const struct pipe_surface *surf_tmpl) { - struct panfrost_context *ctx = pan_context(pipe); - struct pipe_surface *ps = NULL; + struct panfrost_context *ctx = pan_context(pipe); + struct pipe_surface *ps = NULL; - pan_legalize_afbc_format(ctx, pan_resource(pt), surf_tmpl->format); + pan_legalize_afbc_format(ctx, pan_resource(pt), surf_tmpl->format); - ps = CALLOC_STRUCT(pipe_surface); + ps = CALLOC_STRUCT(pipe_surface); - if (ps) { - pipe_reference_init(&ps->reference, 1); - pipe_resource_reference(&ps->texture, pt); - ps->context = pipe; - ps->format = surf_tmpl->format; + if (ps) { + pipe_reference_init(&ps->reference, 1); + pipe_resource_reference(&ps->texture, pt); + ps->context = pipe; + ps->format = surf_tmpl->format; - if (pt->target != PIPE_BUFFER) { - assert(surf_tmpl->u.tex.level <= pt->last_level); - ps->width = u_minify(pt->width0, surf_tmpl->u.tex.level); - ps->height = u_minify(pt->height0, surf_tmpl->u.tex.level); - ps->nr_samples = surf_tmpl->nr_samples; - ps->u.tex.level = surf_tmpl->u.tex.level; - ps->u.tex.first_layer = surf_tmpl->u.tex.first_layer; - ps->u.tex.last_layer = surf_tmpl->u.tex.last_layer; - } else { - /* setting width as number of elements should get us correct renderbuffer width */ - ps->width = surf_tmpl->u.buf.last_element - surf_tmpl->u.buf.first_element + 1; - ps->height = pt->height0; - ps->u.buf.first_element = surf_tmpl->u.buf.first_element; - ps->u.buf.last_element = surf_tmpl->u.buf.last_element; - assert(ps->u.buf.first_element <= ps->u.buf.last_element); - assert(ps->u.buf.last_element < ps->width); - } - } + if (pt->target != PIPE_BUFFER) { + assert(surf_tmpl->u.tex.level <= pt->last_level); + ps->width = u_minify(pt->width0, surf_tmpl->u.tex.level); + ps->height = u_minify(pt->height0, surf_tmpl->u.tex.level); + ps->nr_samples = surf_tmpl->nr_samples; + ps->u.tex.level = surf_tmpl->u.tex.level; + ps->u.tex.first_layer = surf_tmpl->u.tex.first_layer; + ps->u.tex.last_layer = surf_tmpl->u.tex.last_layer; + } else { + /* setting width as number of elements should get us correct + * renderbuffer width */ + ps->width = + surf_tmpl->u.buf.last_element - surf_tmpl->u.buf.first_element + 1; + ps->height = pt->height0; + ps->u.buf.first_element = surf_tmpl->u.buf.first_element; + ps->u.buf.last_element = surf_tmpl->u.buf.last_element; + assert(ps->u.buf.first_element <= ps->u.buf.last_element); + assert(ps->u.buf.last_element < ps->width); + } + } - return ps; + return ps; } static void -panfrost_surface_destroy(struct pipe_context *pipe, - struct pipe_surface *surf) +panfrost_surface_destroy(struct pipe_context *pipe, struct pipe_surface *surf) { - assert(surf->texture); - pipe_resource_reference(&surf->texture, NULL); - free(surf); + assert(surf->texture); + pipe_resource_reference(&surf->texture, NULL); + free(surf); } static inline bool panfrost_is_2d(const struct panfrost_resource *pres) { - return (pres->base.target == PIPE_TEXTURE_2D) - || (pres->base.target == PIPE_TEXTURE_RECT); + return (pres->base.target == PIPE_TEXTURE_2D) || + (pres->base.target == PIPE_TEXTURE_RECT); } /* Based on the usage, determine if 
it makes sense to use u-inteleaved tiling. @@ -321,67 +314,62 @@ panfrost_is_2d(const struct panfrost_resource *pres) static bool panfrost_should_afbc(struct panfrost_device *dev, - const struct panfrost_resource *pres, - enum pipe_format fmt) + const struct panfrost_resource *pres, enum pipe_format fmt) { - /* AFBC resources may be rendered to, textured from, or shared across - * processes, but may not be used as e.g buffers */ - const unsigned valid_binding = - PIPE_BIND_DEPTH_STENCIL | - PIPE_BIND_RENDER_TARGET | - PIPE_BIND_BLENDABLE | - PIPE_BIND_SAMPLER_VIEW | - PIPE_BIND_DISPLAY_TARGET | - PIPE_BIND_SCANOUT | - PIPE_BIND_SHARED; + /* AFBC resources may be rendered to, textured from, or shared across + * processes, but may not be used as e.g buffers */ + const unsigned valid_binding = + PIPE_BIND_DEPTH_STENCIL | PIPE_BIND_RENDER_TARGET | PIPE_BIND_BLENDABLE | + PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | + PIPE_BIND_SHARED; - if (pres->base.bind & ~valid_binding) - return false; + if (pres->base.bind & ~valid_binding) + return false; - /* AFBC support is optional */ - if (!dev->has_afbc) - return false; + /* AFBC support is optional */ + if (!dev->has_afbc) + return false; - /* AFBC<-->staging is expensive */ - if (pres->base.usage == PIPE_USAGE_STREAM) - return false; + /* AFBC<-->staging is expensive */ + if (pres->base.usage == PIPE_USAGE_STREAM) + return false; - /* Only a small selection of formats are AFBC'able */ - if (!panfrost_format_supports_afbc(dev, fmt)) - return false; + /* Only a small selection of formats are AFBC'able */ + if (!panfrost_format_supports_afbc(dev, fmt)) + return false; - /* AFBC does not support layered (GLES3 style) multisampling. Use - * EXT_multisampled_render_to_texture instead */ - if (pres->base.nr_samples > 1) - return false; + /* AFBC does not support layered (GLES3 style) multisampling. Use + * EXT_multisampled_render_to_texture instead */ + if (pres->base.nr_samples > 1) + return false; - switch (pres->base.target) { - case PIPE_TEXTURE_2D: - case PIPE_TEXTURE_RECT: - case PIPE_TEXTURE_2D_ARRAY: - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_CUBE_ARRAY: - break; + switch (pres->base.target) { + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_2D_ARRAY: + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + break; - case PIPE_TEXTURE_3D: - /* 3D AFBC is only supported on Bifrost v7+. It's supposed to - * be supported on Midgard but it doesn't seem to work */ - if (dev->arch != 7) - return false; + case PIPE_TEXTURE_3D: + /* 3D AFBC is only supported on Bifrost v7+. 
It's supposed to + * be supported on Midgard but it doesn't seem to work */ + if (dev->arch != 7) + return false; - break; + break; - default: - return false; - } + default: + return false; + } - /* For one tile, AFBC is a loss compared to u-interleaved */ - if (pres->base.width0 <= 16 && pres->base.height0 <= 16) - return false; + /* For one tile, AFBC is a loss compared to u-interleaved */ + if (pres->base.width0 <= 16 && pres->base.height0 <= 16) + return false; - /* Otherwise, we'd prefer AFBC as it is dramatically more efficient - * than linear or usually even u-interleaved */ - return true; + /* Otherwise, we'd prefer AFBC as it is dramatically more efficient + * than linear or usually even u-interleaved */ + return true; } /* @@ -393,37 +381,31 @@ static bool panfrost_should_tile_afbc(const struct panfrost_device *dev, const struct panfrost_resource *pres) { - return panfrost_afbc_can_tile(dev) && - pres->base.width0 >= 128 && - pres->base.height0 >= 128; + return panfrost_afbc_can_tile(dev) && pres->base.width0 >= 128 && + pres->base.height0 >= 128; } static bool panfrost_should_tile(struct panfrost_device *dev, - const struct panfrost_resource *pres, - enum pipe_format fmt) + const struct panfrost_resource *pres, enum pipe_format fmt) { - const unsigned valid_binding = - PIPE_BIND_DEPTH_STENCIL | - PIPE_BIND_RENDER_TARGET | - PIPE_BIND_BLENDABLE | - PIPE_BIND_SAMPLER_VIEW | - PIPE_BIND_DISPLAY_TARGET | - PIPE_BIND_SCANOUT | - PIPE_BIND_SHARED; + const unsigned valid_binding = + PIPE_BIND_DEPTH_STENCIL | PIPE_BIND_RENDER_TARGET | PIPE_BIND_BLENDABLE | + PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | + PIPE_BIND_SHARED; - /* The purpose of tiling is improving locality in both X- and - * Y-directions. If there is only a single pixel in either direction, - * tiling does not make sense; using a linear layout instead is optimal - * for both memory usage and performance. - */ - if (MIN2(pres->base.width0, pres->base.height0) < 2) - return false; + /* The purpose of tiling is improving locality in both X- and + * Y-directions. If there is only a single pixel in either direction, + * tiling does not make sense; using a linear layout instead is optimal + * for both memory usage and performance. 
+ */ + if (MIN2(pres->base.width0, pres->base.height0) < 2) + return false; - bool can_tile = (pres->base.target != PIPE_BUFFER) - && ((pres->base.bind & ~valid_binding) == 0); + bool can_tile = (pres->base.target != PIPE_BUFFER) && + ((pres->base.bind & ~valid_binding) == 0); - return can_tile && (pres->base.usage != PIPE_USAGE_STREAM); + return can_tile && (pres->base.usage != PIPE_USAGE_STREAM); } static uint64_t @@ -431,111 +413,107 @@ panfrost_best_modifier(struct panfrost_device *dev, const struct panfrost_resource *pres, enum pipe_format fmt) { - /* Force linear textures when debugging tiling/compression */ - if (unlikely(dev->debug & PAN_DBG_LINEAR)) - return DRM_FORMAT_MOD_LINEAR; + /* Force linear textures when debugging tiling/compression */ + if (unlikely(dev->debug & PAN_DBG_LINEAR)) + return DRM_FORMAT_MOD_LINEAR; - if (panfrost_should_afbc(dev, pres, fmt)) { - uint64_t afbc = - AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | - AFBC_FORMAT_MOD_SPARSE; + if (panfrost_should_afbc(dev, pres, fmt)) { + uint64_t afbc = AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | AFBC_FORMAT_MOD_SPARSE; - if (panfrost_afbc_can_ytr(pres->base.format)) - afbc |= AFBC_FORMAT_MOD_YTR; + if (panfrost_afbc_can_ytr(pres->base.format)) + afbc |= AFBC_FORMAT_MOD_YTR; - if (panfrost_should_tile_afbc(dev, pres)) - afbc |= AFBC_FORMAT_MOD_TILED | AFBC_FORMAT_MOD_SC; + if (panfrost_should_tile_afbc(dev, pres)) + afbc |= AFBC_FORMAT_MOD_TILED | AFBC_FORMAT_MOD_SC; - return DRM_FORMAT_MOD_ARM_AFBC(afbc); - } else if (panfrost_should_tile(dev, pres, fmt)) - return DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED; - else - return DRM_FORMAT_MOD_LINEAR; + return DRM_FORMAT_MOD_ARM_AFBC(afbc); + } else if (panfrost_should_tile(dev, pres, fmt)) + return DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED; + else + return DRM_FORMAT_MOD_LINEAR; } static bool -panfrost_should_checksum(const struct panfrost_device *dev, const struct panfrost_resource *pres) +panfrost_should_checksum(const struct panfrost_device *dev, + const struct panfrost_resource *pres) { - /* When checksumming is enabled, the tile data must fit in the - * size of the writeback buffer, so don't checksum formats - * that use too much space. */ + /* When checksumming is enabled, the tile data must fit in the + * size of the writeback buffer, so don't checksum formats + * that use too much space. */ - unsigned bytes_per_pixel_max = (dev->arch == 6) ? 6 : 4; + unsigned bytes_per_pixel_max = (dev->arch == 6) ? 6 : 4; - unsigned bytes_per_pixel = MAX2(pres->base.nr_samples, 1) * - util_format_get_blocksize(pres->base.format); + unsigned bytes_per_pixel = MAX2(pres->base.nr_samples, 1) * + util_format_get_blocksize(pres->base.format); - return pres->base.bind & PIPE_BIND_RENDER_TARGET && - panfrost_is_2d(pres) && - bytes_per_pixel <= bytes_per_pixel_max && - pres->base.last_level == 0 && - !(dev->debug & PAN_DBG_NO_CRC); + return pres->base.bind & PIPE_BIND_RENDER_TARGET && panfrost_is_2d(pres) && + bytes_per_pixel <= bytes_per_pixel_max && + pres->base.last_level == 0 && !(dev->debug & PAN_DBG_NO_CRC); } static void panfrost_resource_setup(struct panfrost_device *dev, - struct panfrost_resource *pres, - uint64_t modifier, enum pipe_format fmt) + struct panfrost_resource *pres, uint64_t modifier, + enum pipe_format fmt) { - uint64_t chosen_mod = modifier != DRM_FORMAT_MOD_INVALID ? - modifier : panfrost_best_modifier(dev, pres, fmt); - enum mali_texture_dimension dim = - panfrost_translate_texture_dimension(pres->base.target); + uint64_t chosen_mod = modifier != DRM_FORMAT_MOD_INVALID + ? 
modifier + : panfrost_best_modifier(dev, pres, fmt); + enum mali_texture_dimension dim = + panfrost_translate_texture_dimension(pres->base.target); - /* We can only switch tiled->linear if the resource isn't already - * linear and if we control the modifier */ - pres->modifier_constant = - !(chosen_mod != DRM_FORMAT_MOD_LINEAR && - modifier == DRM_FORMAT_MOD_INVALID); + /* We can only switch tiled->linear if the resource isn't already + * linear and if we control the modifier */ + pres->modifier_constant = !(chosen_mod != DRM_FORMAT_MOD_LINEAR && + modifier == DRM_FORMAT_MOD_INVALID); - /* Z32_S8X24 variants are actually stored in 2 planes (one per - * component), we have to adjust the format on the first plane. - */ - if (fmt == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) - fmt = PIPE_FORMAT_Z32_FLOAT; + /* Z32_S8X24 variants are actually stored in 2 planes (one per + * component), we have to adjust the format on the first plane. + */ + if (fmt == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) + fmt = PIPE_FORMAT_Z32_FLOAT; - pres->image.layout = (struct pan_image_layout) { - .modifier = chosen_mod, - .format = fmt, - .dim = dim, - .width = pres->base.width0, - .height = pres->base.height0, - .depth = pres->base.depth0, - .array_size = pres->base.array_size, - .nr_samples = MAX2(pres->base.nr_samples, 1), - .nr_slices = pres->base.last_level + 1, - .crc = panfrost_should_checksum(dev, pres), - }; + pres->image.layout = (struct pan_image_layout){ + .modifier = chosen_mod, + .format = fmt, + .dim = dim, + .width = pres->base.width0, + .height = pres->base.height0, + .depth = pres->base.depth0, + .array_size = pres->base.array_size, + .nr_samples = MAX2(pres->base.nr_samples, 1), + .nr_slices = pres->base.last_level + 1, + .crc = panfrost_should_checksum(dev, pres), + }; - ASSERTED bool valid = pan_image_layout_init(&pres->image.layout, NULL); - assert(valid); + ASSERTED bool valid = pan_image_layout_init(&pres->image.layout, NULL); + assert(valid); } static void panfrost_resource_init_afbc_headers(struct panfrost_resource *pres) { - panfrost_bo_mmap(pres->image.data.bo); + panfrost_bo_mmap(pres->image.data.bo); - unsigned nr_samples = MAX2(pres->base.nr_samples, 1); + unsigned nr_samples = MAX2(pres->base.nr_samples, 1); - for (unsigned i = 0; i < pres->base.array_size; ++i) { - for (unsigned l = 0; l <= pres->base.last_level; ++l) { - struct pan_image_slice_layout *slice = &pres->image.layout.slices[l]; + for (unsigned i = 0; i < pres->base.array_size; ++i) { + for (unsigned l = 0; l <= pres->base.last_level; ++l) { + struct pan_image_slice_layout *slice = &pres->image.layout.slices[l]; - for (unsigned s = 0; s < nr_samples; ++s) { - void *ptr = pres->image.data.bo->ptr.cpu + - (i * pres->image.layout.array_stride) + - slice->offset + - (s * slice->afbc.surface_stride); + for (unsigned s = 0; s < nr_samples; ++s) { + void *ptr = pres->image.data.bo->ptr.cpu + + (i * pres->image.layout.array_stride) + slice->offset + + (s * slice->afbc.surface_stride); - /* Zero-ed AFBC headers seem to encode a plain - * black. Let's use this pattern to keep the - * initialization simple. - */ - memset(ptr, 0, slice->afbc.header_size); - } - } - } + /* Zero-ed AFBC headers seem to encode a plain + * black. Let's use this pattern to keep the + * initialization simple. 
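/* Condensed sketch of the modifier selection performed by
 * panfrost_best_modifier() above, using the AFBC flag bits from
 * drm_fourcc.h: start from a 16x16 sparse superblock layout and OR in the
 * optional features.  The boolean parameters are assumptions standing in for
 * the panfrost_should_afbc()/panfrost_afbc_can_ytr()/panfrost_should_tile()
 * checks shown above. */
#include <stdbool.h>
#include <stdint.h>
#include <drm_fourcc.h> /* assumed install location of the modifier macros */

static uint64_t
pick_modifier_sketch(bool use_afbc, bool can_ytr, bool tile_afbc,
                     bool use_u_interleaved)
{
   if (use_afbc) {
      uint64_t afbc = AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | AFBC_FORMAT_MOD_SPARSE;

      if (can_ytr) /* lossless color transform for RGB formats */
         afbc |= AFBC_FORMAT_MOD_YTR;

      if (tile_afbc) /* tiled headers plus solid-color block optimization */
         afbc |= AFBC_FORMAT_MOD_TILED | AFBC_FORMAT_MOD_SC;

      return DRM_FORMAT_MOD_ARM_AFBC(afbc);
   }

   return use_u_interleaved ? DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED
                            : DRM_FORMAT_MOD_LINEAR;
}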
+ */ + memset(ptr, 0, slice->afbc.header_size); + } + } + } } void @@ -544,224 +522,219 @@ panfrost_resource_set_damage_region(struct pipe_screen *screen, unsigned int nrects, const struct pipe_box *rects) { - struct panfrost_device *dev = pan_device(screen); - struct panfrost_resource *pres = pan_resource(res); - struct pipe_scissor_state *damage_extent = &pres->damage.extent; - unsigned int i; + struct panfrost_device *dev = pan_device(screen); + struct panfrost_resource *pres = pan_resource(res); + struct pipe_scissor_state *damage_extent = &pres->damage.extent; + unsigned int i; - /* Partial updates are implemented with a tile enable map only on v5. - * Later architectures have a more efficient method of implementing - * partial updates (frame shaders), while earlier architectures lack - * tile enable maps altogether. - */ - if (dev->arch == 5 && nrects > 1) { - if (!pres->damage.tile_map.data) { - pres->damage.tile_map.stride = - ALIGN_POT(DIV_ROUND_UP(res->width0, 32 * 8), 64); - pres->damage.tile_map.size = - pres->damage.tile_map.stride * - DIV_ROUND_UP(res->height0, 32); - pres->damage.tile_map.data = - malloc(pres->damage.tile_map.size); - } + /* Partial updates are implemented with a tile enable map only on v5. + * Later architectures have a more efficient method of implementing + * partial updates (frame shaders), while earlier architectures lack + * tile enable maps altogether. + */ + if (dev->arch == 5 && nrects > 1) { + if (!pres->damage.tile_map.data) { + pres->damage.tile_map.stride = + ALIGN_POT(DIV_ROUND_UP(res->width0, 32 * 8), 64); + pres->damage.tile_map.size = + pres->damage.tile_map.stride * DIV_ROUND_UP(res->height0, 32); + pres->damage.tile_map.data = malloc(pres->damage.tile_map.size); + } - memset(pres->damage.tile_map.data, 0, pres->damage.tile_map.size); - pres->damage.tile_map.enable = true; - } else { - pres->damage.tile_map.enable = false; - } + memset(pres->damage.tile_map.data, 0, pres->damage.tile_map.size); + pres->damage.tile_map.enable = true; + } else { + pres->damage.tile_map.enable = false; + } - /* Track the damage extent: the quad including all damage regions. Will - * be used restrict the rendering area */ + /* Track the damage extent: the quad including all damage regions. 
Will + * be used restrict the rendering area */ - damage_extent->minx = 0xffff; - damage_extent->miny = 0xffff; + damage_extent->minx = 0xffff; + damage_extent->miny = 0xffff; - unsigned enable_count = 0; + unsigned enable_count = 0; - for (i = 0; i < nrects; i++) { - int x = rects[i].x, w = rects[i].width, h = rects[i].height; - int y = res->height0 - (rects[i].y + h); + for (i = 0; i < nrects; i++) { + int x = rects[i].x, w = rects[i].width, h = rects[i].height; + int y = res->height0 - (rects[i].y + h); - damage_extent->minx = MIN2(damage_extent->minx, x); - damage_extent->miny = MIN2(damage_extent->miny, y); - damage_extent->maxx = MAX2(damage_extent->maxx, - MIN2(x + w, res->width0)); - damage_extent->maxy = MAX2(damage_extent->maxy, - MIN2(y + h, res->height0)); + damage_extent->minx = MIN2(damage_extent->minx, x); + damage_extent->miny = MIN2(damage_extent->miny, y); + damage_extent->maxx = MAX2(damage_extent->maxx, MIN2(x + w, res->width0)); + damage_extent->maxy = + MAX2(damage_extent->maxy, MIN2(y + h, res->height0)); - if (!pres->damage.tile_map.enable) - continue; + if (!pres->damage.tile_map.enable) + continue; - unsigned t_x_start = x / 32; - unsigned t_x_end = (x + w - 1) / 32; - unsigned t_y_start = y / 32; - unsigned t_y_end = (y + h - 1) / 32; + unsigned t_x_start = x / 32; + unsigned t_x_end = (x + w - 1) / 32; + unsigned t_y_start = y / 32; + unsigned t_y_end = (y + h - 1) / 32; - for (unsigned t_y = t_y_start; t_y <= t_y_end; t_y++) { - for (unsigned t_x = t_x_start; t_x <= t_x_end; t_x++) { - unsigned b = (t_y * pres->damage.tile_map.stride * 8) + t_x; + for (unsigned t_y = t_y_start; t_y <= t_y_end; t_y++) { + for (unsigned t_x = t_x_start; t_x <= t_x_end; t_x++) { + unsigned b = (t_y * pres->damage.tile_map.stride * 8) + t_x; - if (BITSET_TEST(pres->damage.tile_map.data, b)) - continue; + if (BITSET_TEST(pres->damage.tile_map.data, b)) + continue; - BITSET_SET(pres->damage.tile_map.data, b); - enable_count++; - } - } - } + BITSET_SET(pres->damage.tile_map.data, b); + enable_count++; + } + } + } - if (nrects == 0) { - damage_extent->minx = 0; - damage_extent->miny = 0; - damage_extent->maxx = res->width0; - damage_extent->maxy = res->height0; - } + if (nrects == 0) { + damage_extent->minx = 0; + damage_extent->miny = 0; + damage_extent->maxx = res->width0; + damage_extent->maxy = res->height0; + } - if (pres->damage.tile_map.enable) { - unsigned t_x_start = damage_extent->minx / 32; - unsigned t_x_end = damage_extent->maxx / 32; - unsigned t_y_start = damage_extent->miny / 32; - unsigned t_y_end = damage_extent->maxy / 32; - unsigned tile_count = (t_x_end - t_x_start + 1) * - (t_y_end - t_y_start + 1); - - /* Don't bother passing a tile-enable-map if the amount of - * tiles to reload is to close to the total number of tiles. - */ - if (tile_count - enable_count < 10) - pres->damage.tile_map.enable = false; - } + if (pres->damage.tile_map.enable) { + unsigned t_x_start = damage_extent->minx / 32; + unsigned t_x_end = damage_extent->maxx / 32; + unsigned t_y_start = damage_extent->miny / 32; + unsigned t_y_end = damage_extent->maxy / 32; + unsigned tile_count = + (t_x_end - t_x_start + 1) * (t_y_end - t_y_start + 1); + /* Don't bother passing a tile-enable-map if the amount of + * tiles to reload is to close to the total number of tiles. 
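/* Worked sketch of the tile-enable-map indexing built above: one bit per
 * 32x32-pixel tile, rows padded so the byte stride is a multiple of 64, and
 * bit index b = tile_y * stride * 8 + tile_x.  The byte-wise bit set below
 * is an assumption for the sketch (the driver uses its BITSET helpers), and
 * the rect is expected to be non-empty, as in the caller above. */
#include <stdint.h>

static unsigned
tile_map_stride_bytes(unsigned width_px)
{
   /* DIV_ROUND_UP(width, 32 * 8) bytes per row of tiles, aligned to 64. */
   unsigned bytes = (width_px + 32 * 8 - 1) / (32 * 8);
   return (bytes + 63) & ~63u;
}

static void
mark_damage_rect(uint8_t *map, unsigned stride_bytes, unsigned x, unsigned y,
                 unsigned w, unsigned h)
{
   for (unsigned ty = y / 32; ty <= (y + h - 1) / 32; ++ty) {
      for (unsigned tx = x / 32; tx <= (x + w - 1) / 32; ++tx) {
         unsigned b = ty * stride_bytes * 8 + tx;

         map[b / 8] |= 1u << (b % 8);
      }
   }
}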
+ */ + if (tile_count - enable_count < 10) + pres->damage.tile_map.enable = false; + } } static struct pipe_resource * panfrost_resource_create_with_modifier(struct pipe_screen *screen, - const struct pipe_resource *template, - uint64_t modifier) + const struct pipe_resource *template, + uint64_t modifier) { - struct panfrost_device *dev = pan_device(screen); + struct panfrost_device *dev = pan_device(screen); - struct panfrost_resource *so = CALLOC_STRUCT(panfrost_resource); - so->base = *template; - so->base.screen = screen; + struct panfrost_resource *so = CALLOC_STRUCT(panfrost_resource); + so->base = *template; + so->base.screen = screen; - pipe_reference_init(&so->base.reference, 1); + pipe_reference_init(&so->base.reference, 1); - util_range_init(&so->valid_buffer_range); + util_range_init(&so->valid_buffer_range); - if (template->bind & PAN_BIND_SHARED_MASK) { - /* For compatibility with older consumers that may not be - * modifiers aware, treat INVALID as LINEAR for shared - * resources. - */ - if (modifier == DRM_FORMAT_MOD_INVALID) - modifier = DRM_FORMAT_MOD_LINEAR; + if (template->bind & PAN_BIND_SHARED_MASK) { + /* For compatibility with older consumers that may not be + * modifiers aware, treat INVALID as LINEAR for shared + * resources. + */ + if (modifier == DRM_FORMAT_MOD_INVALID) + modifier = DRM_FORMAT_MOD_LINEAR; - /* At any rate, we can't change the modifier later for shared - * resources, since we have no way to propagate the modifier - * change. - */ - so->modifier_constant = true; - } + /* At any rate, we can't change the modifier later for shared + * resources, since we have no way to propagate the modifier + * change. + */ + so->modifier_constant = true; + } - panfrost_resource_setup(dev, so, modifier, template->format); + panfrost_resource_setup(dev, so, modifier, template->format); - /* Guess a label based on the bind */ - unsigned bind = template->bind; - const char *label = - (bind & PIPE_BIND_INDEX_BUFFER) ? "Index buffer" : - (bind & PIPE_BIND_SCANOUT) ? "Scanout" : - (bind & PIPE_BIND_DISPLAY_TARGET) ? "Display target" : - (bind & PIPE_BIND_SHARED) ? "Shared resource" : - (bind & PIPE_BIND_RENDER_TARGET) ? "Render target" : - (bind & PIPE_BIND_DEPTH_STENCIL) ? "Depth/stencil buffer" : - (bind & PIPE_BIND_SAMPLER_VIEW) ? "Texture" : - (bind & PIPE_BIND_VERTEX_BUFFER) ? "Vertex buffer" : - (bind & PIPE_BIND_CONSTANT_BUFFER) ? "Constant buffer" : - (bind & PIPE_BIND_GLOBAL) ? "Global memory" : - (bind & PIPE_BIND_SHADER_BUFFER) ? "Shader buffer" : - (bind & PIPE_BIND_SHADER_IMAGE) ? "Shader image" : - "Other resource"; + /* Guess a label based on the bind */ + unsigned bind = template->bind; + const char *label = (bind & PIPE_BIND_INDEX_BUFFER) ? "Index buffer" + : (bind & PIPE_BIND_SCANOUT) ? "Scanout" + : (bind & PIPE_BIND_DISPLAY_TARGET) ? "Display target" + : (bind & PIPE_BIND_SHARED) ? "Shared resource" + : (bind & PIPE_BIND_RENDER_TARGET) ? "Render target" + : (bind & PIPE_BIND_DEPTH_STENCIL) + ? "Depth/stencil buffer" + : (bind & PIPE_BIND_SAMPLER_VIEW) ? "Texture" + : (bind & PIPE_BIND_VERTEX_BUFFER) ? "Vertex buffer" + : (bind & PIPE_BIND_CONSTANT_BUFFER) ? "Constant buffer" + : (bind & PIPE_BIND_GLOBAL) ? "Global memory" + : (bind & PIPE_BIND_SHADER_BUFFER) ? "Shader buffer" + : (bind & PIPE_BIND_SHADER_IMAGE) ? 
"Shader image" + : "Other resource"; - if (dev->ro && (template->bind & PIPE_BIND_SCANOUT)) { - struct winsys_handle handle; - struct pan_block_size blocksize = panfrost_block_size(modifier, template->format); + if (dev->ro && (template->bind & PIPE_BIND_SCANOUT)) { + struct winsys_handle handle; + struct pan_block_size blocksize = + panfrost_block_size(modifier, template->format); - /* Block-based texture formats are only used for texture - * compression (not framebuffer compression!), which doesn't - * make sense to share across processes. - */ - assert(util_format_get_blockwidth(template->format) == 1); - assert(util_format_get_blockheight(template->format) == 1); + /* Block-based texture formats are only used for texture + * compression (not framebuffer compression!), which doesn't + * make sense to share across processes. + */ + assert(util_format_get_blockwidth(template->format) == 1); + assert(util_format_get_blockheight(template->format) == 1); - /* Present a resource with similar dimensions that, if allocated - * as a linear image, is big enough to fit the resource in the - * actual layout. For linear images, this is a no-op. For 16x16 - * tiling, this aligns the dimensions to 16x16. - * - * For AFBC, this aligns the width to the superblock width (as - * expected) and adds extra rows to account for the header. This - * is a bit of a lie, but it's the best we can do with dumb - * buffers, which are extremely not meant for AFBC. And yet this - * has to work anyway... - * - * Moral of the story: if you're reading this comment, that - * means you're working on WSI and so it's already too late for - * you. I'm sorry. - */ - unsigned width = ALIGN_POT(template->width0, blocksize.width); - unsigned stride = ALIGN_POT(template->width0, blocksize.width) * - util_format_get_blocksize(template->format); - unsigned size = so->image.layout.data_size; - unsigned effective_rows = DIV_ROUND_UP(size, stride); + /* Present a resource with similar dimensions that, if allocated + * as a linear image, is big enough to fit the resource in the + * actual layout. For linear images, this is a no-op. For 16x16 + * tiling, this aligns the dimensions to 16x16. + * + * For AFBC, this aligns the width to the superblock width (as + * expected) and adds extra rows to account for the header. This + * is a bit of a lie, but it's the best we can do with dumb + * buffers, which are extremely not meant for AFBC. And yet this + * has to work anyway... + * + * Moral of the story: if you're reading this comment, that + * means you're working on WSI and so it's already too late for + * you. I'm sorry. 
+ */ + unsigned width = ALIGN_POT(template->width0, blocksize.width); + unsigned stride = ALIGN_POT(template->width0, blocksize.width) * + util_format_get_blocksize(template->format); + unsigned size = so->image.layout.data_size; + unsigned effective_rows = DIV_ROUND_UP(size, stride); - struct pipe_resource scanout_tmpl = { - .target = so->base.target, - .format = template->format, - .width0 = width, - .height0 = effective_rows, - .depth0 = 1, - .array_size = 1, - }; + struct pipe_resource scanout_tmpl = { + .target = so->base.target, + .format = template->format, + .width0 = width, + .height0 = effective_rows, + .depth0 = 1, + .array_size = 1, + }; - so->scanout = - renderonly_scanout_for_resource(&scanout_tmpl, - dev->ro, - &handle); + so->scanout = + renderonly_scanout_for_resource(&scanout_tmpl, dev->ro, &handle); - if (!so->scanout) { - fprintf(stderr, "Failed to create scanout resource\n"); - free(so); - return NULL; - } - assert(handle.type == WINSYS_HANDLE_TYPE_FD); - so->image.data.bo = panfrost_bo_import(dev, handle.handle); - close(handle.handle); + if (!so->scanout) { + fprintf(stderr, "Failed to create scanout resource\n"); + free(so); + return NULL; + } + assert(handle.type == WINSYS_HANDLE_TYPE_FD); + so->image.data.bo = panfrost_bo_import(dev, handle.handle); + close(handle.handle); - if (!so->image.data.bo) { - free(so); - return NULL; - } - } else { - /* We create a BO immediately but don't bother mapping, since we don't - * care to map e.g. FBOs which the CPU probably won't touch */ + if (!so->image.data.bo) { + free(so); + return NULL; + } + } else { + /* We create a BO immediately but don't bother mapping, since we don't + * care to map e.g. FBOs which the CPU probably won't touch */ - so->image.data.bo = - panfrost_bo_create(dev, so->image.layout.data_size, PAN_BO_DELAY_MMAP, label); + so->image.data.bo = panfrost_bo_create(dev, so->image.layout.data_size, + PAN_BO_DELAY_MMAP, label); - so->constant_stencil = true; - } + so->constant_stencil = true; + } - if (drm_is_afbc(so->image.layout.modifier)) - panfrost_resource_init_afbc_headers(so); + if (drm_is_afbc(so->image.layout.modifier)) + panfrost_resource_init_afbc_headers(so); - panfrost_resource_set_damage_region(screen, &so->base, 0, NULL); + panfrost_resource_set_damage_region(screen, &so->base, 0, NULL); - if (template->bind & PIPE_BIND_INDEX_BUFFER) - so->index_cache = CALLOC_STRUCT(panfrost_minmax_cache); + if (template->bind & PIPE_BIND_INDEX_BUFFER) + so->index_cache = CALLOC_STRUCT(panfrost_minmax_cache); - return (struct pipe_resource *)so; + return (struct pipe_resource *)so; } /* Default is to create a resource as don't care */ @@ -770,8 +743,8 @@ static struct pipe_resource * panfrost_resource_create(struct pipe_screen *screen, const struct pipe_resource *template) { - return panfrost_resource_create_with_modifier(screen, template, - DRM_FORMAT_MOD_INVALID); + return panfrost_resource_create_with_modifier(screen, template, + DRM_FORMAT_MOD_INVALID); } /* If no modifier is specified, we'll choose. 
Otherwise, the order of @@ -779,39 +752,38 @@ panfrost_resource_create(struct pipe_screen *screen, static struct pipe_resource * panfrost_resource_create_with_modifiers(struct pipe_screen *screen, - const struct pipe_resource *template, - const uint64_t *modifiers, int count) + const struct pipe_resource *template, + const uint64_t *modifiers, int count) { - for (unsigned i = 0; i < PAN_MODIFIER_COUNT; ++i) { - if (drm_find_modifier(pan_best_modifiers[i], modifiers, count)) { - return panfrost_resource_create_with_modifier(screen, template, - pan_best_modifiers[i]); - } - } + for (unsigned i = 0; i < PAN_MODIFIER_COUNT; ++i) { + if (drm_find_modifier(pan_best_modifiers[i], modifiers, count)) { + return panfrost_resource_create_with_modifier(screen, template, + pan_best_modifiers[i]); + } + } - /* If we didn't find one, app specified invalid */ - assert(count == 1 && modifiers[0] == DRM_FORMAT_MOD_INVALID); - return panfrost_resource_create(screen, template); + /* If we didn't find one, app specified invalid */ + assert(count == 1 && modifiers[0] == DRM_FORMAT_MOD_INVALID); + return panfrost_resource_create(screen, template); } static void -panfrost_resource_destroy(struct pipe_screen *screen, - struct pipe_resource *pt) +panfrost_resource_destroy(struct pipe_screen *screen, struct pipe_resource *pt) { - struct panfrost_device *dev = pan_device(screen); - struct panfrost_resource *rsrc = (struct panfrost_resource *) pt; + struct panfrost_device *dev = pan_device(screen); + struct panfrost_resource *rsrc = (struct panfrost_resource *)pt; - if (rsrc->scanout) - renderonly_scanout_destroy(rsrc->scanout, dev->ro); + if (rsrc->scanout) + renderonly_scanout_destroy(rsrc->scanout, dev->ro); - if (rsrc->image.data.bo) - panfrost_bo_unreference(rsrc->image.data.bo); + if (rsrc->image.data.bo) + panfrost_bo_unreference(rsrc->image.data.bo); - free(rsrc->index_cache); - free(rsrc->damage.tile_map.data); + free(rsrc->index_cache); + free(rsrc->damage.tile_map.data); - util_range_destroy(&rsrc->valid_buffer_range); - free(rsrc); + util_range_destroy(&rsrc->valid_buffer_range); + free(rsrc); } /* Most of the time we can do CPU-side transfers, but sometimes we need to use @@ -820,417 +792,411 @@ panfrost_resource_destroy(struct pipe_screen *screen, static struct panfrost_resource * pan_alloc_staging(struct panfrost_context *ctx, struct panfrost_resource *rsc, - unsigned level, const struct pipe_box *box) + unsigned level, const struct pipe_box *box) { - struct pipe_context *pctx = &ctx->base; - struct pipe_resource tmpl = rsc->base; + struct pipe_context *pctx = &ctx->base; + struct pipe_resource tmpl = rsc->base; - tmpl.width0 = box->width; - tmpl.height0 = box->height; - /* for array textures, box->depth is the array_size, otherwise - * for 3d textures, it is the depth: - */ - if (tmpl.array_size > 1) { - if (tmpl.target == PIPE_TEXTURE_CUBE) - tmpl.target = PIPE_TEXTURE_2D_ARRAY; - tmpl.array_size = box->depth; - tmpl.depth0 = 1; - } else { - tmpl.array_size = 1; - tmpl.depth0 = box->depth; - } - tmpl.last_level = 0; - tmpl.bind |= PIPE_BIND_LINEAR; - tmpl.bind &= ~PAN_BIND_SHARED_MASK; + tmpl.width0 = box->width; + tmpl.height0 = box->height; + /* for array textures, box->depth is the array_size, otherwise + * for 3d textures, it is the depth: + */ + if (tmpl.array_size > 1) { + if (tmpl.target == PIPE_TEXTURE_CUBE) + tmpl.target = PIPE_TEXTURE_2D_ARRAY; + tmpl.array_size = box->depth; + tmpl.depth0 = 1; + } else { + tmpl.array_size = 1; + tmpl.depth0 = box->depth; + } + tmpl.last_level = 0; + 
tmpl.bind |= PIPE_BIND_LINEAR; + tmpl.bind &= ~PAN_BIND_SHARED_MASK; - struct pipe_resource *pstaging = - pctx->screen->resource_create(pctx->screen, &tmpl); - if (!pstaging) - return NULL; + struct pipe_resource *pstaging = + pctx->screen->resource_create(pctx->screen, &tmpl); + if (!pstaging) + return NULL; - return pan_resource(pstaging); + return pan_resource(pstaging); } static void -pan_blit_from_staging(struct pipe_context *pctx, struct panfrost_transfer *trans) +pan_blit_from_staging(struct pipe_context *pctx, + struct panfrost_transfer *trans) { - struct pipe_resource *dst = trans->base.resource; - struct pipe_blit_info blit = {0}; + struct pipe_resource *dst = trans->base.resource; + struct pipe_blit_info blit = {0}; - blit.dst.resource = dst; - blit.dst.format = dst->format; - blit.dst.level = trans->base.level; - blit.dst.box = trans->base.box; - blit.src.resource = trans->staging.rsrc; - blit.src.format = trans->staging.rsrc->format; - blit.src.level = 0; - blit.src.box = trans->staging.box; - blit.mask = util_format_get_mask(blit.src.format); - blit.filter = PIPE_TEX_FILTER_NEAREST; + blit.dst.resource = dst; + blit.dst.format = dst->format; + blit.dst.level = trans->base.level; + blit.dst.box = trans->base.box; + blit.src.resource = trans->staging.rsrc; + blit.src.format = trans->staging.rsrc->format; + blit.src.level = 0; + blit.src.box = trans->staging.box; + blit.mask = util_format_get_mask(blit.src.format); + blit.filter = PIPE_TEX_FILTER_NEAREST; - panfrost_blit(pctx, &blit); + panfrost_blit(pctx, &blit); } static void pan_blit_to_staging(struct pipe_context *pctx, struct panfrost_transfer *trans) { - struct pipe_resource *src = trans->base.resource; - struct pipe_blit_info blit = {0}; + struct pipe_resource *src = trans->base.resource; + struct pipe_blit_info blit = {0}; - blit.src.resource = src; - blit.src.format = src->format; - blit.src.level = trans->base.level; - blit.src.box = trans->base.box; - blit.dst.resource = trans->staging.rsrc; - blit.dst.format = trans->staging.rsrc->format; - blit.dst.level = 0; - blit.dst.box = trans->staging.box; - blit.mask = util_format_get_mask(blit.dst.format); - blit.filter = PIPE_TEX_FILTER_NEAREST; + blit.src.resource = src; + blit.src.format = src->format; + blit.src.level = trans->base.level; + blit.src.box = trans->base.box; + blit.dst.resource = trans->staging.rsrc; + blit.dst.format = trans->staging.rsrc->format; + blit.dst.level = 0; + blit.dst.box = trans->staging.box; + blit.mask = util_format_get_mask(blit.dst.format); + blit.filter = PIPE_TEX_FILTER_NEAREST; - panfrost_blit(pctx, &blit); + panfrost_blit(pctx, &blit); } static void panfrost_load_tiled_images(struct panfrost_transfer *transfer, struct panfrost_resource *rsrc) { - struct pipe_transfer *ptrans = &transfer->base; - unsigned level = ptrans->level; + struct pipe_transfer *ptrans = &transfer->base; + unsigned level = ptrans->level; - /* If the requested level of the image is uninitialized, it's not - * necessary to copy it. Leave the result uninitialized too. - */ - if (!BITSET_TEST(rsrc->valid.data, level)) - return; + /* If the requested level of the image is uninitialized, it's not + * necessary to copy it. Leave the result uninitialized too. 
+ */ + if (!BITSET_TEST(rsrc->valid.data, level)) + return; - struct panfrost_bo *bo = rsrc->image.data.bo; - unsigned stride = panfrost_get_layer_stride(&rsrc->image.layout, level); + struct panfrost_bo *bo = rsrc->image.data.bo; + unsigned stride = panfrost_get_layer_stride(&rsrc->image.layout, level); - /* Otherwise, load each layer separately, required to load from 3D and - * array textures. - */ - for (unsigned z = 0; z < ptrans->box.depth; ++z) { - void *dst = transfer->map + (ptrans->layer_stride * z); - uint8_t *map = bo->ptr.cpu + - rsrc->image.layout.slices[level].offset + - (z + ptrans->box.z) * stride; + /* Otherwise, load each layer separately, required to load from 3D and + * array textures. + */ + for (unsigned z = 0; z < ptrans->box.depth; ++z) { + void *dst = transfer->map + (ptrans->layer_stride * z); + uint8_t *map = bo->ptr.cpu + rsrc->image.layout.slices[level].offset + + (z + ptrans->box.z) * stride; - panfrost_load_tiled_image(dst, map, ptrans->box.x, - ptrans->box.y, ptrans->box.width, - ptrans->box.height, ptrans->stride, - rsrc->image.layout.slices[level].row_stride, - rsrc->image.layout.format); - } + panfrost_load_tiled_image(dst, map, ptrans->box.x, ptrans->box.y, + ptrans->box.width, ptrans->box.height, + ptrans->stride, + rsrc->image.layout.slices[level].row_stride, + rsrc->image.layout.format); + } } static void panfrost_store_tiled_images(struct panfrost_transfer *transfer, struct panfrost_resource *rsrc) { - struct panfrost_bo *bo = rsrc->image.data.bo; - struct pipe_transfer *ptrans = &transfer->base; - unsigned level = ptrans->level; - unsigned stride = panfrost_get_layer_stride(&rsrc->image.layout, level); + struct panfrost_bo *bo = rsrc->image.data.bo; + struct pipe_transfer *ptrans = &transfer->base; + unsigned level = ptrans->level; + unsigned stride = panfrost_get_layer_stride(&rsrc->image.layout, level); - /* Otherwise, store each layer separately, required to store to 3D and - * array textures. - */ - for (unsigned z = 0; z < ptrans->box.depth; ++z) { - void *src = transfer->map + (ptrans->layer_stride * z); - uint8_t *map = bo->ptr.cpu + - rsrc->image.layout.slices[level].offset + - (z + ptrans->box.z) * stride; + /* Otherwise, store each layer separately, required to store to 3D and + * array textures. 
+ */ + for (unsigned z = 0; z < ptrans->box.depth; ++z) { + void *src = transfer->map + (ptrans->layer_stride * z); + uint8_t *map = bo->ptr.cpu + rsrc->image.layout.slices[level].offset + + (z + ptrans->box.z) * stride; - panfrost_store_tiled_image(map, src, - ptrans->box.x, ptrans->box.y, - ptrans->box.width, ptrans->box.height, - rsrc->image.layout.slices[level].row_stride, - ptrans->stride, rsrc->image.layout.format); - } + panfrost_store_tiled_image(map, src, ptrans->box.x, ptrans->box.y, + ptrans->box.width, ptrans->box.height, + rsrc->image.layout.slices[level].row_stride, + ptrans->stride, rsrc->image.layout.format); + } } static bool panfrost_box_covers_resource(const struct pipe_resource *resource, const struct pipe_box *box) { - return resource->last_level == 0 && - util_texrange_covers_whole_level(resource, 0, box->x, box->y, - box->z, box->width, box->height, - box->depth); + return resource->last_level == 0 && + util_texrange_covers_whole_level(resource, 0, box->x, box->y, box->z, + box->width, box->height, box->depth); } static void * -panfrost_ptr_map(struct pipe_context *pctx, - struct pipe_resource *resource, - unsigned level, - unsigned usage, /* a combination of PIPE_MAP_x */ - const struct pipe_box *box, - struct pipe_transfer **out_transfer) +panfrost_ptr_map(struct pipe_context *pctx, struct pipe_resource *resource, + unsigned level, + unsigned usage, /* a combination of PIPE_MAP_x */ + const struct pipe_box *box, + struct pipe_transfer **out_transfer) { - struct panfrost_context *ctx = pan_context(pctx); - struct panfrost_device *dev = pan_device(pctx->screen); - struct panfrost_resource *rsrc = pan_resource(resource); - enum pipe_format format = rsrc->image.layout.format; - int bytes_per_block = util_format_get_blocksize(format); - struct panfrost_bo *bo = rsrc->image.data.bo; + struct panfrost_context *ctx = pan_context(pctx); + struct panfrost_device *dev = pan_device(pctx->screen); + struct panfrost_resource *rsrc = pan_resource(resource); + enum pipe_format format = rsrc->image.layout.format; + int bytes_per_block = util_format_get_blocksize(format); + struct panfrost_bo *bo = rsrc->image.data.bo; - /* Can't map tiled/compressed directly */ - if ((usage & PIPE_MAP_DIRECTLY) && rsrc->image.layout.modifier != DRM_FORMAT_MOD_LINEAR) - return NULL; + /* Can't map tiled/compressed directly */ + if ((usage & PIPE_MAP_DIRECTLY) && + rsrc->image.layout.modifier != DRM_FORMAT_MOD_LINEAR) + return NULL; - struct panfrost_transfer *transfer = rzalloc(pctx, struct panfrost_transfer); - transfer->base.level = level; - transfer->base.usage = usage; - transfer->base.box = *box; + struct panfrost_transfer *transfer = rzalloc(pctx, struct panfrost_transfer); + transfer->base.level = level; + transfer->base.usage = usage; + transfer->base.box = *box; - pipe_resource_reference(&transfer->base.resource, resource); - *out_transfer = &transfer->base; + pipe_resource_reference(&transfer->base.resource, resource); + *out_transfer = &transfer->base; - if (usage & PIPE_MAP_WRITE) - rsrc->constant_stencil = false; + if (usage & PIPE_MAP_WRITE) + rsrc->constant_stencil = false; - /* We don't have s/w routines for AFBC, so use a staging texture */ - if (drm_is_afbc(rsrc->image.layout.modifier)) { - struct panfrost_resource *staging = pan_alloc_staging(ctx, rsrc, level, box); - assert(staging); + /* We don't have s/w routines for AFBC, so use a staging texture */ + if (drm_is_afbc(rsrc->image.layout.modifier)) { + struct panfrost_resource *staging = + pan_alloc_staging(ctx, rsrc, level, 
box); + assert(staging); - /* Staging resources have one LOD: level 0. Query the strides - * on this LOD. - */ - transfer->base.stride = staging->image.layout.slices[0].row_stride; - transfer->base.layer_stride = - panfrost_get_layer_stride(&staging->image.layout, 0); + /* Staging resources have one LOD: level 0. Query the strides + * on this LOD. + */ + transfer->base.stride = staging->image.layout.slices[0].row_stride; + transfer->base.layer_stride = + panfrost_get_layer_stride(&staging->image.layout, 0); - transfer->staging.rsrc = &staging->base; + transfer->staging.rsrc = &staging->base; - transfer->staging.box = *box; - transfer->staging.box.x = 0; - transfer->staging.box.y = 0; - transfer->staging.box.z = 0; + transfer->staging.box = *box; + transfer->staging.box.x = 0; + transfer->staging.box.y = 0; + transfer->staging.box.z = 0; - assert(transfer->staging.rsrc != NULL); + assert(transfer->staging.rsrc != NULL); - bool valid = BITSET_TEST(rsrc->valid.data, level); + bool valid = BITSET_TEST(rsrc->valid.data, level); - if ((usage & PIPE_MAP_READ) && - (valid || panfrost_any_batch_writes_rsrc(ctx, rsrc))) { - pan_blit_to_staging(pctx, transfer); - panfrost_flush_writer(ctx, staging, "AFBC read staging blit"); - panfrost_bo_wait(staging->image.data.bo, INT64_MAX, false); - } + if ((usage & PIPE_MAP_READ) && + (valid || panfrost_any_batch_writes_rsrc(ctx, rsrc))) { + pan_blit_to_staging(pctx, transfer); + panfrost_flush_writer(ctx, staging, "AFBC read staging blit"); + panfrost_bo_wait(staging->image.data.bo, INT64_MAX, false); + } - panfrost_bo_mmap(staging->image.data.bo); - return staging->image.data.bo->ptr.cpu; - } + panfrost_bo_mmap(staging->image.data.bo); + return staging->image.data.bo->ptr.cpu; + } - /* If we haven't already mmaped, now's the time */ - panfrost_bo_mmap(bo); + /* If we haven't already mmaped, now's the time */ + panfrost_bo_mmap(bo); - if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) - pandecode_inject_mmap(bo->ptr.gpu, bo->ptr.cpu, bo->size, NULL); + if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) + pandecode_inject_mmap(bo->ptr.gpu, bo->ptr.cpu, bo->size, NULL); - /* Upgrade writes to uninitialized ranges to UNSYNCHRONIZED */ - if ((usage & PIPE_MAP_WRITE) && - resource->target == PIPE_BUFFER && - !util_ranges_intersect(&rsrc->valid_buffer_range, box->x, box->x + box->width)) { + /* Upgrade writes to uninitialized ranges to UNSYNCHRONIZED */ + if ((usage & PIPE_MAP_WRITE) && resource->target == PIPE_BUFFER && + !util_ranges_intersect(&rsrc->valid_buffer_range, box->x, + box->x + box->width)) { - usage |= PIPE_MAP_UNSYNCHRONIZED; - } + usage |= PIPE_MAP_UNSYNCHRONIZED; + } - /* Upgrade DISCARD_RANGE to WHOLE_RESOURCE if the whole resource is - * being mapped. - */ - if ((usage & PIPE_MAP_DISCARD_RANGE) && - !(usage & PIPE_MAP_UNSYNCHRONIZED) && - !(resource->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) && - panfrost_box_covers_resource(resource, box) && - !(rsrc->image.data.bo->flags & PAN_BO_SHARED)) { + /* Upgrade DISCARD_RANGE to WHOLE_RESOURCE if the whole resource is + * being mapped. 
+ */ + if ((usage & PIPE_MAP_DISCARD_RANGE) && !(usage & PIPE_MAP_UNSYNCHRONIZED) && + !(resource->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) && + panfrost_box_covers_resource(resource, box) && + !(rsrc->image.data.bo->flags & PAN_BO_SHARED)) { - usage |= PIPE_MAP_DISCARD_WHOLE_RESOURCE; - } + usage |= PIPE_MAP_DISCARD_WHOLE_RESOURCE; + } - bool create_new_bo = usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE; - bool copy_resource = false; + bool create_new_bo = usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE; + bool copy_resource = false; - if (!create_new_bo && - !(usage & PIPE_MAP_UNSYNCHRONIZED) && - !(resource->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) && - (usage & PIPE_MAP_WRITE) && - panfrost_any_batch_reads_rsrc(ctx, rsrc)) { - /* When a resource to be modified is already being used by a - * pending batch, it is often faster to copy the whole BO than - * to flush and split the frame in two. - */ + if (!create_new_bo && !(usage & PIPE_MAP_UNSYNCHRONIZED) && + !(resource->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) && + (usage & PIPE_MAP_WRITE) && panfrost_any_batch_reads_rsrc(ctx, rsrc)) { + /* When a resource to be modified is already being used by a + * pending batch, it is often faster to copy the whole BO than + * to flush and split the frame in two. + */ - panfrost_flush_writer(ctx, rsrc, "Shadow resource creation"); - panfrost_bo_wait(bo, INT64_MAX, false); + panfrost_flush_writer(ctx, rsrc, "Shadow resource creation"); + panfrost_bo_wait(bo, INT64_MAX, false); - create_new_bo = true; - copy_resource = !(usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE); - } + create_new_bo = true; + copy_resource = !(usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE); + } - /* Shadowing with separate stencil may require additional accounting. - * Bail in these exotic cases. - */ - if (rsrc->separate_stencil) { - create_new_bo = false; - copy_resource = false; - } + /* Shadowing with separate stencil may require additional accounting. + * Bail in these exotic cases. + */ + if (rsrc->separate_stencil) { + create_new_bo = false; + copy_resource = false; + } - if (create_new_bo) { - /* Make sure we re-emit any descriptors using this resource */ - panfrost_dirty_state_all(ctx); + if (create_new_bo) { + /* Make sure we re-emit any descriptors using this resource */ + panfrost_dirty_state_all(ctx); - /* If the BO is used by one of the pending batches or if it's - * not ready yet (still accessed by one of the already flushed - * batches), we try to allocate a new one to avoid waiting. - */ - if (panfrost_any_batch_reads_rsrc(ctx, rsrc) || - !panfrost_bo_wait(bo, 0, true)) { - /* We want the BO to be MMAPed. */ - uint32_t flags = bo->flags & ~PAN_BO_DELAY_MMAP; - struct panfrost_bo *newbo = NULL; + /* If the BO is used by one of the pending batches or if it's + * not ready yet (still accessed by one of the already flushed + * batches), we try to allocate a new one to avoid waiting. + */ + if (panfrost_any_batch_reads_rsrc(ctx, rsrc) || + !panfrost_bo_wait(bo, 0, true)) { + /* We want the BO to be MMAPed. */ + uint32_t flags = bo->flags & ~PAN_BO_DELAY_MMAP; + struct panfrost_bo *newbo = NULL; - /* When the BO has been imported/exported, we can't - * replace it by another one, otherwise the - * importer/exporter wouldn't see the change we're - * doing to it. - */ - if (!(bo->flags & PAN_BO_SHARED)) - newbo = panfrost_bo_create(dev, bo->size, - flags, bo->label); + /* When the BO has been imported/exported, we can't + * replace it by another one, otherwise the + * importer/exporter wouldn't see the change we're + * doing to it. 
+ */ + if (!(bo->flags & PAN_BO_SHARED)) + newbo = panfrost_bo_create(dev, bo->size, flags, bo->label); - if (newbo) { - if (copy_resource) - memcpy(newbo->ptr.cpu, rsrc->image.data.bo->ptr.cpu, bo->size); + if (newbo) { + if (copy_resource) + memcpy(newbo->ptr.cpu, rsrc->image.data.bo->ptr.cpu, bo->size); - /* Swap the pointers, dropping a reference to - * the old BO which is no longer referenced from - * the resource. - */ - panfrost_bo_unreference(rsrc->image.data.bo); - rsrc->image.data.bo = newbo; + /* Swap the pointers, dropping a reference to + * the old BO which is no longer referenced from + * the resource. + */ + panfrost_bo_unreference(rsrc->image.data.bo); + rsrc->image.data.bo = newbo; - if (!copy_resource && - drm_is_afbc(rsrc->image.layout.modifier)) - panfrost_resource_init_afbc_headers(rsrc); + if (!copy_resource && drm_is_afbc(rsrc->image.layout.modifier)) + panfrost_resource_init_afbc_headers(rsrc); - bo = newbo; - } else { - /* Allocation failed or was impossible, let's - * fall back on a flush+wait. - */ - panfrost_flush_batches_accessing_rsrc(ctx, rsrc, - "Resource access with high memory pressure"); - panfrost_bo_wait(bo, INT64_MAX, true); - } - } - } else if (!(usage & PIPE_MAP_UNSYNCHRONIZED)) { - if (usage & PIPE_MAP_WRITE) { - panfrost_flush_batches_accessing_rsrc(ctx, rsrc, "Synchronized write"); - panfrost_bo_wait(bo, INT64_MAX, true); - } else if (usage & PIPE_MAP_READ) { - panfrost_flush_writer(ctx, rsrc, "Synchronized read"); - panfrost_bo_wait(bo, INT64_MAX, false); - } - } + bo = newbo; + } else { + /* Allocation failed or was impossible, let's + * fall back on a flush+wait. + */ + panfrost_flush_batches_accessing_rsrc( + ctx, rsrc, "Resource access with high memory pressure"); + panfrost_bo_wait(bo, INT64_MAX, true); + } + } + } else if (!(usage & PIPE_MAP_UNSYNCHRONIZED)) { + if (usage & PIPE_MAP_WRITE) { + panfrost_flush_batches_accessing_rsrc(ctx, rsrc, "Synchronized write"); + panfrost_bo_wait(bo, INT64_MAX, true); + } else if (usage & PIPE_MAP_READ) { + panfrost_flush_writer(ctx, rsrc, "Synchronized read"); + panfrost_bo_wait(bo, INT64_MAX, false); + } + } - /* For access to compressed textures, we want the (x, y, w, h) - * region-of-interest in blocks, not pixels. Then we compute the stride - * between rows of blocks as the width in blocks times the width per - * block, etc. - */ - struct pipe_box box_blocks; - u_box_pixels_to_blocks(&box_blocks, box, format); + /* For access to compressed textures, we want the (x, y, w, h) + * region-of-interest in blocks, not pixels. Then we compute the stride + * between rows of blocks as the width in blocks times the width per + * block, etc. 
+ */ + struct pipe_box box_blocks; + u_box_pixels_to_blocks(&box_blocks, box, format); - if (rsrc->image.layout.modifier == DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED) { - transfer->base.stride = box_blocks.width * bytes_per_block; - transfer->base.layer_stride = transfer->base.stride * box_blocks.height; - transfer->map = ralloc_size(transfer, transfer->base.layer_stride * box->depth); + if (rsrc->image.layout.modifier == + DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED) { + transfer->base.stride = box_blocks.width * bytes_per_block; + transfer->base.layer_stride = transfer->base.stride * box_blocks.height; + transfer->map = + ralloc_size(transfer, transfer->base.layer_stride * box->depth); - if (usage & PIPE_MAP_READ) - panfrost_load_tiled_images(transfer, rsrc); + if (usage & PIPE_MAP_READ) + panfrost_load_tiled_images(transfer, rsrc); - return transfer->map; - } else { - assert (rsrc->image.layout.modifier == DRM_FORMAT_MOD_LINEAR); + return transfer->map; + } else { + assert(rsrc->image.layout.modifier == DRM_FORMAT_MOD_LINEAR); - /* Direct, persistent writes create holes in time for - * caching... I don't know if this is actually possible but we - * should still get it right */ + /* Direct, persistent writes create holes in time for + * caching... I don't know if this is actually possible but we + * should still get it right */ - unsigned dpw = PIPE_MAP_DIRECTLY | PIPE_MAP_WRITE | PIPE_MAP_PERSISTENT; + unsigned dpw = PIPE_MAP_DIRECTLY | PIPE_MAP_WRITE | PIPE_MAP_PERSISTENT; - if ((usage & dpw) == dpw && rsrc->index_cache) - return NULL; + if ((usage & dpw) == dpw && rsrc->index_cache) + return NULL; - transfer->base.stride = rsrc->image.layout.slices[level].row_stride; - transfer->base.layer_stride = - panfrost_get_layer_stride(&rsrc->image.layout, level); + transfer->base.stride = rsrc->image.layout.slices[level].row_stride; + transfer->base.layer_stride = + panfrost_get_layer_stride(&rsrc->image.layout, level); - /* By mapping direct-write, we're implicitly already - * initialized (maybe), so be conservative */ + /* By mapping direct-write, we're implicitly already + * initialized (maybe), so be conservative */ - if (usage & PIPE_MAP_WRITE) { - BITSET_SET(rsrc->valid.data, level); - panfrost_minmax_cache_invalidate(rsrc->index_cache, &transfer->base); - } + if (usage & PIPE_MAP_WRITE) { + BITSET_SET(rsrc->valid.data, level); + panfrost_minmax_cache_invalidate(rsrc->index_cache, &transfer->base); + } - return bo->ptr.cpu - + rsrc->image.layout.slices[level].offset - + box->z * transfer->base.layer_stride - + box_blocks.y * rsrc->image.layout.slices[level].row_stride - + box_blocks.x * bytes_per_block; - } + return bo->ptr.cpu + rsrc->image.layout.slices[level].offset + + box->z * transfer->base.layer_stride + + box_blocks.y * rsrc->image.layout.slices[level].row_stride + + box_blocks.x * bytes_per_block; + } } void pan_resource_modifier_convert(struct panfrost_context *ctx, - struct panfrost_resource *rsrc, - uint64_t modifier, const char *reason) + struct panfrost_resource *rsrc, uint64_t modifier, + const char *reason) { - assert(!rsrc->modifier_constant); + assert(!rsrc->modifier_constant); - perf_debug_ctx(ctx, "Disabling AFBC with a blit. Reason: %s", reason); + perf_debug_ctx(ctx, "Disabling AFBC with a blit. 
Reason: %s", reason); - struct pipe_resource *tmp_prsrc = - panfrost_resource_create_with_modifier( - ctx->base.screen, &rsrc->base, modifier); - struct panfrost_resource *tmp_rsrc = pan_resource(tmp_prsrc); + struct pipe_resource *tmp_prsrc = panfrost_resource_create_with_modifier( + ctx->base.screen, &rsrc->base, modifier); + struct panfrost_resource *tmp_rsrc = pan_resource(tmp_prsrc); - unsigned depth = rsrc->base.target == PIPE_TEXTURE_3D ? - rsrc->base.depth0 : rsrc->base.array_size; + unsigned depth = rsrc->base.target == PIPE_TEXTURE_3D + ? rsrc->base.depth0 + : rsrc->base.array_size; - struct pipe_box box = - { 0, 0, 0, rsrc->base.width0, rsrc->base.height0, depth }; + struct pipe_box box = {0, 0, 0, rsrc->base.width0, rsrc->base.height0, + depth}; - struct pipe_blit_info blit = { - .dst.resource = &tmp_rsrc->base, - .dst.format = tmp_rsrc->base.format, - .dst.box = box, - .src.resource = &rsrc->base, - .src.format = rsrc->base.format, - .src.box = box, - .mask = util_format_get_mask(tmp_rsrc->base.format), - .filter = PIPE_TEX_FILTER_NEAREST, - }; + struct pipe_blit_info blit = { + .dst.resource = &tmp_rsrc->base, + .dst.format = tmp_rsrc->base.format, + .dst.box = box, + .src.resource = &rsrc->base, + .src.format = rsrc->base.format, + .src.box = box, + .mask = util_format_get_mask(tmp_rsrc->base.format), + .filter = PIPE_TEX_FILTER_NEAREST, + }; - for (int i = 0; i <= rsrc->base.last_level; i++) { - if (BITSET_TEST(rsrc->valid.data, i)) { - blit.dst.level = blit.src.level = i; - panfrost_blit(&ctx->base, &blit); - } - } + for (int i = 0; i <= rsrc->base.last_level; i++) { + if (BITSET_TEST(rsrc->valid.data, i)) { + blit.dst.level = blit.src.level = i; + panfrost_blit(&ctx->base, &blit); + } + } - panfrost_bo_unreference(rsrc->image.data.bo); + panfrost_bo_unreference(rsrc->image.data.bo); - rsrc->image.data.bo = tmp_rsrc->image.data.bo; - panfrost_bo_reference(rsrc->image.data.bo); + rsrc->image.data.bo = tmp_rsrc->image.data.bo; + panfrost_bo_reference(rsrc->image.data.bo); - panfrost_resource_setup(pan_device(ctx->base.screen), rsrc, modifier, - blit.dst.format); - pipe_resource_reference(&tmp_prsrc, NULL); + panfrost_resource_setup(pan_device(ctx->base.screen), rsrc, modifier, + blit.dst.format); + pipe_resource_reference(&tmp_prsrc, NULL); } /* Validate that an AFBC resource may be used as a particular format. 
If it may @@ -1242,18 +1208,18 @@ pan_legalize_afbc_format(struct panfrost_context *ctx, struct panfrost_resource *rsrc, enum pipe_format format) { - struct panfrost_device *dev = pan_device(ctx->base.screen); + struct panfrost_device *dev = pan_device(ctx->base.screen); - if (!drm_is_afbc(rsrc->image.layout.modifier)) - return; + if (!drm_is_afbc(rsrc->image.layout.modifier)) + return; - if (panfrost_afbc_format(dev->arch, rsrc->base.format) == - panfrost_afbc_format(dev->arch, format)) - return; + if (panfrost_afbc_format(dev->arch, rsrc->base.format) == + panfrost_afbc_format(dev->arch, format)) + return; - pan_resource_modifier_convert(ctx, rsrc, - DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED, - "Reinterpreting AFBC surface as incompatible format"); + pan_resource_modifier_convert( + ctx, rsrc, DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED, + "Reinterpreting AFBC surface as incompatible format"); } static bool @@ -1261,271 +1227,259 @@ panfrost_should_linear_convert(struct panfrost_device *dev, struct panfrost_resource *prsrc, struct pipe_transfer *transfer) { - if (prsrc->modifier_constant) - return false; + if (prsrc->modifier_constant) + return false; - /* Overwriting the entire resource indicates streaming, for which - * linear layout is most efficient due to the lack of expensive - * conversion. - * - * For now we just switch to linear after a number of complete - * overwrites to keep things simple, but we could do better. - * - * This mechanism is only implemented for 2D resources. This suffices - * for video players, its intended use case. - */ + /* Overwriting the entire resource indicates streaming, for which + * linear layout is most efficient due to the lack of expensive + * conversion. + * + * For now we just switch to linear after a number of complete + * overwrites to keep things simple, but we could do better. + * + * This mechanism is only implemented for 2D resources. This suffices + * for video players, its intended use case. 
+ */ - bool entire_overwrite = - panfrost_is_2d(prsrc) && - prsrc->base.last_level == 0 && - transfer->box.width == prsrc->base.width0 && - transfer->box.height == prsrc->base.height0 && - transfer->box.x == 0 && - transfer->box.y == 0; + bool entire_overwrite = panfrost_is_2d(prsrc) && + prsrc->base.last_level == 0 && + transfer->box.width == prsrc->base.width0 && + transfer->box.height == prsrc->base.height0 && + transfer->box.x == 0 && transfer->box.y == 0; - if (entire_overwrite) - ++prsrc->modifier_updates; + if (entire_overwrite) + ++prsrc->modifier_updates; - if (prsrc->modifier_updates >= LAYOUT_CONVERT_THRESHOLD) { - perf_debug(dev, "Transitioning to linear due to streaming usage"); - return true; - } else { - return false; - } + if (prsrc->modifier_updates >= LAYOUT_CONVERT_THRESHOLD) { + perf_debug(dev, "Transitioning to linear due to streaming usage"); + return true; + } else { + return false; + } } static void -panfrost_ptr_unmap(struct pipe_context *pctx, - struct pipe_transfer *transfer) +panfrost_ptr_unmap(struct pipe_context *pctx, struct pipe_transfer *transfer) { - /* Gallium expects writeback here, so we tile */ + /* Gallium expects writeback here, so we tile */ - struct panfrost_transfer *trans = pan_transfer(transfer); - struct panfrost_resource *prsrc = (struct panfrost_resource *) transfer->resource; - struct panfrost_device *dev = pan_device(pctx->screen); + struct panfrost_transfer *trans = pan_transfer(transfer); + struct panfrost_resource *prsrc = + (struct panfrost_resource *)transfer->resource; + struct panfrost_device *dev = pan_device(pctx->screen); - if (transfer->usage & PIPE_MAP_WRITE) - prsrc->valid.crc = false; + if (transfer->usage & PIPE_MAP_WRITE) + prsrc->valid.crc = false; - /* AFBC will use a staging resource. `initialized` will be set when the - * fragment job is created; this is deferred to prevent useless surface - * reloads that can cascade into DATA_INVALID_FAULTs due to reading - * malformed AFBC data if uninitialized */ + /* AFBC will use a staging resource. 
`initialized` will be set when the + * fragment job is created; this is deferred to prevent useless surface + * reloads that can cascade into DATA_INVALID_FAULTs due to reading + * malformed AFBC data if uninitialized */ - if (trans->staging.rsrc) { - if (transfer->usage & PIPE_MAP_WRITE) { - if (panfrost_should_linear_convert(dev, prsrc, transfer)) { + if (trans->staging.rsrc) { + if (transfer->usage & PIPE_MAP_WRITE) { + if (panfrost_should_linear_convert(dev, prsrc, transfer)) { - panfrost_bo_unreference(prsrc->image.data.bo); + panfrost_bo_unreference(prsrc->image.data.bo); - panfrost_resource_setup(dev, prsrc, DRM_FORMAT_MOD_LINEAR, - prsrc->image.layout.format); + panfrost_resource_setup(dev, prsrc, DRM_FORMAT_MOD_LINEAR, + prsrc->image.layout.format); - prsrc->image.data.bo = pan_resource(trans->staging.rsrc)->image.data.bo; - panfrost_bo_reference(prsrc->image.data.bo); - } else { - pan_blit_from_staging(pctx, trans); - panfrost_flush_batches_accessing_rsrc(pan_context(pctx), - pan_resource(trans->staging.rsrc), - "AFBC write staging blit"); - } - } + prsrc->image.data.bo = + pan_resource(trans->staging.rsrc)->image.data.bo; + panfrost_bo_reference(prsrc->image.data.bo); + } else { + pan_blit_from_staging(pctx, trans); + panfrost_flush_batches_accessing_rsrc( + pan_context(pctx), pan_resource(trans->staging.rsrc), + "AFBC write staging blit"); + } + } - pipe_resource_reference(&trans->staging.rsrc, NULL); - } + pipe_resource_reference(&trans->staging.rsrc, NULL); + } - /* Tiling will occur in software from a staging cpu buffer */ - if (trans->map) { - struct panfrost_bo *bo = prsrc->image.data.bo; + /* Tiling will occur in software from a staging cpu buffer */ + if (trans->map) { + struct panfrost_bo *bo = prsrc->image.data.bo; - if (transfer->usage & PIPE_MAP_WRITE) { - BITSET_SET(prsrc->valid.data, transfer->level); + if (transfer->usage & PIPE_MAP_WRITE) { + BITSET_SET(prsrc->valid.data, transfer->level); - if (prsrc->image.layout.modifier == DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED) { - if (panfrost_should_linear_convert(dev, prsrc, transfer)) { - panfrost_resource_setup(dev, prsrc, DRM_FORMAT_MOD_LINEAR, - prsrc->image.layout.format); - if (prsrc->image.layout.data_size > bo->size) { - const char *label = bo->label; - panfrost_bo_unreference(bo); - bo = prsrc->image.data.bo = - panfrost_bo_create(dev, prsrc->image.layout.data_size, 0, label); - assert(bo); - } + if (prsrc->image.layout.modifier == + DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED) { + if (panfrost_should_linear_convert(dev, prsrc, transfer)) { + panfrost_resource_setup(dev, prsrc, DRM_FORMAT_MOD_LINEAR, + prsrc->image.layout.format); + if (prsrc->image.layout.data_size > bo->size) { + const char *label = bo->label; + panfrost_bo_unreference(bo); + bo = prsrc->image.data.bo = panfrost_bo_create( + dev, prsrc->image.layout.data_size, 0, label); + assert(bo); + } - util_copy_rect( - bo->ptr.cpu + prsrc->image.layout.slices[0].offset, - prsrc->base.format, - prsrc->image.layout.slices[0].row_stride, - 0, 0, - transfer->box.width, - transfer->box.height, - trans->map, - transfer->stride, - 0, 0); - } else { - panfrost_store_tiled_images(trans, prsrc); - } - } - } - } + util_copy_rect( + bo->ptr.cpu + prsrc->image.layout.slices[0].offset, + prsrc->base.format, prsrc->image.layout.slices[0].row_stride, + 0, 0, transfer->box.width, transfer->box.height, trans->map, + transfer->stride, 0, 0); + } else { + panfrost_store_tiled_images(trans, prsrc); + } + } + } + } + util_range_add(&prsrc->base, 
&prsrc->valid_buffer_range, transfer->box.x, + transfer->box.x + transfer->box.width); - util_range_add(&prsrc->base, &prsrc->valid_buffer_range, - transfer->box.x, - transfer->box.x + transfer->box.width); + panfrost_minmax_cache_invalidate(prsrc->index_cache, transfer); - panfrost_minmax_cache_invalidate(prsrc->index_cache, transfer); + /* Derefence the resource */ + pipe_resource_reference(&transfer->resource, NULL); - /* Derefence the resource */ - pipe_resource_reference(&transfer->resource, NULL); - - /* Transfer itself is RALLOCed at the moment */ - ralloc_free(transfer); + /* Transfer itself is RALLOCed at the moment */ + ralloc_free(transfer); } static void panfrost_ptr_flush_region(struct pipe_context *pctx, - struct pipe_transfer *transfer, - const struct pipe_box *box) + struct pipe_transfer *transfer, + const struct pipe_box *box) { - struct panfrost_resource *rsc = pan_resource(transfer->resource); + struct panfrost_resource *rsc = pan_resource(transfer->resource); - if (transfer->resource->target == PIPE_BUFFER) { - util_range_add(&rsc->base, &rsc->valid_buffer_range, - transfer->box.x + box->x, - transfer->box.x + box->x + box->width); - } else { - BITSET_SET(rsc->valid.data, transfer->level); - } + if (transfer->resource->target == PIPE_BUFFER) { + util_range_add(&rsc->base, &rsc->valid_buffer_range, + transfer->box.x + box->x, + transfer->box.x + box->x + box->width); + } else { + BITSET_SET(rsc->valid.data, transfer->level); + } } static void -panfrost_invalidate_resource(struct pipe_context *pctx, struct pipe_resource *prsrc) +panfrost_invalidate_resource(struct pipe_context *pctx, + struct pipe_resource *prsrc) { - struct panfrost_context *ctx = pan_context(pctx); - struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); - struct panfrost_resource *rsrc = pan_resource(prsrc); + struct panfrost_context *ctx = pan_context(pctx); + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); + struct panfrost_resource *rsrc = pan_resource(prsrc); - rsrc->constant_stencil = true; + rsrc->constant_stencil = true; - /* Handle the glInvalidateFramebuffer case */ - if (batch->key.zsbuf && batch->key.zsbuf->texture == prsrc) - batch->resolve &= ~PIPE_CLEAR_DEPTHSTENCIL; + /* Handle the glInvalidateFramebuffer case */ + if (batch->key.zsbuf && batch->key.zsbuf->texture == prsrc) + batch->resolve &= ~PIPE_CLEAR_DEPTHSTENCIL; - for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) { - struct pipe_surface *surf = batch->key.cbufs[i]; + for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) { + struct pipe_surface *surf = batch->key.cbufs[i]; - if (surf && surf->texture == prsrc) - batch->resolve &= ~(PIPE_CLEAR_COLOR0 << i); - } + if (surf && surf->texture == prsrc) + batch->resolve &= ~(PIPE_CLEAR_COLOR0 << i); + } } static enum pipe_format panfrost_resource_get_internal_format(struct pipe_resource *rsrc) { - struct panfrost_resource *prsrc = (struct panfrost_resource *) rsrc; - return prsrc->image.layout.format; + struct panfrost_resource *prsrc = (struct panfrost_resource *)rsrc; + return prsrc->image.layout.format; } static bool -panfrost_generate_mipmap( - struct pipe_context *pctx, - struct pipe_resource *prsrc, - enum pipe_format format, - unsigned base_level, - unsigned last_level, - unsigned first_layer, - unsigned last_layer) +panfrost_generate_mipmap(struct pipe_context *pctx, struct pipe_resource *prsrc, + enum pipe_format format, unsigned base_level, + unsigned last_level, unsigned first_layer, + unsigned last_layer) { - struct panfrost_resource *rsrc = 
pan_resource(prsrc); + struct panfrost_resource *rsrc = pan_resource(prsrc); - perf_debug_ctx(pan_context(pctx), "Unoptimized mipmap generation"); + perf_debug_ctx(pan_context(pctx), "Unoptimized mipmap generation"); - /* Generating a mipmap invalidates the written levels, so make that - * explicit so we don't try to wallpaper them back and end up with - * u_blitter recursion */ + /* Generating a mipmap invalidates the written levels, so make that + * explicit so we don't try to wallpaper them back and end up with + * u_blitter recursion */ - assert(rsrc->image.data.bo); - for (unsigned l = base_level + 1; l <= last_level; ++l) - BITSET_CLEAR(rsrc->valid.data, l); + assert(rsrc->image.data.bo); + for (unsigned l = base_level + 1; l <= last_level; ++l) + BITSET_CLEAR(rsrc->valid.data, l); - /* Beyond that, we just delegate the hard stuff. */ + /* Beyond that, we just delegate the hard stuff. */ - bool blit_res = util_gen_mipmap( - pctx, prsrc, format, - base_level, last_level, - first_layer, last_layer, - PIPE_TEX_FILTER_LINEAR); + bool blit_res = + util_gen_mipmap(pctx, prsrc, format, base_level, last_level, first_layer, + last_layer, PIPE_TEX_FILTER_LINEAR); - return blit_res; + return blit_res; } static void panfrost_resource_set_stencil(struct pipe_resource *prsrc, struct pipe_resource *stencil) { - pan_resource(prsrc)->separate_stencil = pan_resource(stencil); + pan_resource(prsrc)->separate_stencil = pan_resource(stencil); } static struct pipe_resource * panfrost_resource_get_stencil(struct pipe_resource *prsrc) { - if (!pan_resource(prsrc)->separate_stencil) - return NULL; + if (!pan_resource(prsrc)->separate_stencil) + return NULL; - return &pan_resource(prsrc)->separate_stencil->base; + return &pan_resource(prsrc)->separate_stencil->base; } static const struct u_transfer_vtbl transfer_vtbl = { - .resource_create = panfrost_resource_create, - .resource_destroy = panfrost_resource_destroy, - .transfer_map = panfrost_ptr_map, - .transfer_unmap = panfrost_ptr_unmap, - .transfer_flush_region = panfrost_ptr_flush_region, - .get_internal_format = panfrost_resource_get_internal_format, - .set_stencil = panfrost_resource_set_stencil, - .get_stencil = panfrost_resource_get_stencil, + .resource_create = panfrost_resource_create, + .resource_destroy = panfrost_resource_destroy, + .transfer_map = panfrost_ptr_map, + .transfer_unmap = panfrost_ptr_unmap, + .transfer_flush_region = panfrost_ptr_flush_region, + .get_internal_format = panfrost_resource_get_internal_format, + .set_stencil = panfrost_resource_set_stencil, + .get_stencil = panfrost_resource_get_stencil, }; void panfrost_resource_screen_init(struct pipe_screen *pscreen) { - pscreen->resource_create_with_modifiers = - panfrost_resource_create_with_modifiers; - pscreen->resource_create = u_transfer_helper_resource_create; - pscreen->resource_destroy = u_transfer_helper_resource_destroy; - pscreen->resource_from_handle = panfrost_resource_from_handle; - pscreen->resource_get_handle = panfrost_resource_get_handle; - pscreen->resource_get_param = panfrost_resource_get_param; - pscreen->transfer_helper = u_transfer_helper_create(&transfer_vtbl, - U_TRANSFER_HELPER_SEPARATE_Z32S8 | - U_TRANSFER_HELPER_MSAA_MAP); + pscreen->resource_create_with_modifiers = + panfrost_resource_create_with_modifiers; + pscreen->resource_create = u_transfer_helper_resource_create; + pscreen->resource_destroy = u_transfer_helper_resource_destroy; + pscreen->resource_from_handle = panfrost_resource_from_handle; + pscreen->resource_get_handle = 
panfrost_resource_get_handle; + pscreen->resource_get_param = panfrost_resource_get_param; + pscreen->transfer_helper = u_transfer_helper_create( + &transfer_vtbl, + U_TRANSFER_HELPER_SEPARATE_Z32S8 | U_TRANSFER_HELPER_MSAA_MAP); } void panfrost_resource_screen_destroy(struct pipe_screen *pscreen) { - u_transfer_helper_destroy(pscreen->transfer_helper); + u_transfer_helper_destroy(pscreen->transfer_helper); } void panfrost_resource_context_init(struct pipe_context *pctx) { - pctx->buffer_map = u_transfer_helper_transfer_map; - pctx->buffer_unmap = u_transfer_helper_transfer_unmap; - pctx->texture_map = u_transfer_helper_transfer_map; - pctx->texture_unmap = u_transfer_helper_transfer_unmap; - pctx->create_surface = panfrost_create_surface; - pctx->surface_destroy = panfrost_surface_destroy; - pctx->resource_copy_region = util_resource_copy_region; - pctx->blit = panfrost_blit; - pctx->generate_mipmap = panfrost_generate_mipmap; - pctx->flush_resource = panfrost_flush_resource; - pctx->invalidate_resource = panfrost_invalidate_resource; - pctx->transfer_flush_region = u_transfer_helper_transfer_flush_region; - pctx->buffer_subdata = u_default_buffer_subdata; - pctx->texture_subdata = u_default_texture_subdata; - pctx->clear_buffer = u_default_clear_buffer; - pctx->clear_render_target = panfrost_clear_render_target; - pctx->clear_depth_stencil = panfrost_clear_depth_stencil; + pctx->buffer_map = u_transfer_helper_transfer_map; + pctx->buffer_unmap = u_transfer_helper_transfer_unmap; + pctx->texture_map = u_transfer_helper_transfer_map; + pctx->texture_unmap = u_transfer_helper_transfer_unmap; + pctx->create_surface = panfrost_create_surface; + pctx->surface_destroy = panfrost_surface_destroy; + pctx->resource_copy_region = util_resource_copy_region; + pctx->blit = panfrost_blit; + pctx->generate_mipmap = panfrost_generate_mipmap; + pctx->flush_resource = panfrost_flush_resource; + pctx->invalidate_resource = panfrost_invalidate_resource; + pctx->transfer_flush_region = u_transfer_helper_transfer_flush_region; + pctx->buffer_subdata = u_default_buffer_subdata; + pctx->texture_subdata = u_default_texture_subdata; + pctx->clear_buffer = u_default_clear_buffer; + pctx->clear_render_target = panfrost_clear_render_target; + pctx->clear_depth_stencil = panfrost_clear_depth_stencil; } diff --git a/src/gallium/drivers/panfrost/pan_resource.h b/src/gallium/drivers/panfrost/pan_resource.h index eb39726c46e..c3d76d75bf3 100644 --- a/src/gallium/drivers/panfrost/pan_resource.h +++ b/src/gallium/drivers/panfrost/pan_resource.h @@ -22,87 +22,86 @@ * */ - #ifndef PAN_RESOURCE_H #define PAN_RESOURCE_H -#include "pan_screen.h" -#include "pan_minmax_cache.h" -#include "pan_texture.h" #include "drm-uapi/drm.h" #include "util/u_range.h" +#include "pan_minmax_cache.h" +#include "pan_screen.h" +#include "pan_texture.h" #define LAYOUT_CONVERT_THRESHOLD 8 -#define PAN_MAX_BATCHES 32 +#define PAN_MAX_BATCHES 32 -#define PAN_BIND_SHARED_MASK (PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | \ - PIPE_BIND_SHARED) +#define PAN_BIND_SHARED_MASK \ + (PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | PIPE_BIND_SHARED) struct panfrost_resource { - struct pipe_resource base; - struct { - struct pipe_scissor_state extent; - struct { - bool enable; - unsigned stride; - unsigned size; - BITSET_WORD *data; - } tile_map; - } damage; + struct pipe_resource base; + struct { + struct pipe_scissor_state extent; + struct { + bool enable; + unsigned stride; + unsigned size; + BITSET_WORD *data; + } tile_map; + } damage; - struct 
renderonly_scanout *scanout; + struct renderonly_scanout *scanout; - struct panfrost_resource *separate_stencil; + struct panfrost_resource *separate_stencil; - struct util_range valid_buffer_range; + struct util_range valid_buffer_range; - /* Description of the resource layout */ - struct pan_image image; + /* Description of the resource layout */ + struct pan_image image; - struct { - /* Is the checksum for this image valid? Implicitly refers to - * the first slice; we only checksum non-mipmapped 2D images */ - bool crc; + struct { + /* Is the checksum for this image valid? Implicitly refers to + * the first slice; we only checksum non-mipmapped 2D images */ + bool crc; - /* Has anything been written to this slice? */ - BITSET_DECLARE(data, MAX_MIP_LEVELS); - } valid; + /* Has anything been written to this slice? */ + BITSET_DECLARE(data, MAX_MIP_LEVELS); + } valid; - /* Whether the modifier can be changed */ - bool modifier_constant; + /* Whether the modifier can be changed */ + bool modifier_constant; - /* Used to decide when to convert to another modifier */ - uint16_t modifier_updates; + /* Used to decide when to convert to another modifier */ + uint16_t modifier_updates; - /* Do all pixels have the same stencil value? */ - bool constant_stencil; + /* Do all pixels have the same stencil value? */ + bool constant_stencil; - /* The stencil value if constant_stencil is set */ - uint8_t stencil_value; + /* The stencil value if constant_stencil is set */ + uint8_t stencil_value; - /* Cached min/max values for index buffers */ - struct panfrost_minmax_cache *index_cache; + /* Cached min/max values for index buffers */ + struct panfrost_minmax_cache *index_cache; }; static inline struct panfrost_resource * pan_resource(struct pipe_resource *p) { - return (struct panfrost_resource *)p; + return (struct panfrost_resource *)p; } struct panfrost_transfer { - struct pipe_transfer base; - void *map; - struct { - struct pipe_resource *rsrc; - struct pipe_box box; - } staging; + struct pipe_transfer base; + void *map; + struct { + struct pipe_resource *rsrc; + struct pipe_box box; + } staging; }; static inline struct panfrost_transfer * pan_transfer(struct pipe_transfer *p) { - return (struct panfrost_transfer *)p; + return (struct panfrost_transfer *)p; } void panfrost_resource_screen_init(struct pipe_screen *screen); @@ -113,53 +112,48 @@ void panfrost_resource_context_init(struct pipe_context *pctx); /* Blitting */ -void -panfrost_blitter_save(struct panfrost_context *ctx, bool render_cond); +void panfrost_blitter_save(struct panfrost_context *ctx, bool render_cond); -void -panfrost_blit(struct pipe_context *pipe, - const struct pipe_blit_info *info); +void panfrost_blit(struct pipe_context *pipe, + const struct pipe_blit_info *info); -void -panfrost_resource_set_damage_region(struct pipe_screen *screen, - struct pipe_resource *res, - unsigned int nrects, - const struct pipe_box *rects); +void panfrost_resource_set_damage_region(struct pipe_screen *screen, + struct pipe_resource *res, + unsigned int nrects, + const struct pipe_box *rects); static inline enum mali_texture_dimension -panfrost_translate_texture_dimension(enum pipe_texture_target t) { - switch (t) - { - case PIPE_BUFFER: - case PIPE_TEXTURE_1D: - case PIPE_TEXTURE_1D_ARRAY: - return MALI_TEXTURE_DIMENSION_1D; +panfrost_translate_texture_dimension(enum pipe_texture_target t) +{ + switch (t) { + case PIPE_BUFFER: + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_1D_ARRAY: + return MALI_TEXTURE_DIMENSION_1D; - case PIPE_TEXTURE_2D: - case 
PIPE_TEXTURE_2D_ARRAY: - case PIPE_TEXTURE_RECT: - return MALI_TEXTURE_DIMENSION_2D; + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_2D_ARRAY: + case PIPE_TEXTURE_RECT: + return MALI_TEXTURE_DIMENSION_2D; - case PIPE_TEXTURE_3D: - return MALI_TEXTURE_DIMENSION_3D; + case PIPE_TEXTURE_3D: + return MALI_TEXTURE_DIMENSION_3D; - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_CUBE_ARRAY: - return MALI_TEXTURE_DIMENSION_CUBE; + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + return MALI_TEXTURE_DIMENSION_CUBE; - default: - unreachable("Unknown target"); - } + default: + unreachable("Unknown target"); + } } -void -pan_resource_modifier_convert(struct panfrost_context *ctx, - struct panfrost_resource *rsrc, - uint64_t modifier, const char *reason); +void pan_resource_modifier_convert(struct panfrost_context *ctx, + struct panfrost_resource *rsrc, + uint64_t modifier, const char *reason); -void -pan_legalize_afbc_format(struct panfrost_context *ctx, - struct panfrost_resource *rsrc, - enum pipe_format format); +void pan_legalize_afbc_format(struct panfrost_context *ctx, + struct panfrost_resource *rsrc, + enum pipe_format format); #endif /* PAN_RESOURCE_H */ diff --git a/src/gallium/drivers/panfrost/pan_screen.c b/src/gallium/drivers/panfrost/pan_screen.c index cd83576c9e7..00ad57e03cf 100644 --- a/src/gallium/drivers/panfrost/pan_screen.c +++ b/src/gallium/drivers/panfrost/pan_screen.c @@ -26,31 +26,31 @@ * */ -#include "util/u_debug.h" -#include "util/u_memory.h" -#include "util/format/u_format.h" -#include "util/format/u_format_s3tc.h" -#include "util/u_video.h" -#include "util/u_screen.h" -#include "util/os_time.h" -#include "util/u_process.h" +#include "draw/draw_context.h" #include "pipe/p_defines.h" #include "pipe/p_screen.h" -#include "draw/draw_context.h" +#include "util/format/u_format.h" +#include "util/format/u_format_s3tc.h" +#include "util/os_time.h" +#include "util/u_debug.h" +#include "util/u_memory.h" +#include "util/u_process.h" +#include "util/u_screen.h" +#include "util/u_video.h" #include #include "drm-uapi/drm_fourcc.h" #include "drm-uapi/panfrost_drm.h" +#include "decode.h" #include "pan_bo.h" #include "pan_fence.h" -#include "pan_shader.h" -#include "pan_screen.h" -#include "pan_resource.h" #include "pan_public.h" +#include "pan_resource.h" +#include "pan_screen.h" +#include "pan_shader.h" #include "pan_util.h" -#include "decode.h" #include "pan_context.h" @@ -80,294 +80,294 @@ static const struct debug_named_value panfrost_debug_options[] = { static const char * panfrost_get_name(struct pipe_screen *screen) { - return pan_device(screen)->model->name; + return pan_device(screen)->model->name; } static const char * panfrost_get_vendor(struct pipe_screen *screen) { - return "Mesa"; + return "Mesa"; } static const char * panfrost_get_device_vendor(struct pipe_screen *screen) { - return "Arm"; + return "Arm"; } static int panfrost_get_param(struct pipe_screen *screen, enum pipe_cap param) { - struct panfrost_device *dev = pan_device(screen); + struct panfrost_device *dev = pan_device(screen); - /* Our GL 3.x implementation is WIP */ - bool is_gl3 = dev->debug & (PAN_DBG_GL3 | PAN_DBG_DEQP); + /* Our GL 3.x implementation is WIP */ + bool is_gl3 = dev->debug & (PAN_DBG_GL3 | PAN_DBG_DEQP); - /* Native MRT is introduced with v5 */ - bool has_mrt = (dev->arch >= 5); + /* Native MRT is introduced with v5 */ + bool has_mrt = (dev->arch >= 5); - /* Only kernel drivers >= 1.1 can allocate HEAP BOs */ - bool has_heap = dev->kernel_version->version_major > 1 || - 
dev->kernel_version->version_minor >= 1; + /* Only kernel drivers >= 1.1 can allocate HEAP BOs */ + bool has_heap = dev->kernel_version->version_major > 1 || + dev->kernel_version->version_minor >= 1; - switch (param) { - case PIPE_CAP_NPOT_TEXTURES: - case PIPE_CAP_MIXED_COLOR_DEPTH_BITS: - case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD: - case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: - case PIPE_CAP_DEPTH_CLIP_DISABLE: - case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE: - case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: - case PIPE_CAP_FRONTEND_NOOP: - case PIPE_CAP_SAMPLE_SHADING: - case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES: - case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: - case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: - case PIPE_CAP_SHADER_PACK_HALF_FLOAT: - return 1; + switch (param) { + case PIPE_CAP_NPOT_TEXTURES: + case PIPE_CAP_MIXED_COLOR_DEPTH_BITS: + case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD: + case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: + case PIPE_CAP_DEPTH_CLIP_DISABLE: + case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE: + case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: + case PIPE_CAP_FRONTEND_NOOP: + case PIPE_CAP_SAMPLE_SHADING: + case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES: + case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: + case PIPE_CAP_SHADER_PACK_HALF_FLOAT: + return 1; - case PIPE_CAP_MAX_RENDER_TARGETS: - case PIPE_CAP_FBFETCH: - case PIPE_CAP_FBFETCH_COHERENT: - return has_mrt ? 8 : 1; + case PIPE_CAP_MAX_RENDER_TARGETS: + case PIPE_CAP_FBFETCH: + case PIPE_CAP_FBFETCH_COHERENT: + return has_mrt ? 8 : 1; - case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: - return 1; + case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: + return 1; - case PIPE_CAP_OCCLUSION_QUERY: - case PIPE_CAP_PRIMITIVE_RESTART_FIXED_INDEX: - return true; + case PIPE_CAP_OCCLUSION_QUERY: + case PIPE_CAP_PRIMITIVE_RESTART_FIXED_INDEX: + return true; - case PIPE_CAP_ANISOTROPIC_FILTER: - return dev->revision >= dev->model->min_rev_anisotropic; + case PIPE_CAP_ANISOTROPIC_FILTER: + return dev->revision >= dev->model->min_rev_anisotropic; - /* Compile side is done for Bifrost, Midgard TODO. Needs some kernel - * work to turn on, since CYCLE_COUNT_START needs to be issued. In - * kbase, userspace requests this via BASE_JD_REQ_PERMON. There is not - * yet way to request this with mainline TODO */ - case PIPE_CAP_SHADER_CLOCK: - return 0; + /* Compile side is done for Bifrost, Midgard TODO. Needs some kernel + * work to turn on, since CYCLE_COUNT_START needs to be issued. In + * kbase, userspace requests this via BASE_JD_REQ_PERMON. 
There is not + * yet way to request this with mainline TODO */ + case PIPE_CAP_SHADER_CLOCK: + return 0; - case PIPE_CAP_VS_INSTANCEID: - case PIPE_CAP_TEXTURE_MULTISAMPLE: - case PIPE_CAP_SURFACE_SAMPLE_COUNT: - return true; + case PIPE_CAP_VS_INSTANCEID: + case PIPE_CAP_TEXTURE_MULTISAMPLE: + case PIPE_CAP_SURFACE_SAMPLE_COUNT: + return true; - case PIPE_CAP_SAMPLER_VIEW_TARGET: - case PIPE_CAP_CLIP_HALFZ: - case PIPE_CAP_TEXTURE_SWIZZLE: - case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE: - case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: - case PIPE_CAP_BLEND_EQUATION_SEPARATE: - case PIPE_CAP_INDEP_BLEND_ENABLE: - case PIPE_CAP_INDEP_BLEND_FUNC: - case PIPE_CAP_GENERATE_MIPMAP: - case PIPE_CAP_ACCELERATED: - case PIPE_CAP_UMA: - case PIPE_CAP_TEXTURE_FLOAT_LINEAR: - case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: - case PIPE_CAP_SHADER_ARRAY_COMPONENTS: - case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: - case PIPE_CAP_TEXTURE_BUFFER_SAMPLER: - case PIPE_CAP_PACKED_UNIFORMS: - case PIPE_CAP_IMAGE_LOAD_FORMATTED: - case PIPE_CAP_CUBE_MAP_ARRAY: - case PIPE_CAP_COMPUTE: - case PIPE_CAP_INT64: - return 1; + case PIPE_CAP_SAMPLER_VIEW_TARGET: + case PIPE_CAP_CLIP_HALFZ: + case PIPE_CAP_TEXTURE_SWIZZLE: + case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE: + case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: + case PIPE_CAP_BLEND_EQUATION_SEPARATE: + case PIPE_CAP_INDEP_BLEND_ENABLE: + case PIPE_CAP_INDEP_BLEND_FUNC: + case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_ACCELERATED: + case PIPE_CAP_UMA: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_SHADER_ARRAY_COMPONENTS: + case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: + case PIPE_CAP_TEXTURE_BUFFER_SAMPLER: + case PIPE_CAP_PACKED_UNIFORMS: + case PIPE_CAP_IMAGE_LOAD_FORMATTED: + case PIPE_CAP_CUBE_MAP_ARRAY: + case PIPE_CAP_COMPUTE: + case PIPE_CAP_INT64: + return 1; - /* We need this for OES_copy_image, but currently there are some awful - * interactions with AFBC that need to be worked out. */ - case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: - return 0; + /* We need this for OES_copy_image, but currently there are some awful + * interactions with AFBC that need to be worked out. */ + case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + return 0; - case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: - return PIPE_MAX_SO_BUFFERS; + case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: + return PIPE_MAX_SO_BUFFERS; - case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: - case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: - return PIPE_MAX_SO_OUTPUTS; + case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: + case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: + return PIPE_MAX_SO_OUTPUTS; - case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: - case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS: - return 1; + case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: + case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS: + return 1; - case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: - return 2048; + case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: + return 2048; - case PIPE_CAP_GLSL_FEATURE_LEVEL: - case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY: - return is_gl3 ? 330 : 140; - case PIPE_CAP_ESSL_FEATURE_LEVEL: - return dev->arch >= 6 ? 320 : 310; + case PIPE_CAP_GLSL_FEATURE_LEVEL: + case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY: + return is_gl3 ? 330 : 140; + case PIPE_CAP_ESSL_FEATURE_LEVEL: + return dev->arch >= 6 ? 
320 : 310; - case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: - return 16; + case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: + return 16; - /* v7 (only) restricts component orders with AFBC. To workaround, we - * compose format swizzles with texture swizzles. pan_texture.c motsly - * handles this but we need to fix up the border colour. - */ - case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: - if (dev->arch == 7) - return PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_FREEDRENO; - else - return 0; + /* v7 (only) restricts component orders with AFBC. To workaround, we + * compose format swizzles with texture swizzles. pan_texture.c motsly + * handles this but we need to fix up the border colour. + */ + case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: + if (dev->arch == 7) + return PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_FREEDRENO; + else + return 0; - case PIPE_CAP_MAX_TEXEL_BUFFER_ELEMENTS_UINT: - return 65536; + case PIPE_CAP_MAX_TEXEL_BUFFER_ELEMENTS_UINT: + return 65536; - /* Must be at least 64 for correct behaviour */ - case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: - return 64; + /* Must be at least 64 for correct behaviour */ + case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: + return 64; - case PIPE_CAP_QUERY_TIMESTAMP: - return is_gl3; + case PIPE_CAP_QUERY_TIMESTAMP: + return is_gl3; - /* The hardware requires element alignment for data conversion to work - * as expected. If data conversion is not required, this restriction is - * lifted on Midgard at a performance penalty. We conservatively - * require element alignment for vertex buffers, using u_vbuf to - * translate to match the hardware requirement. - * - * This is less heavy-handed than the 4BYTE_ALIGNED_ONLY caps, which - * would needlessly require alignment even for 8-bit formats. - */ - case PIPE_CAP_VERTEX_ATTRIB_ELEMENT_ALIGNED_ONLY: - return 1; + /* The hardware requires element alignment for data conversion to work + * as expected. If data conversion is not required, this restriction is + * lifted on Midgard at a performance penalty. We conservatively + * require element alignment for vertex buffers, using u_vbuf to + * translate to match the hardware requirement. + * + * This is less heavy-handed than the 4BYTE_ALIGNED_ONLY caps, which + * would needlessly require alignment even for 8-bit formats. + */ + case PIPE_CAP_VERTEX_ATTRIB_ELEMENT_ALIGNED_ONLY: + return 1; - case PIPE_CAP_MAX_TEXTURE_2D_SIZE: - return 1 << (MAX_MIP_LEVELS - 1); + case PIPE_CAP_MAX_TEXTURE_2D_SIZE: + return 1 << (MAX_MIP_LEVELS - 1); - case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: - case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: - return MAX_MIP_LEVELS; + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return MAX_MIP_LEVELS; - case PIPE_CAP_FS_COORD_ORIGIN_LOWER_LEFT: - case PIPE_CAP_FS_COORD_PIXEL_CENTER_INTEGER: - /* Hardware is upper left. Pixel center at (0.5, 0.5) */ - return 0; + case PIPE_CAP_FS_COORD_ORIGIN_LOWER_LEFT: + case PIPE_CAP_FS_COORD_PIXEL_CENTER_INTEGER: + /* Hardware is upper left. 
Pixel center at (0.5, 0.5) */ + return 0; - case PIPE_CAP_FS_COORD_ORIGIN_UPPER_LEFT: - case PIPE_CAP_FS_COORD_PIXEL_CENTER_HALF_INTEGER: - case PIPE_CAP_TGSI_TEXCOORD: - return 1; + case PIPE_CAP_FS_COORD_ORIGIN_UPPER_LEFT: + case PIPE_CAP_FS_COORD_PIXEL_CENTER_HALF_INTEGER: + case PIPE_CAP_TGSI_TEXCOORD: + return 1; - /* We would prefer varyings on Midgard, but proper sysvals on Bifrost */ - case PIPE_CAP_FS_FACE_IS_INTEGER_SYSVAL: - case PIPE_CAP_FS_POSITION_IS_SYSVAL: - case PIPE_CAP_FS_POINT_IS_SYSVAL: - return dev->arch >= 6; + /* We would prefer varyings on Midgard, but proper sysvals on Bifrost */ + case PIPE_CAP_FS_FACE_IS_INTEGER_SYSVAL: + case PIPE_CAP_FS_POSITION_IS_SYSVAL: + case PIPE_CAP_FS_POINT_IS_SYSVAL: + return dev->arch >= 6; - case PIPE_CAP_SEAMLESS_CUBE_MAP: - case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: - return true; + case PIPE_CAP_SEAMLESS_CUBE_MAP: + case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: + return true; - case PIPE_CAP_MAX_VERTEX_ELEMENT_SRC_OFFSET: - return 0xffff; + case PIPE_CAP_MAX_VERTEX_ELEMENT_SRC_OFFSET: + return 0xffff; - case PIPE_CAP_TEXTURE_TRANSFER_MODES: - return 0; + case PIPE_CAP_TEXTURE_TRANSFER_MODES: + return 0; - case PIPE_CAP_ENDIANNESS: - return PIPE_ENDIAN_NATIVE; + case PIPE_CAP_ENDIANNESS: + return PIPE_ENDIAN_NATIVE; - case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: - return 4; + case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: + return 4; - case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: - return -8; + case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: + return -8; - case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET: - return 7; + case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET: + return 7; - case PIPE_CAP_VIDEO_MEMORY: { - uint64_t system_memory; + case PIPE_CAP_VIDEO_MEMORY: { + uint64_t system_memory; - if (!os_get_total_physical_memory(&system_memory)) - return 0; + if (!os_get_total_physical_memory(&system_memory)) + return 0; - return (int)(system_memory >> 20); - } + return (int)(system_memory >> 20); + } - case PIPE_CAP_SHADER_STENCIL_EXPORT: - case PIPE_CAP_CONDITIONAL_RENDER: - case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: - return true; + case PIPE_CAP_SHADER_STENCIL_EXPORT: + case PIPE_CAP_CONDITIONAL_RENDER: + case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: + return true; - case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: - return 4; + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: + return 4; - case PIPE_CAP_MAX_VARYINGS: - /* Return the GLSL maximum. The internal maximum - * PAN_MAX_VARYINGS accommodates internal varyings. */ - return MAX_VARYING; + case PIPE_CAP_MAX_VARYINGS: + /* Return the GLSL maximum. The internal maximum + * PAN_MAX_VARYINGS accommodates internal varyings. */ + return MAX_VARYING; - /* Removed in v6 (Bifrost) */ - case PIPE_CAP_GL_CLAMP: - case PIPE_CAP_TEXTURE_MIRROR_CLAMP: - case PIPE_CAP_ALPHA_TEST: - return dev->arch <= 5; + /* Removed in v6 (Bifrost) */ + case PIPE_CAP_GL_CLAMP: + case PIPE_CAP_TEXTURE_MIRROR_CLAMP: + case PIPE_CAP_ALPHA_TEST: + return dev->arch <= 5; - /* Removed in v9 (Valhall). PRIMTIIVE_RESTART_FIXED_INDEX is of course - * still supported as it is core GLES3.0 functionality - */ - case PIPE_CAP_PRIMITIVE_RESTART: - return dev->arch <= 7; + /* Removed in v9 (Valhall). 
PRIMTIIVE_RESTART_FIXED_INDEX is of course + * still supported as it is core GLES3.0 functionality + */ + case PIPE_CAP_PRIMITIVE_RESTART: + return dev->arch <= 7; - case PIPE_CAP_FLATSHADE: - case PIPE_CAP_TWO_SIDED_COLOR: - case PIPE_CAP_CLIP_PLANES: - return 0; + case PIPE_CAP_FLATSHADE: + case PIPE_CAP_TWO_SIDED_COLOR: + case PIPE_CAP_CLIP_PLANES: + return 0; - case PIPE_CAP_PACKED_STREAM_OUTPUT: - return 0; + case PIPE_CAP_PACKED_STREAM_OUTPUT: + return 0; - case PIPE_CAP_VIEWPORT_TRANSFORM_LOWERED: - case PIPE_CAP_PSIZ_CLAMPED: - return 1; + case PIPE_CAP_VIEWPORT_TRANSFORM_LOWERED: + case PIPE_CAP_PSIZ_CLAMPED: + return 1; - case PIPE_CAP_NIR_IMAGES_AS_DEREF: - return 0; + case PIPE_CAP_NIR_IMAGES_AS_DEREF: + return 0; - case PIPE_CAP_DRAW_INDIRECT: - return has_heap; + case PIPE_CAP_DRAW_INDIRECT: + return has_heap; - case PIPE_CAP_START_INSTANCE: - case PIPE_CAP_DRAW_PARAMETERS: - return pan_is_bifrost(dev); + case PIPE_CAP_START_INSTANCE: + case PIPE_CAP_DRAW_PARAMETERS: + return pan_is_bifrost(dev); - case PIPE_CAP_SUPPORTED_PRIM_MODES: - case PIPE_CAP_SUPPORTED_PRIM_MODES_WITH_RESTART: { - /* Mali supports GLES and QUADS. Midgard and v6 Bifrost - * support more */ - uint32_t modes = BITFIELD_MASK(PIPE_PRIM_QUADS + 1); + case PIPE_CAP_SUPPORTED_PRIM_MODES: + case PIPE_CAP_SUPPORTED_PRIM_MODES_WITH_RESTART: { + /* Mali supports GLES and QUADS. Midgard and v6 Bifrost + * support more */ + uint32_t modes = BITFIELD_MASK(PIPE_PRIM_QUADS + 1); - if (dev->arch <= 6) { - modes |= BITFIELD_BIT(PIPE_PRIM_QUAD_STRIP); - modes |= BITFIELD_BIT(PIPE_PRIM_POLYGON); - } + if (dev->arch <= 6) { + modes |= BITFIELD_BIT(PIPE_PRIM_QUAD_STRIP); + modes |= BITFIELD_BIT(PIPE_PRIM_POLYGON); + } - if (dev->arch >= 9) { - /* Although Valhall is supposed to support quads, they - * don't seem to work correctly. Disable to fix - * arb-provoking-vertex-render. - */ - modes &= ~BITFIELD_BIT(PIPE_PRIM_QUADS); - } + if (dev->arch >= 9) { + /* Although Valhall is supposed to support quads, they + * don't seem to work correctly. Disable to fix + * arb-provoking-vertex-render. + */ + modes &= ~BITFIELD_BIT(PIPE_PRIM_QUADS); + } - return modes; - } + return modes; + } - case PIPE_CAP_IMAGE_STORE_FORMATTED: - return 1; + case PIPE_CAP_IMAGE_STORE_FORMATTED: + return 1; - case PIPE_CAP_NATIVE_FENCE_FD: - return 1; + case PIPE_CAP_NATIVE_FENCE_FD: + return 1; - default: - return u_pipe_screen_get_param_defaults(screen, param); - } + default: + return u_pipe_screen_get_param_defaults(screen, param); + } } static int @@ -375,163 +375,163 @@ panfrost_get_shader_param(struct pipe_screen *screen, enum pipe_shader_type shader, enum pipe_shader_cap param) { - struct panfrost_device *dev = pan_device(screen); - bool is_nofp16 = dev->debug & PAN_DBG_NOFP16; - bool is_deqp = dev->debug & PAN_DBG_DEQP; + struct panfrost_device *dev = pan_device(screen); + bool is_nofp16 = dev->debug & PAN_DBG_NOFP16; + bool is_deqp = dev->debug & PAN_DBG_DEQP; - switch (shader) { - case PIPE_SHADER_VERTEX: - case PIPE_SHADER_FRAGMENT: - case PIPE_SHADER_COMPUTE: - break; - default: - return 0; - } + switch (shader) { + case PIPE_SHADER_VERTEX: + case PIPE_SHADER_FRAGMENT: + case PIPE_SHADER_COMPUTE: + break; + default: + return 0; + } - /* We only allow observable side effects (memory writes) in compute and - * fragment shaders. Side effects in the geometry pipeline cause - * trouble with IDVS and conflict with our transform feedback lowering. 
- */ - bool allow_side_effects = (shader != PIPE_SHADER_VERTEX); + /* We only allow observable side effects (memory writes) in compute and + * fragment shaders. Side effects in the geometry pipeline cause + * trouble with IDVS and conflict with our transform feedback lowering. + */ + bool allow_side_effects = (shader != PIPE_SHADER_VERTEX); - switch (param) { - case PIPE_SHADER_CAP_MAX_INSTRUCTIONS: - case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS: - case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS: - case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS: - return 16384; /* arbitrary */ + switch (param) { + case PIPE_SHADER_CAP_MAX_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS: + return 16384; /* arbitrary */ - case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: - return 1024; /* arbitrary */ + case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: + return 1024; /* arbitrary */ - case PIPE_SHADER_CAP_MAX_INPUTS: - /* Used as ABI on Midgard */ - return 16; + case PIPE_SHADER_CAP_MAX_INPUTS: + /* Used as ABI on Midgard */ + return 16; - case PIPE_SHADER_CAP_MAX_OUTPUTS: - return shader == PIPE_SHADER_FRAGMENT ? 8 : PIPE_MAX_ATTRIBS; + case PIPE_SHADER_CAP_MAX_OUTPUTS: + return shader == PIPE_SHADER_FRAGMENT ? 8 : PIPE_MAX_ATTRIBS; - case PIPE_SHADER_CAP_MAX_TEMPS: - return 256; /* arbitrary */ + case PIPE_SHADER_CAP_MAX_TEMPS: + return 256; /* arbitrary */ - case PIPE_SHADER_CAP_MAX_CONST_BUFFER0_SIZE: - return 16 * 1024 * sizeof(float); + case PIPE_SHADER_CAP_MAX_CONST_BUFFER0_SIZE: + return 16 * 1024 * sizeof(float); - case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: - STATIC_ASSERT(PAN_MAX_CONST_BUFFERS < 0x100); - return PAN_MAX_CONST_BUFFERS; + case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: + STATIC_ASSERT(PAN_MAX_CONST_BUFFERS < 0x100); + return PAN_MAX_CONST_BUFFERS; - case PIPE_SHADER_CAP_CONT_SUPPORTED: - return 0; + case PIPE_SHADER_CAP_CONT_SUPPORTED: + return 0; - case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: - return 1; - case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: - return 0; + case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: + return 1; + case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: + return 0; - case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: - return dev->arch >= 6; + case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: + return dev->arch >= 6; - case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: - return 1; + case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: + return 1; - case PIPE_SHADER_CAP_SUBROUTINES: - return 0; + case PIPE_SHADER_CAP_SUBROUTINES: + return 0; - case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: - return 0; + case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: + return 0; - case PIPE_SHADER_CAP_INTEGERS: - return 1; + case PIPE_SHADER_CAP_INTEGERS: + return 1; - /* The Bifrost compiler supports full 16-bit. Midgard could but int16 - * support is untested, so restrict INT16 to Bifrost. Midgard - * architecturally cannot support fp16 derivatives. */ + /* The Bifrost compiler supports full 16-bit. Midgard could but int16 + * support is untested, so restrict INT16 to Bifrost. Midgard + * architecturally cannot support fp16 derivatives. */ - case PIPE_SHADER_CAP_FP16: - case PIPE_SHADER_CAP_GLSL_16BIT_CONSTS: - return !is_nofp16; - case PIPE_SHADER_CAP_FP16_DERIVATIVES: - case PIPE_SHADER_CAP_FP16_CONST_BUFFERS: - return dev->arch >= 6 && !is_nofp16; - case PIPE_SHADER_CAP_INT16: - /* XXX: Advertise this CAP when a proper fix to lower_precision - * lands. 
GLSL IR validation failure in glmark2 -bterrain */ - return dev->arch >= 6 && !is_nofp16 && is_deqp; + case PIPE_SHADER_CAP_FP16: + case PIPE_SHADER_CAP_GLSL_16BIT_CONSTS: + return !is_nofp16; + case PIPE_SHADER_CAP_FP16_DERIVATIVES: + case PIPE_SHADER_CAP_FP16_CONST_BUFFERS: + return dev->arch >= 6 && !is_nofp16; + case PIPE_SHADER_CAP_INT16: + /* XXX: Advertise this CAP when a proper fix to lower_precision + * lands. GLSL IR validation failure in glmark2 -bterrain */ + return dev->arch >= 6 && !is_nofp16 && is_deqp; - case PIPE_SHADER_CAP_INT64_ATOMICS: - case PIPE_SHADER_CAP_DROUND_SUPPORTED: - case PIPE_SHADER_CAP_DFRACEXP_DLDEXP_SUPPORTED: - case PIPE_SHADER_CAP_LDEXP_SUPPORTED: - case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: - return 0; + case PIPE_SHADER_CAP_INT64_ATOMICS: + case PIPE_SHADER_CAP_DROUND_SUPPORTED: + case PIPE_SHADER_CAP_DFRACEXP_DLDEXP_SUPPORTED: + case PIPE_SHADER_CAP_LDEXP_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: + return 0; - case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: - STATIC_ASSERT(PIPE_MAX_SAMPLERS < 0x10000); - return PIPE_MAX_SAMPLERS; + case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: + STATIC_ASSERT(PIPE_MAX_SAMPLERS < 0x10000); + return PIPE_MAX_SAMPLERS; - case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: - STATIC_ASSERT(PIPE_MAX_SHADER_SAMPLER_VIEWS < 0x10000); - return PIPE_MAX_SHADER_SAMPLER_VIEWS; + case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: + STATIC_ASSERT(PIPE_MAX_SHADER_SAMPLER_VIEWS < 0x10000); + return PIPE_MAX_SHADER_SAMPLER_VIEWS; - case PIPE_SHADER_CAP_PREFERRED_IR: - return PIPE_SHADER_IR_NIR; + case PIPE_SHADER_CAP_PREFERRED_IR: + return PIPE_SHADER_IR_NIR; - case PIPE_SHADER_CAP_SUPPORTED_IRS: - return (1 << PIPE_SHADER_IR_NIR); + case PIPE_SHADER_CAP_SUPPORTED_IRS: + return (1 << PIPE_SHADER_IR_NIR); - case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: - return allow_side_effects ? 16 : 0; + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: + return allow_side_effects ? 16 : 0; - case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: - return allow_side_effects ? PIPE_MAX_SHADER_IMAGES : 0; + case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: + return allow_side_effects ? 
PIPE_MAX_SHADER_IMAGES : 0; - case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: - case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: - return 0; + case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: + case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: + return 0; - default: - return 0; - } + default: + return 0; + } - return 0; + return 0; } static float panfrost_get_paramf(struct pipe_screen *screen, enum pipe_capf param) { - switch (param) { - case PIPE_CAPF_MIN_LINE_WIDTH: - case PIPE_CAPF_MIN_LINE_WIDTH_AA: - case PIPE_CAPF_MIN_POINT_SIZE: - case PIPE_CAPF_MIN_POINT_SIZE_AA: - return 1; + switch (param) { + case PIPE_CAPF_MIN_LINE_WIDTH: + case PIPE_CAPF_MIN_LINE_WIDTH_AA: + case PIPE_CAPF_MIN_POINT_SIZE: + case PIPE_CAPF_MIN_POINT_SIZE_AA: + return 1; - case PIPE_CAPF_POINT_SIZE_GRANULARITY: - case PIPE_CAPF_LINE_WIDTH_GRANULARITY: - return 0.0625; + case PIPE_CAPF_POINT_SIZE_GRANULARITY: + case PIPE_CAPF_LINE_WIDTH_GRANULARITY: + return 0.0625; - case PIPE_CAPF_MAX_LINE_WIDTH: - case PIPE_CAPF_MAX_LINE_WIDTH_AA: - case PIPE_CAPF_MAX_POINT_SIZE: - case PIPE_CAPF_MAX_POINT_SIZE_AA: - return 4095.9375; + case PIPE_CAPF_MAX_LINE_WIDTH: + case PIPE_CAPF_MAX_LINE_WIDTH_AA: + case PIPE_CAPF_MAX_POINT_SIZE: + case PIPE_CAPF_MAX_POINT_SIZE_AA: + return 4095.9375; - case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY: - return 16.0; + case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY: + return 16.0; - case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: - return 16.0; /* arbitrary */ + case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: + return 16.0; /* arbitrary */ - case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE: - case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE: - case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY: - return 0.0f; + case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE: + case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE: + case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY: + return 0.0f; - default: - debug_printf("Unexpected PIPE_CAPF %d query\n", param); - return 0.0; - } + default: + debug_printf("Unexpected PIPE_CAPF %d query\n", param); + return 0.0; + } } /** @@ -540,69 +540,64 @@ panfrost_get_paramf(struct pipe_screen *screen, enum pipe_capf param) * \param type one of PIPE_TEXTURE, PIPE_SURFACE */ static bool -panfrost_is_format_supported( struct pipe_screen *screen, - enum pipe_format format, - enum pipe_texture_target target, - unsigned sample_count, - unsigned storage_sample_count, - unsigned bind) +panfrost_is_format_supported(struct pipe_screen *screen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned sample_count, + unsigned storage_sample_count, unsigned bind) { - struct panfrost_device *dev = pan_device(screen); + struct panfrost_device *dev = pan_device(screen); - assert(target == PIPE_BUFFER || - target == PIPE_TEXTURE_1D || - target == PIPE_TEXTURE_1D_ARRAY || - target == PIPE_TEXTURE_2D || - target == PIPE_TEXTURE_2D_ARRAY || - target == PIPE_TEXTURE_RECT || - target == PIPE_TEXTURE_3D || - target == PIPE_TEXTURE_CUBE || - target == PIPE_TEXTURE_CUBE_ARRAY); + assert(target == PIPE_BUFFER || target == PIPE_TEXTURE_1D || + target == PIPE_TEXTURE_1D_ARRAY || target == PIPE_TEXTURE_2D || + target == PIPE_TEXTURE_2D_ARRAY || target == PIPE_TEXTURE_RECT || + target == PIPE_TEXTURE_3D || target == PIPE_TEXTURE_CUBE || + target == PIPE_TEXTURE_CUBE_ARRAY); - /* MSAA 2x gets rounded up to 4x. MSAA 8x/16x only supported on v5+. - * TODO: debug MSAA 8x/16x */ + /* MSAA 2x gets rounded up to 4x. MSAA 8x/16x only supported on v5+. 
+ * TODO: debug MSAA 8x/16x */ - switch (sample_count) { - case 0: - case 1: - case 4: - break; - case 8: - case 16: - if (dev->debug & PAN_DBG_MSAA16) - break; - else - return false; - default: - return false; - } + switch (sample_count) { + case 0: + case 1: + case 4: + break; + case 8: + case 16: + if (dev->debug & PAN_DBG_MSAA16) + break; + else + return false; + default: + return false; + } - if (MAX2(sample_count, 1) != MAX2(storage_sample_count, 1)) - return false; + if (MAX2(sample_count, 1) != MAX2(storage_sample_count, 1)) + return false; - /* Z16 causes dEQP failures on t720 */ - if (format == PIPE_FORMAT_Z16_UNORM && dev->arch <= 4) - return false; + /* Z16 causes dEQP failures on t720 */ + if (format == PIPE_FORMAT_Z16_UNORM && dev->arch <= 4) + return false; - /* Check we support the format with the given bind */ + /* Check we support the format with the given bind */ - unsigned relevant_bind = bind & - ( PIPE_BIND_DEPTH_STENCIL | PIPE_BIND_RENDER_TARGET - | PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_SAMPLER_VIEW); + unsigned relevant_bind = + bind & (PIPE_BIND_DEPTH_STENCIL | PIPE_BIND_RENDER_TARGET | + PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_SAMPLER_VIEW); - struct panfrost_format fmt = dev->formats[format]; + struct panfrost_format fmt = dev->formats[format]; - /* Also check that compressed texture formats are supported on this - * particular chip. They may not be depending on system integration - * differences. */ + /* Also check that compressed texture formats are supported on this + * particular chip. They may not be depending on system integration + * differences. */ - bool supported = panfrost_supports_compressed_format(dev, - MALI_EXTRACT_INDEX(fmt.hw)); + bool supported = + panfrost_supports_compressed_format(dev, MALI_EXTRACT_INDEX(fmt.hw)); - if (!supported) - return false; + if (!supported) + return false; - return MALI_EXTRACT_INDEX(fmt.hw) && ((relevant_bind & ~fmt.bind) == 0); + return MALI_EXTRACT_INDEX(fmt.hw) && ((relevant_bind & ~fmt.bind) == 0); } /* We always support linear and tiled operations, both external and internal. 
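The sample-count policy in panfrost_is_format_supported above is: 1x and 4x are always accepted (2x is rounded up to 4x), the storage sample count must match the sample count, and 8x/16x are only allowed when the PAN_DBG_MSAA16 debug flag is set. A minimal standalone sketch of just that gating, using a hypothetical helper name that is not part of the driver, might look like the following; it only restates the switch in the hunk above and is not a drop-in replacement for the real check (which also validates the format, bind flags, and compressed-format support).

#include <stdbool.h>

/* Illustrative sketch only: mirrors the sample-count gating above.
 * 0/1/4 samples are always allowed, 8/16 only behind the MSAA16 debug
 * flag, anything else is rejected. */
static bool
sample_count_allowed(unsigned sample_count, bool msaa16_debug)
{
   switch (sample_count) {
   case 0: /* no multisampling requested */
   case 1:
   case 4:
      return true;
   case 8:
   case 16:
      return msaa16_debug;
   default:
      return false;
   }
}
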
@@ -611,175 +606,180 @@ panfrost_is_format_supported( struct pipe_screen *screen, static void panfrost_walk_dmabuf_modifiers(struct pipe_screen *screen, - enum pipe_format format, int max, uint64_t *modifiers, unsigned - int *external_only, int *out_count, uint64_t test_modifier) + enum pipe_format format, int max, + uint64_t *modifiers, unsigned int *external_only, + int *out_count, uint64_t test_modifier) { - /* Query AFBC status */ - struct panfrost_device *dev = pan_device(screen); - bool afbc = dev->has_afbc && panfrost_format_supports_afbc(dev, format); - bool ytr = panfrost_afbc_can_ytr(format); - bool tiled_afbc = panfrost_afbc_can_tile(dev); + /* Query AFBC status */ + struct panfrost_device *dev = pan_device(screen); + bool afbc = dev->has_afbc && panfrost_format_supports_afbc(dev, format); + bool ytr = panfrost_afbc_can_ytr(format); + bool tiled_afbc = panfrost_afbc_can_tile(dev); - unsigned count = 0; + unsigned count = 0; - for (unsigned i = 0; i < PAN_MODIFIER_COUNT; ++i) { - if (drm_is_afbc(pan_best_modifiers[i]) && !afbc) - continue; + for (unsigned i = 0; i < PAN_MODIFIER_COUNT; ++i) { + if (drm_is_afbc(pan_best_modifiers[i]) && !afbc) + continue; - if ((pan_best_modifiers[i] & AFBC_FORMAT_MOD_YTR) && !ytr) - continue; + if ((pan_best_modifiers[i] & AFBC_FORMAT_MOD_YTR) && !ytr) + continue; - if ((pan_best_modifiers[i] & AFBC_FORMAT_MOD_TILED) && !tiled_afbc) - continue; + if ((pan_best_modifiers[i] & AFBC_FORMAT_MOD_TILED) && !tiled_afbc) + continue; - if (test_modifier != DRM_FORMAT_MOD_INVALID && - test_modifier != pan_best_modifiers[i]) - continue; + if (test_modifier != DRM_FORMAT_MOD_INVALID && + test_modifier != pan_best_modifiers[i]) + continue; - count++; + count++; - if (max > (int) count) { - modifiers[count] = pan_best_modifiers[i]; + if (max > (int)count) { + modifiers[count] = pan_best_modifiers[i]; - if (external_only) - external_only[count] = false; - } - } + if (external_only) + external_only[count] = false; + } + } - *out_count = count; + *out_count = count; } static void panfrost_query_dmabuf_modifiers(struct pipe_screen *screen, - enum pipe_format format, int max, uint64_t *modifiers, unsigned - int *external_only, int *out_count) + enum pipe_format format, int max, + uint64_t *modifiers, + unsigned int *external_only, int *out_count) { - panfrost_walk_dmabuf_modifiers(screen, format, max, modifiers, - external_only, out_count, DRM_FORMAT_MOD_INVALID); + panfrost_walk_dmabuf_modifiers(screen, format, max, modifiers, external_only, + out_count, DRM_FORMAT_MOD_INVALID); } static bool panfrost_is_dmabuf_modifier_supported(struct pipe_screen *screen, - uint64_t modifier, enum pipe_format format, - bool *external_only) + uint64_t modifier, + enum pipe_format format, + bool *external_only) { - uint64_t unused; - unsigned int uint_extern_only = 0; - int count; + uint64_t unused; + unsigned int uint_extern_only = 0; + int count; - panfrost_walk_dmabuf_modifiers(screen, format, 1, &unused, - &uint_extern_only, &count, modifier); + panfrost_walk_dmabuf_modifiers(screen, format, 1, &unused, &uint_extern_only, + &count, modifier); - if (external_only) - *external_only = uint_extern_only ? true : false; + if (external_only) + *external_only = uint_extern_only ? 
true : false; - return count > 0; + return count > 0; } static int -panfrost_get_compute_param(struct pipe_screen *pscreen, enum pipe_shader_ir ir_type, - enum pipe_compute_cap param, void *ret) +panfrost_get_compute_param(struct pipe_screen *pscreen, + enum pipe_shader_ir ir_type, + enum pipe_compute_cap param, void *ret) { - struct panfrost_device *dev = pan_device(pscreen); - const char * const ir = "panfrost"; + struct panfrost_device *dev = pan_device(pscreen); + const char *const ir = "panfrost"; -#define RET(x) do { \ - if (ret) \ - memcpy(ret, x, sizeof(x)); \ - return sizeof(x); \ -} while (0) +#define RET(x) \ + do { \ + if (ret) \ + memcpy(ret, x, sizeof(x)); \ + return sizeof(x); \ + } while (0) - switch (param) { - case PIPE_COMPUTE_CAP_ADDRESS_BITS: - RET((uint32_t []){ 64 }); + switch (param) { + case PIPE_COMPUTE_CAP_ADDRESS_BITS: + RET((uint32_t[]){64}); - case PIPE_COMPUTE_CAP_IR_TARGET: - if (ret) - sprintf(ret, "%s", ir); - return strlen(ir) * sizeof(char); + case PIPE_COMPUTE_CAP_IR_TARGET: + if (ret) + sprintf(ret, "%s", ir); + return strlen(ir) * sizeof(char); - case PIPE_COMPUTE_CAP_GRID_DIMENSION: - RET((uint64_t []) { 3 }); + case PIPE_COMPUTE_CAP_GRID_DIMENSION: + RET((uint64_t[]){3}); - case PIPE_COMPUTE_CAP_MAX_GRID_SIZE: - RET(((uint64_t []) { 65535, 65535, 65535 })); + case PIPE_COMPUTE_CAP_MAX_GRID_SIZE: + RET(((uint64_t[]){65535, 65535, 65535})); - case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE: - /* Unpredictable behaviour at larger sizes. Mali-G52 advertises - * 384x384x384. - * - * On Midgard, we don't allow more than 128 threads in each - * direction to match PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK. - * That still exceeds the minimum-maximum. - */ - if (dev->arch >= 6) - RET(((uint64_t []) { 256, 256, 256 })); - else - RET(((uint64_t []) { 128, 128, 128 })); + case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE: + /* Unpredictable behaviour at larger sizes. Mali-G52 advertises + * 384x384x384. + * + * On Midgard, we don't allow more than 128 threads in each + * direction to match PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK. + * That still exceeds the minimum-maximum. + */ + if (dev->arch >= 6) + RET(((uint64_t[]){256, 256, 256})); + else + RET(((uint64_t[]){128, 128, 128})); - case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: - /* On Bifrost and newer, all GPUs can support at least 256 threads - * regardless of register usage, so we report 256. - * - * On Midgard, with maximum register usage, the maximum - * thread count is only 64. We would like to report 64 here, but - * the GLES3.1 spec minimum is 128, so we report 128 and limit - * the register allocation of affected compute kernels. - */ - RET((uint64_t []) { dev->arch >= 6 ? 256 : 128 }); + case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: + /* On Bifrost and newer, all GPUs can support at least 256 threads + * regardless of register usage, so we report 256. + * + * On Midgard, with maximum register usage, the maximum + * thread count is only 64. We would like to report 64 here, but + * the GLES3.1 spec minimum is 128, so we report 128 and limit + * the register allocation of affected compute kernels. + */ + RET((uint64_t[]){dev->arch >= 6 ? 
256 : 128}); - case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: - RET((uint64_t []) { 1024*1024*512 /* Maybe get memory */ }); + case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: + RET((uint64_t[]){1024 * 1024 * 512 /* Maybe get memory */}); - case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: - RET((uint64_t []) { 32768 }); + case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: + RET((uint64_t[]){32768}); - case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: - case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: - RET((uint64_t []) { 4096 }); + case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: + case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: + RET((uint64_t[]){4096}); - case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE: - RET((uint64_t []) { 1024*1024*512 /* Maybe get memory */ }); + case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE: + RET((uint64_t[]){1024 * 1024 * 512 /* Maybe get memory */}); - case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY: - RET((uint32_t []) { 800 /* MHz -- TODO */ }); + case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY: + RET((uint32_t[]){800 /* MHz -- TODO */}); - case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS: - RET((uint32_t []) { dev->core_count }); + case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS: + RET((uint32_t[]){dev->core_count}); - case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED: - RET((uint32_t []) { 1 }); + case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED: + RET((uint32_t[]){1}); - case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: - RET((uint32_t []) { pan_subgroup_size(dev->arch) }); + case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: + RET((uint32_t[]){pan_subgroup_size(dev->arch)}); - case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: - RET((uint64_t []) { 1024 }); // TODO - } + case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: + RET((uint64_t[]){1024}); // TODO + } - return 0; + return 0; } static void panfrost_destroy_screen(struct pipe_screen *pscreen) { - struct panfrost_device *dev = pan_device(pscreen); - struct panfrost_screen *screen = pan_screen(pscreen); + struct panfrost_device *dev = pan_device(pscreen); + struct panfrost_screen *screen = pan_screen(pscreen); - panfrost_resource_screen_destroy(pscreen); - panfrost_pool_cleanup(&screen->blitter.bin_pool); - panfrost_pool_cleanup(&screen->blitter.desc_pool); - pan_blend_shaders_cleanup(dev); + panfrost_resource_screen_destroy(pscreen); + panfrost_pool_cleanup(&screen->blitter.bin_pool); + panfrost_pool_cleanup(&screen->blitter.desc_pool); + pan_blend_shaders_cleanup(dev); - if (screen->vtbl.screen_destroy) - screen->vtbl.screen_destroy(pscreen); + if (screen->vtbl.screen_destroy) + screen->vtbl.screen_destroy(pscreen); - if (dev->ro) - dev->ro->destroy(dev->ro); - panfrost_close_device(dev); + if (dev->ro) + dev->ro->destroy(dev->ro); + panfrost_close_device(dev); - disk_cache_destroy(screen->disk_cache); - ralloc_free(pscreen); + disk_cache_destroy(screen->disk_cache); + ralloc_free(pscreen); } static const void * @@ -787,104 +787,104 @@ panfrost_screen_get_compiler_options(struct pipe_screen *pscreen, enum pipe_shader_ir ir, enum pipe_shader_type shader) { - return pan_screen(pscreen)->vtbl.get_compiler_options(); + return pan_screen(pscreen)->vtbl.get_compiler_options(); } static struct disk_cache * panfrost_get_disk_shader_cache(struct pipe_screen *pscreen) { - return pan_screen(pscreen)->disk_cache; + return pan_screen(pscreen)->disk_cache; } int panfrost_get_driver_query_info(struct pipe_screen *pscreen, unsigned index, struct pipe_driver_query_info *info) { - int num_queries = ARRAY_SIZE(panfrost_driver_query_list); + int num_queries = ARRAY_SIZE(panfrost_driver_query_list); - if (!info) - return num_queries; + if (!info) + return num_queries; - if (index >= 
num_queries) - return 0; + if (index >= num_queries) + return 0; - *info = panfrost_driver_query_list[index]; + *info = panfrost_driver_query_list[index]; - return 1; + return 1; } - struct pipe_screen * panfrost_create_screen(int fd, struct renderonly *ro) { - /* Create the screen */ - struct panfrost_screen *screen = rzalloc(NULL, struct panfrost_screen); + /* Create the screen */ + struct panfrost_screen *screen = rzalloc(NULL, struct panfrost_screen); - if (!screen) - return NULL; + if (!screen) + return NULL; - struct panfrost_device *dev = pan_device(&screen->base); + struct panfrost_device *dev = pan_device(&screen->base); - /* Debug must be set first for pandecode to work correctly */ - dev->debug = debug_get_flags_option("PAN_MESA_DEBUG", panfrost_debug_options, 0); - panfrost_open_device(screen, fd, dev); + /* Debug must be set first for pandecode to work correctly */ + dev->debug = + debug_get_flags_option("PAN_MESA_DEBUG", panfrost_debug_options, 0); + panfrost_open_device(screen, fd, dev); - if (dev->debug & PAN_DBG_NO_AFBC) - dev->has_afbc = false; + if (dev->debug & PAN_DBG_NO_AFBC) + dev->has_afbc = false; - /* Bail early on unsupported hardware */ - if (dev->model == NULL) { - debug_printf("panfrost: Unsupported model %X", dev->gpu_id); - panfrost_destroy_screen(&(screen->base)); - return NULL; - } + /* Bail early on unsupported hardware */ + if (dev->model == NULL) { + debug_printf("panfrost: Unsupported model %X", dev->gpu_id); + panfrost_destroy_screen(&(screen->base)); + return NULL; + } - dev->ro = ro; + dev->ro = ro; - screen->base.destroy = panfrost_destroy_screen; + screen->base.destroy = panfrost_destroy_screen; - screen->base.get_name = panfrost_get_name; - screen->base.get_vendor = panfrost_get_vendor; - screen->base.get_device_vendor = panfrost_get_device_vendor; - screen->base.get_driver_query_info = panfrost_get_driver_query_info; - screen->base.get_param = panfrost_get_param; - screen->base.get_shader_param = panfrost_get_shader_param; - screen->base.get_compute_param = panfrost_get_compute_param; - screen->base.get_paramf = panfrost_get_paramf; - screen->base.get_timestamp = u_default_get_timestamp; - screen->base.is_format_supported = panfrost_is_format_supported; - screen->base.query_dmabuf_modifiers = panfrost_query_dmabuf_modifiers; - screen->base.is_dmabuf_modifier_supported = - panfrost_is_dmabuf_modifier_supported; - screen->base.context_create = panfrost_create_context; - screen->base.get_compiler_options = panfrost_screen_get_compiler_options; - screen->base.get_disk_shader_cache = panfrost_get_disk_shader_cache; - screen->base.fence_reference = panfrost_fence_reference; - screen->base.fence_finish = panfrost_fence_finish; - screen->base.fence_get_fd = panfrost_fence_get_fd; - screen->base.set_damage_region = panfrost_resource_set_damage_region; + screen->base.get_name = panfrost_get_name; + screen->base.get_vendor = panfrost_get_vendor; + screen->base.get_device_vendor = panfrost_get_device_vendor; + screen->base.get_driver_query_info = panfrost_get_driver_query_info; + screen->base.get_param = panfrost_get_param; + screen->base.get_shader_param = panfrost_get_shader_param; + screen->base.get_compute_param = panfrost_get_compute_param; + screen->base.get_paramf = panfrost_get_paramf; + screen->base.get_timestamp = u_default_get_timestamp; + screen->base.is_format_supported = panfrost_is_format_supported; + screen->base.query_dmabuf_modifiers = panfrost_query_dmabuf_modifiers; + screen->base.is_dmabuf_modifier_supported = + 
panfrost_is_dmabuf_modifier_supported; + screen->base.context_create = panfrost_create_context; + screen->base.get_compiler_options = panfrost_screen_get_compiler_options; + screen->base.get_disk_shader_cache = panfrost_get_disk_shader_cache; + screen->base.fence_reference = panfrost_fence_reference; + screen->base.fence_finish = panfrost_fence_finish; + screen->base.fence_get_fd = panfrost_fence_get_fd; + screen->base.set_damage_region = panfrost_resource_set_damage_region; - panfrost_resource_screen_init(&screen->base); - pan_blend_shaders_init(dev); + panfrost_resource_screen_init(&screen->base); + pan_blend_shaders_init(dev); - panfrost_disk_cache_init(screen); + panfrost_disk_cache_init(screen); - panfrost_pool_init(&screen->blitter.bin_pool, NULL, dev, PAN_BO_EXECUTE, - 4096, "Blitter shaders", false, true); - panfrost_pool_init(&screen->blitter.desc_pool, NULL, dev, 0, 65536, - "Blitter RSDs", false, true); - if (dev->arch == 4) - panfrost_cmdstream_screen_init_v4(screen); - else if (dev->arch == 5) - panfrost_cmdstream_screen_init_v5(screen); - else if (dev->arch == 6) - panfrost_cmdstream_screen_init_v6(screen); - else if (dev->arch == 7) - panfrost_cmdstream_screen_init_v7(screen); - else if (dev->arch == 9) - panfrost_cmdstream_screen_init_v9(screen); - else - unreachable("Unhandled architecture major"); + panfrost_pool_init(&screen->blitter.bin_pool, NULL, dev, PAN_BO_EXECUTE, + 4096, "Blitter shaders", false, true); + panfrost_pool_init(&screen->blitter.desc_pool, NULL, dev, 0, 65536, + "Blitter RSDs", false, true); + if (dev->arch == 4) + panfrost_cmdstream_screen_init_v4(screen); + else if (dev->arch == 5) + panfrost_cmdstream_screen_init_v5(screen); + else if (dev->arch == 6) + panfrost_cmdstream_screen_init_v6(screen); + else if (dev->arch == 7) + panfrost_cmdstream_screen_init_v7(screen); + else if (dev->arch == 9) + panfrost_cmdstream_screen_init_v9(screen); + else + unreachable("Unhandled architecture major"); - return &screen->base; + return &screen->base; } diff --git a/src/gallium/drivers/panfrost/pan_screen.h b/src/gallium/drivers/panfrost/pan_screen.h index f3f7df41892..f813725d7d7 100644 --- a/src/gallium/drivers/panfrost/pan_screen.h +++ b/src/gallium/drivers/panfrost/pan_screen.h @@ -30,14 +30,14 @@ #define PAN_SCREEN_H #include -#include "pipe/p_screen.h" #include "pipe/p_defines.h" +#include "pipe/p_screen.h" #include "renderonly/renderonly.h" -#include "util/u_dynarray.h" #include "util/bitset.h" -#include "util/set.h" -#include "util/log.h" #include "util/disk_cache.h" +#include "util/log.h" +#include "util/set.h" +#include "util/u_dynarray.h" #include "pan_device.h" #include "pan_mempool.h" @@ -45,7 +45,7 @@ #define PAN_QUERY_DRAW_CALLS (PIPE_QUERY_DRIVER_SPECIFIC + 0) static const struct pipe_driver_query_info panfrost_driver_query_list[] = { - {"draw-calls", PAN_QUERY_DRAW_CALLS, { 0 }}, + {"draw-calls", PAN_QUERY_DRAW_CALLS, {0}}, }; struct panfrost_batch; @@ -58,77 +58,74 @@ struct pan_blend_state; /* Virtual table of per-generation (GenXML) functions */ struct panfrost_vtable { - /* Prepares the renderer state descriptor or shader program descriptor - * for a given compiled shader, and if desired uploads it as well */ - void (*prepare_shader)(struct panfrost_compiled_shader *, - struct panfrost_pool *, bool); + /* Prepares the renderer state descriptor or shader program descriptor + * for a given compiled shader, and if desired uploads it as well */ + void (*prepare_shader)(struct panfrost_compiled_shader *, + struct panfrost_pool *, bool); - /* Emits 
a thread local storage descriptor */ - void (*emit_tls)(struct panfrost_batch *); + /* Emits a thread local storage descriptor */ + void (*emit_tls)(struct panfrost_batch *); - /* Emits a framebuffer descriptor */ - void (*emit_fbd)(struct panfrost_batch *, const struct pan_fb_info *); + /* Emits a framebuffer descriptor */ + void (*emit_fbd)(struct panfrost_batch *, const struct pan_fb_info *); - /* Emits a fragment job */ - mali_ptr (*emit_fragment_job)(struct panfrost_batch *, const struct pan_fb_info *); + /* Emits a fragment job */ + mali_ptr (*emit_fragment_job)(struct panfrost_batch *, + const struct pan_fb_info *); - /* General destructor */ - void (*screen_destroy)(struct pipe_screen *); + /* General destructor */ + void (*screen_destroy)(struct pipe_screen *); - /* Preload framebuffer */ - void (*preload)(struct panfrost_batch *, struct pan_fb_info *); + /* Preload framebuffer */ + void (*preload)(struct panfrost_batch *, struct pan_fb_info *); - /* Initialize a Gallium context */ - void (*context_init)(struct pipe_context *pipe); + /* Initialize a Gallium context */ + void (*context_init)(struct pipe_context *pipe); - /* Device-dependent initialization of a panfrost_batch */ - void (*init_batch)(struct panfrost_batch *batch); + /* Device-dependent initialization of a panfrost_batch */ + void (*init_batch)(struct panfrost_batch *batch); - /* Get blend shader */ - struct pan_blend_shader_variant * - (*get_blend_shader)(const struct panfrost_device *, - const struct pan_blend_state *, - nir_alu_type, nir_alu_type, - unsigned rt); + /* Get blend shader */ + struct pan_blend_shader_variant *(*get_blend_shader)( + const struct panfrost_device *, const struct pan_blend_state *, + nir_alu_type, nir_alu_type, unsigned rt); - /* Initialize the polygon list */ - void (*init_polygon_list)(struct panfrost_batch *); + /* Initialize the polygon list */ + void (*init_polygon_list)(struct panfrost_batch *); - /* Shader compilation methods */ - const nir_shader_compiler_options *(*get_compiler_options)(void); - void (*compile_shader)(nir_shader *s, - struct panfrost_compile_inputs *inputs, - struct util_dynarray *binary, - struct pan_shader_info *info); + /* Shader compilation methods */ + const nir_shader_compiler_options *(*get_compiler_options)(void); + void (*compile_shader)(nir_shader *s, struct panfrost_compile_inputs *inputs, + struct util_dynarray *binary, + struct pan_shader_info *info); }; struct panfrost_screen { - struct pipe_screen base; - struct panfrost_device dev; - struct { - struct panfrost_pool bin_pool; - struct panfrost_pool desc_pool; - } blitter; + struct pipe_screen base; + struct panfrost_device dev; + struct { + struct panfrost_pool bin_pool; + struct panfrost_pool desc_pool; + } blitter; - struct panfrost_vtable vtbl; - struct disk_cache *disk_cache; + struct panfrost_vtable vtbl; + struct disk_cache *disk_cache; }; static inline struct panfrost_screen * pan_screen(struct pipe_screen *p) { - return (struct panfrost_screen *)p; + return (struct panfrost_screen *)p; } static inline struct panfrost_device * pan_device(struct pipe_screen *p) { - return &(pan_screen(p)->dev); + return &(pan_screen(p)->dev); } -int -panfrost_get_driver_query_info(struct pipe_screen *pscreen, unsigned index, - struct pipe_driver_query_info *info); +int panfrost_get_driver_query_info(struct pipe_screen *pscreen, unsigned index, + struct pipe_driver_query_info *info); void panfrost_cmdstream_screen_init_v4(struct panfrost_screen *screen); void panfrost_cmdstream_screen_init_v5(struct 
panfrost_screen *screen); @@ -136,13 +133,13 @@ void panfrost_cmdstream_screen_init_v6(struct panfrost_screen *screen); void panfrost_cmdstream_screen_init_v7(struct panfrost_screen *screen); void panfrost_cmdstream_screen_init_v9(struct panfrost_screen *screen); -#define perf_debug(dev, ...) \ - do { \ - if (unlikely((dev)->debug & PAN_DBG_PERF)) \ - mesa_logw(__VA_ARGS__); \ - } while(0) +#define perf_debug(dev, ...) \ + do { \ + if (unlikely((dev)->debug & PAN_DBG_PERF)) \ + mesa_logw(__VA_ARGS__); \ + } while (0) -#define perf_debug_ctx(ctx, ...) \ - perf_debug(pan_device((ctx)->base.screen), __VA_ARGS__); +#define perf_debug_ctx(ctx, ...) \ + perf_debug(pan_device((ctx)->base.screen), __VA_ARGS__); #endif /* PAN_SCREEN_H */ diff --git a/src/gallium/drivers/panfrost/pan_shader.c b/src/gallium/drivers/panfrost/pan_shader.c index e77343a65ac..e3935651219 100644 --- a/src/gallium/drivers/panfrost/pan_shader.c +++ b/src/gallium/drivers/panfrost/pan_shader.c @@ -28,103 +28,96 @@ * */ -#include "pan_context.h" -#include "pan_bo.h" #include "pan_shader.h" -#include "util/u_memory.h" #include "nir/tgsi_to_nir.h" +#include "util/u_memory.h" #include "nir_serialize.h" +#include "pan_bo.h" +#include "pan_context.h" static struct panfrost_uncompiled_shader * panfrost_alloc_shader(const nir_shader *nir) { - struct panfrost_uncompiled_shader *so = - rzalloc(NULL, struct panfrost_uncompiled_shader); + struct panfrost_uncompiled_shader *so = + rzalloc(NULL, struct panfrost_uncompiled_shader); - simple_mtx_init(&so->lock, mtx_plain); - util_dynarray_init(&so->variants, so); + simple_mtx_init(&so->lock, mtx_plain); + util_dynarray_init(&so->variants, so); - so->nir = nir; + so->nir = nir; - /* Serialize the NIR to a binary blob that we can hash for the disk - * cache. Drop unnecessary information (like variable names) so the - * serialized NIR is smaller, and also to let us detect more isomorphic - * shaders when hashing, increasing cache hits. - */ - struct blob blob; - blob_init(&blob); - nir_serialize(&blob, nir, true); - _mesa_sha1_compute(blob.data, blob.size, so->nir_sha1); - blob_finish(&blob); + /* Serialize the NIR to a binary blob that we can hash for the disk + * cache. Drop unnecessary information (like variable names) so the + * serialized NIR is smaller, and also to let us detect more isomorphic + * shaders when hashing, increasing cache hits. 
+ */ + struct blob blob; + blob_init(&blob); + nir_serialize(&blob, nir, true); + _mesa_sha1_compute(blob.data, blob.size, so->nir_sha1); + blob_finish(&blob); - return so; + return so; } static struct panfrost_compiled_shader * panfrost_alloc_variant(struct panfrost_uncompiled_shader *so) { - return util_dynarray_grow(&so->variants, struct panfrost_compiled_shader, 1); + return util_dynarray_grow(&so->variants, struct panfrost_compiled_shader, 1); } static void -panfrost_shader_compile(struct panfrost_screen *screen, - const nir_shader *ir, +panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir, struct util_debug_callback *dbg, - struct panfrost_shader_key *key, - unsigned req_local_mem, + struct panfrost_shader_key *key, unsigned req_local_mem, unsigned fixed_varying_mask, struct panfrost_shader_binary *out) { - struct panfrost_device *dev = pan_device(&screen->base); + struct panfrost_device *dev = pan_device(&screen->base); - nir_shader *s = nir_shader_clone(NULL, ir); + nir_shader *s = nir_shader_clone(NULL, ir); - struct panfrost_compile_inputs inputs = { - .debug = dbg, - .gpu_id = dev->gpu_id, - .fixed_sysval_ubo = -1, - }; + struct panfrost_compile_inputs inputs = { + .debug = dbg, + .gpu_id = dev->gpu_id, + .fixed_sysval_ubo = -1, + }; - /* Lower this early so the backends don't have to worry about it */ - if (s->info.stage == MESA_SHADER_FRAGMENT) { - inputs.fixed_varying_mask = key->fs.fixed_varying_mask; + /* Lower this early so the backends don't have to worry about it */ + if (s->info.stage == MESA_SHADER_FRAGMENT) { + inputs.fixed_varying_mask = key->fs.fixed_varying_mask; - if (s->info.outputs_written & BITFIELD_BIT(FRAG_RESULT_COLOR)) { - NIR_PASS_V(s, nir_lower_fragcolor, - key->fs.nr_cbufs_for_fragcolor); - } + if (s->info.outputs_written & BITFIELD_BIT(FRAG_RESULT_COLOR)) { + NIR_PASS_V(s, nir_lower_fragcolor, key->fs.nr_cbufs_for_fragcolor); + } - if (key->fs.sprite_coord_enable) { - NIR_PASS_V(s, nir_lower_texcoord_replace, - key->fs.sprite_coord_enable, - true /* point coord is sysval */, - false /* Y-invert */); - } + if (key->fs.sprite_coord_enable) { + NIR_PASS_V(s, nir_lower_texcoord_replace, key->fs.sprite_coord_enable, + true /* point coord is sysval */, false /* Y-invert */); + } - if (key->fs.clip_plane_enable) { - NIR_PASS_V(s, nir_lower_clip_fs, - key->fs.clip_plane_enable, - false); - } + if (key->fs.clip_plane_enable) { + NIR_PASS_V(s, nir_lower_clip_fs, key->fs.clip_plane_enable, false); + } - memcpy(inputs.rt_formats, key->fs.rt_formats, sizeof(inputs.rt_formats)); - } else if (s->info.stage == MESA_SHADER_VERTEX) { - inputs.fixed_varying_mask = fixed_varying_mask; + memcpy(inputs.rt_formats, key->fs.rt_formats, sizeof(inputs.rt_formats)); + } else if (s->info.stage == MESA_SHADER_VERTEX) { + inputs.fixed_varying_mask = fixed_varying_mask; - /* No IDVS for internal XFB shaders */ - inputs.no_idvs = s->info.has_transform_feedback_varyings; - } + /* No IDVS for internal XFB shaders */ + inputs.no_idvs = s->info.has_transform_feedback_varyings; + } - util_dynarray_init(&out->binary, NULL); - screen->vtbl.compile_shader(s, &inputs, &out->binary, &out->info); + util_dynarray_init(&out->binary, NULL); + screen->vtbl.compile_shader(s, &inputs, &out->binary, &out->info); - assert(req_local_mem >= out->info.wls_size); - out->info.wls_size = req_local_mem; + assert(req_local_mem >= out->info.wls_size); + out->info.wls_size = req_local_mem; - /* In both clone and tgsi_to_nir paths, the shader is ralloc'd against - * a NULL context - */ - 
ralloc_free(s); + /* In both clone and tgsi_to_nir paths, the shader is ralloc'd against + * a NULL context + */ + ralloc_free(s); } static void @@ -136,287 +129,288 @@ panfrost_shader_get(struct pipe_screen *pscreen, struct panfrost_compiled_shader *state, unsigned req_local_mem) { - struct panfrost_screen *screen = pan_screen(pscreen); - struct panfrost_device *dev = pan_device(pscreen); + struct panfrost_screen *screen = pan_screen(pscreen); + struct panfrost_device *dev = pan_device(pscreen); - struct panfrost_shader_binary res = { 0 }; + struct panfrost_shader_binary res = {0}; - /* Try to retrieve the variant from the disk cache. If that fails, - * compile a new variant and store in the disk cache for later reuse. - */ - if (!panfrost_disk_cache_retrieve(screen->disk_cache, uncompiled, &state->key, &res)) { - panfrost_shader_compile(screen, uncompiled->nir, dbg, &state->key, - req_local_mem, - uncompiled->fixed_varying_mask, &res); + /* Try to retrieve the variant from the disk cache. If that fails, + * compile a new variant and store in the disk cache for later reuse. + */ + if (!panfrost_disk_cache_retrieve(screen->disk_cache, uncompiled, + &state->key, &res)) { + panfrost_shader_compile(screen, uncompiled->nir, dbg, &state->key, + req_local_mem, uncompiled->fixed_varying_mask, + &res); - panfrost_disk_cache_store(screen->disk_cache, uncompiled, &state->key, &res); - } + panfrost_disk_cache_store(screen->disk_cache, uncompiled, &state->key, + &res); + } - state->info = res.info; + state->info = res.info; - if (res.binary.size) { - state->bin = panfrost_pool_take_ref(shader_pool, - pan_pool_upload_aligned(&shader_pool->base, - res.binary.data, res.binary.size, 128)); - } + if (res.binary.size) { + state->bin = panfrost_pool_take_ref( + shader_pool, + pan_pool_upload_aligned(&shader_pool->base, res.binary.data, + res.binary.size, 128)); + } - util_dynarray_fini(&res.binary); + util_dynarray_fini(&res.binary); - /* Don't upload RSD for fragment shaders since they need draw-time - * merging for e.g. depth/stencil/alpha. RSDs are replaced by simpler - * shader program descriptors on Valhall, which can be preuploaded even - * for fragment shaders. */ - bool upload = !(uncompiled->nir->info.stage == MESA_SHADER_FRAGMENT && dev->arch <= 7); - screen->vtbl.prepare_shader(state, desc_pool, upload); + /* Don't upload RSD for fragment shaders since they need draw-time + * merging for e.g. depth/stencil/alpha. RSDs are replaced by simpler + * shader program descriptors on Valhall, which can be preuploaded even + * for fragment shaders. 
*/ + bool upload = + !(uncompiled->nir->info.stage == MESA_SHADER_FRAGMENT && dev->arch <= 7); + screen->vtbl.prepare_shader(state, desc_pool, upload); - panfrost_analyze_sysvals(state); + panfrost_analyze_sysvals(state); } static void panfrost_build_key(struct panfrost_context *ctx, - struct panfrost_shader_key *key, - const nir_shader *nir) + struct panfrost_shader_key *key, const nir_shader *nir) { - /* We don't currently have vertex shader variants */ - if (nir->info.stage != MESA_SHADER_FRAGMENT) - return; + /* We don't currently have vertex shader variants */ + if (nir->info.stage != MESA_SHADER_FRAGMENT) + return; - struct panfrost_device *dev = pan_device(ctx->base.screen); - struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer; - struct pipe_rasterizer_state *rast = (void *) ctx->rasterizer; - struct panfrost_uncompiled_shader *vs = ctx->uncompiled[MESA_SHADER_VERTEX]; + struct panfrost_device *dev = pan_device(ctx->base.screen); + struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer; + struct pipe_rasterizer_state *rast = (void *)ctx->rasterizer; + struct panfrost_uncompiled_shader *vs = ctx->uncompiled[MESA_SHADER_VERTEX]; - /* gl_FragColor lowering needs the number of colour buffers */ - if (nir->info.outputs_written & BITFIELD_BIT(FRAG_RESULT_COLOR)) { - key->fs.nr_cbufs_for_fragcolor = fb->nr_cbufs; - } + /* gl_FragColor lowering needs the number of colour buffers */ + if (nir->info.outputs_written & BITFIELD_BIT(FRAG_RESULT_COLOR)) { + key->fs.nr_cbufs_for_fragcolor = fb->nr_cbufs; + } - /* Point sprite lowering needed on Bifrost and newer */ - if (dev->arch >= 6 && rast && ctx->active_prim == PIPE_PRIM_POINTS) { - key->fs.sprite_coord_enable = rast->sprite_coord_enable; - } + /* Point sprite lowering needed on Bifrost and newer */ + if (dev->arch >= 6 && rast && ctx->active_prim == PIPE_PRIM_POINTS) { + key->fs.sprite_coord_enable = rast->sprite_coord_enable; + } - /* User clip plane lowering needed everywhere */ - if (rast) { - key->fs.clip_plane_enable = rast->clip_plane_enable; - } + /* User clip plane lowering needed everywhere */ + if (rast) { + key->fs.clip_plane_enable = rast->clip_plane_enable; + } - if (dev->arch <= 5) { - u_foreach_bit(i, (nir->info.outputs_read >> FRAG_RESULT_DATA0)) { - enum pipe_format fmt = PIPE_FORMAT_R8G8B8A8_UNORM; + if (dev->arch <= 5) { + u_foreach_bit(i, (nir->info.outputs_read >> FRAG_RESULT_DATA0)) { + enum pipe_format fmt = PIPE_FORMAT_R8G8B8A8_UNORM; - if ((fb->nr_cbufs > i) && fb->cbufs[i]) - fmt = fb->cbufs[i]->format; + if ((fb->nr_cbufs > i) && fb->cbufs[i]) + fmt = fb->cbufs[i]->format; - if (panfrost_blendable_formats_v6[fmt].internal) - fmt = PIPE_FORMAT_NONE; + if (panfrost_blendable_formats_v6[fmt].internal) + fmt = PIPE_FORMAT_NONE; - key->fs.rt_formats[i] = fmt; - } - } + key->fs.rt_formats[i] = fmt; + } + } - /* Funny desktop GL varying lowering on Valhall */ - if (dev->arch >= 9) { - assert(vs != NULL && "too early"); - key->fs.fixed_varying_mask = vs->fixed_varying_mask; - } + /* Funny desktop GL varying lowering on Valhall */ + if (dev->arch >= 9) { + assert(vs != NULL && "too early"); + key->fs.fixed_varying_mask = vs->fixed_varying_mask; + } } static struct panfrost_compiled_shader * -panfrost_new_variant_locked( - struct panfrost_context *ctx, - struct panfrost_uncompiled_shader *uncompiled, - struct panfrost_shader_key *key) +panfrost_new_variant_locked(struct panfrost_context *ctx, + struct panfrost_uncompiled_shader *uncompiled, + struct panfrost_shader_key *key) { - struct panfrost_compiled_shader 
*prog = panfrost_alloc_variant(uncompiled); + struct panfrost_compiled_shader *prog = panfrost_alloc_variant(uncompiled); - *prog = (struct panfrost_compiled_shader) { - .key = *key, - .stream_output = uncompiled->stream_output, - }; + *prog = (struct panfrost_compiled_shader){ + .key = *key, + .stream_output = uncompiled->stream_output, + }; - panfrost_shader_get(ctx->base.screen, &ctx->shaders, &ctx->descs, - uncompiled, &ctx->base.debug, prog, 0); + panfrost_shader_get(ctx->base.screen, &ctx->shaders, &ctx->descs, uncompiled, + &ctx->base.debug, prog, 0); - prog->earlyzs = pan_earlyzs_analyze(&prog->info); + prog->earlyzs = pan_earlyzs_analyze(&prog->info); - return prog; + return prog; } static void -panfrost_bind_shader_state( - struct pipe_context *pctx, - void *hwcso, - enum pipe_shader_type type) +panfrost_bind_shader_state(struct pipe_context *pctx, void *hwcso, + enum pipe_shader_type type) { - struct panfrost_context *ctx = pan_context(pctx); - ctx->uncompiled[type] = hwcso; - ctx->prog[type] = NULL; + struct panfrost_context *ctx = pan_context(pctx); + ctx->uncompiled[type] = hwcso; + ctx->prog[type] = NULL; - ctx->dirty |= PAN_DIRTY_TLS_SIZE; - ctx->dirty_shader[type] |= PAN_DIRTY_STAGE_SHADER; + ctx->dirty |= PAN_DIRTY_TLS_SIZE; + ctx->dirty_shader[type] |= PAN_DIRTY_STAGE_SHADER; - if (hwcso) - panfrost_update_shader_variant(ctx, type); + if (hwcso) + panfrost_update_shader_variant(ctx, type); } void panfrost_update_shader_variant(struct panfrost_context *ctx, enum pipe_shader_type type) { - /* No shader variants for compute */ - if (type == PIPE_SHADER_COMPUTE) - return; + /* No shader variants for compute */ + if (type == PIPE_SHADER_COMPUTE) + return; - /* We need linking information, defer this */ - if (type == PIPE_SHADER_FRAGMENT && !ctx->uncompiled[PIPE_SHADER_VERTEX]) - return; + /* We need linking information, defer this */ + if (type == PIPE_SHADER_FRAGMENT && !ctx->uncompiled[PIPE_SHADER_VERTEX]) + return; - /* Also defer, happens with GALLIUM_HUD */ - if (!ctx->uncompiled[type]) - return; + /* Also defer, happens with GALLIUM_HUD */ + if (!ctx->uncompiled[type]) + return; - /* Match the appropriate variant */ - struct panfrost_uncompiled_shader *uncompiled = ctx->uncompiled[type]; - struct panfrost_compiled_shader *compiled = NULL; + /* Match the appropriate variant */ + struct panfrost_uncompiled_shader *uncompiled = ctx->uncompiled[type]; + struct panfrost_compiled_shader *compiled = NULL; - simple_mtx_lock(&uncompiled->lock); + simple_mtx_lock(&uncompiled->lock); - struct panfrost_shader_key key = { 0 }; - panfrost_build_key(ctx, &key, uncompiled->nir); + struct panfrost_shader_key key = {0}; + panfrost_build_key(ctx, &key, uncompiled->nir); - util_dynarray_foreach(&uncompiled->variants, struct panfrost_compiled_shader, so) { - if (memcmp(&key, &so->key, sizeof(key)) == 0) { - compiled = so; - break; - } - } + util_dynarray_foreach(&uncompiled->variants, struct panfrost_compiled_shader, + so) { + if (memcmp(&key, &so->key, sizeof(key)) == 0) { + compiled = so; + break; + } + } - if (compiled == NULL) - compiled = panfrost_new_variant_locked(ctx, uncompiled, &key); + if (compiled == NULL) + compiled = panfrost_new_variant_locked(ctx, uncompiled, &key); - ctx->prog[type] = compiled; + ctx->prog[type] = compiled; - /* TODO: it would be more efficient to release the lock before - * compiling instead of after, but that can race if thread A compiles a - * variant while thread B searches for that same variant */ - simple_mtx_unlock(&uncompiled->lock); + /* TODO: it 
would be more efficient to release the lock before + * compiling instead of after, but that can race if thread A compiles a + * variant while thread B searches for that same variant */ + simple_mtx_unlock(&uncompiled->lock); } static void panfrost_bind_vs_state(struct pipe_context *pctx, void *hwcso) { - panfrost_bind_shader_state(pctx, hwcso, PIPE_SHADER_VERTEX); + panfrost_bind_shader_state(pctx, hwcso, PIPE_SHADER_VERTEX); - /* Fragment shaders are linked with vertex shaders */ - struct panfrost_context *ctx = pan_context(pctx); - panfrost_update_shader_variant(ctx, PIPE_SHADER_FRAGMENT); + /* Fragment shaders are linked with vertex shaders */ + struct panfrost_context *ctx = pan_context(pctx); + panfrost_update_shader_variant(ctx, PIPE_SHADER_FRAGMENT); } static void panfrost_bind_fs_state(struct pipe_context *pctx, void *hwcso) { - panfrost_bind_shader_state(pctx, hwcso, PIPE_SHADER_FRAGMENT); + panfrost_bind_shader_state(pctx, hwcso, PIPE_SHADER_FRAGMENT); } static void * -panfrost_create_shader_state( - struct pipe_context *pctx, - const struct pipe_shader_state *cso) +panfrost_create_shader_state(struct pipe_context *pctx, + const struct pipe_shader_state *cso) { - nir_shader *nir = (cso->type == PIPE_SHADER_IR_TGSI) ? - tgsi_to_nir(cso->tokens, pctx->screen, false) : - cso->ir.nir; + nir_shader *nir = (cso->type == PIPE_SHADER_IR_TGSI) + ? tgsi_to_nir(cso->tokens, pctx->screen, false) + : cso->ir.nir; - struct panfrost_uncompiled_shader *so = panfrost_alloc_shader(nir); + struct panfrost_uncompiled_shader *so = panfrost_alloc_shader(nir); - /* The driver gets ownership of the nir_shader for graphics. The NIR is - * ralloc'd. Free the NIR when we free the uncompiled shader. - */ - ralloc_steal(so, nir); + /* The driver gets ownership of the nir_shader for graphics. The NIR is + * ralloc'd. Free the NIR when we free the uncompiled shader. + */ + ralloc_steal(so, nir); - so->stream_output = cso->stream_output; - so->nir = nir; + so->stream_output = cso->stream_output; + so->nir = nir; - /* Fix linkage early */ - if (so->nir->info.stage == MESA_SHADER_VERTEX) { - so->fixed_varying_mask = - (so->nir->info.outputs_written & BITFIELD_MASK(VARYING_SLOT_VAR0)) & - ~VARYING_BIT_POS & ~VARYING_BIT_PSIZ; - } + /* Fix linkage early */ + if (so->nir->info.stage == MESA_SHADER_VERTEX) { + so->fixed_varying_mask = + (so->nir->info.outputs_written & BITFIELD_MASK(VARYING_SLOT_VAR0)) & + ~VARYING_BIT_POS & ~VARYING_BIT_PSIZ; + } - /* If this shader uses transform feedback, compile the transform - * feedback program. This is a special shader variant. - */ - struct panfrost_context *ctx = pan_context(pctx); + /* If this shader uses transform feedback, compile the transform + * feedback program. This is a special shader variant. 
+ */ + struct panfrost_context *ctx = pan_context(pctx); - if (so->nir->xfb_info) { - nir_shader *xfb = nir_shader_clone(NULL, so->nir); - xfb->info.name = ralloc_asprintf(xfb, "%s@xfb", xfb->info.name); - xfb->info.internal = true; + if (so->nir->xfb_info) { + nir_shader *xfb = nir_shader_clone(NULL, so->nir); + xfb->info.name = ralloc_asprintf(xfb, "%s@xfb", xfb->info.name); + xfb->info.internal = true; - so->xfb = calloc(1, sizeof(struct panfrost_compiled_shader)); - so->xfb->key.vs_is_xfb = true; + so->xfb = calloc(1, sizeof(struct panfrost_compiled_shader)); + so->xfb->key.vs_is_xfb = true; - panfrost_shader_get(ctx->base.screen, &ctx->shaders, &ctx->descs, - so, &ctx->base.debug, so->xfb, 0); + panfrost_shader_get(ctx->base.screen, &ctx->shaders, &ctx->descs, so, + &ctx->base.debug, so->xfb, 0); - /* Since transform feedback is handled via the transform - * feedback program, the original program no longer uses XFB - */ - nir->info.has_transform_feedback_varyings = false; - } + /* Since transform feedback is handled via the transform + * feedback program, the original program no longer uses XFB + */ + nir->info.has_transform_feedback_varyings = false; + } - /* Compile the program. We don't use vertex shader keys, so there will - * be no further vertex shader variants. We do have fragment shader - * keys, but we can still compile with a default key that will work most - * of the time. - */ - struct panfrost_shader_key key = { 0 }; + /* Compile the program. We don't use vertex shader keys, so there will + * be no further vertex shader variants. We do have fragment shader + * keys, but we can still compile with a default key that will work most + * of the time. + */ + struct panfrost_shader_key key = {0}; - /* gl_FragColor lowering needs the number of colour buffers on desktop - * GL, where it acts as an implicit broadcast to all colour buffers. - * - * However, gl_FragColor is a legacy feature, so assume that if - * gl_FragColor is used, there is only a single render target. The - * implicit broadcast is neither especially useful nor required by GLES. - */ - if (so->nir->info.stage == MESA_SHADER_FRAGMENT && - so->nir->info.outputs_written & BITFIELD_BIT(FRAG_RESULT_COLOR)) { + /* gl_FragColor lowering needs the number of colour buffers on desktop + * GL, where it acts as an implicit broadcast to all colour buffers. + * + * However, gl_FragColor is a legacy feature, so assume that if + * gl_FragColor is used, there is only a single render target. The + * implicit broadcast is neither especially useful nor required by GLES. + */ + if (so->nir->info.stage == MESA_SHADER_FRAGMENT && + so->nir->info.outputs_written & BITFIELD_BIT(FRAG_RESULT_COLOR)) { - key.fs.nr_cbufs_for_fragcolor = 1; - } + key.fs.nr_cbufs_for_fragcolor = 1; + } - /* Creating a CSO is single-threaded, so it's ok to use the - * locked function without explicitly taking the lock. Creating a - * default variant acts as a precompile. - */ - panfrost_new_variant_locked(ctx, so, &key); + /* Creating a CSO is single-threaded, so it's ok to use the + * locked function without explicitly taking the lock. Creating a + * default variant acts as a precompile. 
+ */ + panfrost_new_variant_locked(ctx, so, &key); - return so; + return so; } static void panfrost_delete_shader_state(struct pipe_context *pctx, void *so) { - struct panfrost_uncompiled_shader *cso = (struct panfrost_uncompiled_shader *) so; + struct panfrost_uncompiled_shader *cso = + (struct panfrost_uncompiled_shader *)so; - util_dynarray_foreach(&cso->variants, struct panfrost_compiled_shader, so) { - panfrost_bo_unreference(so->bin.bo); - panfrost_bo_unreference(so->state.bo); - panfrost_bo_unreference(so->linkage.bo); - } + util_dynarray_foreach(&cso->variants, struct panfrost_compiled_shader, so) { + panfrost_bo_unreference(so->bin.bo); + panfrost_bo_unreference(so->state.bo); + panfrost_bo_unreference(so->linkage.bo); + } - if (cso->xfb) { - panfrost_bo_unreference(cso->xfb->bin.bo); - panfrost_bo_unreference(cso->xfb->state.bo); - panfrost_bo_unreference(cso->xfb->linkage.bo); - free(cso->xfb); - } + if (cso->xfb) { + panfrost_bo_unreference(cso->xfb->bin.bo); + panfrost_bo_unreference(cso->xfb->state.bo); + panfrost_bo_unreference(cso->xfb->linkage.bo); + free(cso->xfb); + } - simple_mtx_destroy(&cso->lock); + simple_mtx_destroy(&cso->lock); - ralloc_free(so); + ralloc_free(so); } /* @@ -424,52 +418,51 @@ panfrost_delete_shader_state(struct pipe_context *pctx, void *so) * precompiled, creating both the uncompiled and compiled shaders now. */ static void * -panfrost_create_compute_state( - struct pipe_context *pctx, - const struct pipe_compute_state *cso) +panfrost_create_compute_state(struct pipe_context *pctx, + const struct pipe_compute_state *cso) { - struct panfrost_context *ctx = pan_context(pctx); - struct panfrost_uncompiled_shader *so = panfrost_alloc_shader(cso->prog); - struct panfrost_compiled_shader *v = panfrost_alloc_variant(so); - memset(v, 0, sizeof *v); + struct panfrost_context *ctx = pan_context(pctx); + struct panfrost_uncompiled_shader *so = panfrost_alloc_shader(cso->prog); + struct panfrost_compiled_shader *v = panfrost_alloc_variant(so); + memset(v, 0, sizeof *v); - assert(cso->ir_type == PIPE_SHADER_IR_NIR && "TGSI kernels unsupported"); + assert(cso->ir_type == PIPE_SHADER_IR_NIR && "TGSI kernels unsupported"); - panfrost_shader_get(pctx->screen, &ctx->shaders, &ctx->descs, - so, &ctx->base.debug, v, cso->static_shared_mem); + panfrost_shader_get(pctx->screen, &ctx->shaders, &ctx->descs, so, + &ctx->base.debug, v, cso->static_shared_mem); - /* The NIR becomes invalid after this. For compute kernels, we never - * need to access it again. Don't keep a dangling pointer around. - */ - so->nir = NULL; + /* The NIR becomes invalid after this. For compute kernels, we never + * need to access it again. Don't keep a dangling pointer around. + */ + so->nir = NULL; - return so; + return so; } static void panfrost_bind_compute_state(struct pipe_context *pipe, void *cso) { - struct panfrost_context *ctx = pan_context(pipe); - struct panfrost_uncompiled_shader *uncompiled = cso; + struct panfrost_context *ctx = pan_context(pipe); + struct panfrost_uncompiled_shader *uncompiled = cso; - ctx->uncompiled[PIPE_SHADER_COMPUTE] = uncompiled; + ctx->uncompiled[PIPE_SHADER_COMPUTE] = uncompiled; - ctx->prog[PIPE_SHADER_COMPUTE] = - uncompiled ? util_dynarray_begin(&uncompiled->variants) : NULL; + ctx->prog[PIPE_SHADER_COMPUTE] = + uncompiled ? 
util_dynarray_begin(&uncompiled->variants) : NULL; } void panfrost_shader_context_init(struct pipe_context *pctx) { - pctx->create_vs_state = panfrost_create_shader_state; - pctx->delete_vs_state = panfrost_delete_shader_state; - pctx->bind_vs_state = panfrost_bind_vs_state; + pctx->create_vs_state = panfrost_create_shader_state; + pctx->delete_vs_state = panfrost_delete_shader_state; + pctx->bind_vs_state = panfrost_bind_vs_state; - pctx->create_fs_state = panfrost_create_shader_state; - pctx->delete_fs_state = panfrost_delete_shader_state; - pctx->bind_fs_state = panfrost_bind_fs_state; + pctx->create_fs_state = panfrost_create_shader_state; + pctx->delete_fs_state = panfrost_delete_shader_state; + pctx->bind_fs_state = panfrost_bind_fs_state; - pctx->create_compute_state = panfrost_create_compute_state; - pctx->bind_compute_state = panfrost_bind_compute_state; - pctx->delete_compute_state = panfrost_delete_shader_state; + pctx->create_compute_state = panfrost_create_compute_state; + pctx->bind_compute_state = panfrost_bind_compute_state; + pctx->delete_compute_state = panfrost_delete_shader_state; } diff --git a/src/panfrost/bifrost/bi_helper_invocations.c b/src/panfrost/bifrost/bi_helper_invocations.c index f5207d9afa8..f266cbf172f 100644 --- a/src/panfrost/bifrost/bi_helper_invocations.c +++ b/src/panfrost/bifrost/bi_helper_invocations.c @@ -64,20 +64,20 @@ static bool bi_has_skip_bit(enum bi_opcode op) { - switch (op) { - case BI_OPCODE_TEX_SINGLE: - case BI_OPCODE_TEXC: - case BI_OPCODE_TEXC_DUAL: - case BI_OPCODE_TEXS_2D_F16: - case BI_OPCODE_TEXS_2D_F32: - case BI_OPCODE_TEXS_CUBE_F16: - case BI_OPCODE_TEXS_CUBE_F32: - case BI_OPCODE_VAR_TEX_F16: - case BI_OPCODE_VAR_TEX_F32: - return true; - default: - return false; - } + switch (op) { + case BI_OPCODE_TEX_SINGLE: + case BI_OPCODE_TEXC: + case BI_OPCODE_TEXC_DUAL: + case BI_OPCODE_TEXS_2D_F16: + case BI_OPCODE_TEXS_2D_F32: + case BI_OPCODE_TEXS_CUBE_F16: + case BI_OPCODE_TEXS_CUBE_F32: + case BI_OPCODE_VAR_TEX_F16: + case BI_OPCODE_VAR_TEX_F32: + return true; + default: + return false; + } } /* Does a given instruction require helper threads to be active (because it @@ -87,52 +87,52 @@ bi_has_skip_bit(enum bi_opcode op) bool bi_instr_uses_helpers(bi_instr *I) { - switch (I->op) { - case BI_OPCODE_TEXC: - case BI_OPCODE_TEXC_DUAL: - case BI_OPCODE_TEXS_2D_F16: - case BI_OPCODE_TEXS_2D_F32: - case BI_OPCODE_TEXS_CUBE_F16: - case BI_OPCODE_TEXS_CUBE_F32: - case BI_OPCODE_VAR_TEX_F16: - case BI_OPCODE_VAR_TEX_F32: - return !I->lod_mode; /* set for zero, clear for computed */ - case BI_OPCODE_TEX_SINGLE: - return (I->va_lod_mode == BI_VA_LOD_MODE_COMPUTED_LOD) || - (I->va_lod_mode == BI_VA_LOD_MODE_COMPUTED_BIAS); - case BI_OPCODE_CLPER_I32: - case BI_OPCODE_CLPER_OLD_I32: - /* Fragment shaders require helpers to implement derivatives. - * Other shader stages don't have helpers at all */ - return true; - default: - return false; - } + switch (I->op) { + case BI_OPCODE_TEXC: + case BI_OPCODE_TEXC_DUAL: + case BI_OPCODE_TEXS_2D_F16: + case BI_OPCODE_TEXS_2D_F32: + case BI_OPCODE_TEXS_CUBE_F16: + case BI_OPCODE_TEXS_CUBE_F32: + case BI_OPCODE_VAR_TEX_F16: + case BI_OPCODE_VAR_TEX_F32: + return !I->lod_mode; /* set for zero, clear for computed */ + case BI_OPCODE_TEX_SINGLE: + return (I->va_lod_mode == BI_VA_LOD_MODE_COMPUTED_LOD) || + (I->va_lod_mode == BI_VA_LOD_MODE_COMPUTED_BIAS); + case BI_OPCODE_CLPER_I32: + case BI_OPCODE_CLPER_OLD_I32: + /* Fragment shaders require helpers to implement derivatives. 
+ * Other shader stages don't have helpers at all */ + return true; + default: + return false; + } } /* Does a block use helpers directly */ static bool bi_block_uses_helpers(bi_block *block) { - bi_foreach_instr_in_block(block, I) { - if (bi_instr_uses_helpers(I)) - return true; - } + bi_foreach_instr_in_block(block, I) { + if (bi_instr_uses_helpers(I)) + return true; + } - return false; + return false; } bool bi_block_terminates_helpers(bi_block *block) { - /* Can't terminate if a successor needs helpers */ - bi_foreach_successor(block, succ) { - if (succ->pass_flags & 1) - return false; - } + /* Can't terminate if a successor needs helpers */ + bi_foreach_successor(block, succ) { + if (succ->pass_flags & 1) + return false; + } - /* Otherwise we terminate */ - return true; + /* Otherwise we terminate */ + return true; } /* @@ -142,128 +142,130 @@ bi_block_terminates_helpers(bi_block *block) static void bi_propagate_pass_flag(bi_block *block) { - block->pass_flags = 1; + block->pass_flags = 1; - bi_foreach_predecessor(block, pred) { - if ((*pred)->pass_flags == 0) - bi_propagate_pass_flag(*pred); - } + bi_foreach_predecessor(block, pred) { + if ((*pred)->pass_flags == 0) + bi_propagate_pass_flag(*pred); + } } void bi_analyze_helper_terminate(bi_context *ctx) { - /* Other shader stages do not have a notion of helper threads, so we - * can skip the analysis. Don't run for blend shaders, either, since - * they run in the context of another shader that we don't see. */ - if (ctx->stage != MESA_SHADER_FRAGMENT || ctx->inputs->is_blend) - return; + /* Other shader stages do not have a notion of helper threads, so we + * can skip the analysis. Don't run for blend shaders, either, since + * they run in the context of another shader that we don't see. */ + if (ctx->stage != MESA_SHADER_FRAGMENT || ctx->inputs->is_blend) + return; - /* Clear flags */ - bi_foreach_block(ctx, block) - block->pass_flags = 0; + /* Clear flags */ + bi_foreach_block(ctx, block) + block->pass_flags = 0; - /* For each block, check if it uses helpers and propagate that fact if - * so. We walk in reverse order to minimize the number of blocks tested: - * if the (unique) last block uses helpers, only that block is tested. - */ - bi_foreach_block_rev(ctx, block) { - if (block->pass_flags == 0 && bi_block_uses_helpers(block)) - bi_propagate_pass_flag(block); - } + /* For each block, check if it uses helpers and propagate that fact if + * so. We walk in reverse order to minimize the number of blocks tested: + * if the (unique) last block uses helpers, only that block is tested. 
+ */ + bi_foreach_block_rev(ctx, block) { + if (block->pass_flags == 0 && bi_block_uses_helpers(block)) + bi_propagate_pass_flag(block); + } } void bi_mark_clauses_td(bi_context *ctx) { - if (ctx->stage != MESA_SHADER_FRAGMENT || ctx->inputs->is_blend) - return; + if (ctx->stage != MESA_SHADER_FRAGMENT || ctx->inputs->is_blend) + return; - /* Finally, mark clauses requiring helpers */ - bi_foreach_block(ctx, block) { - /* At the end, there are helpers iff we don't terminate */ - bool helpers = !bi_block_terminates_helpers(block); + /* Finally, mark clauses requiring helpers */ + bi_foreach_block(ctx, block) { + /* At the end, there are helpers iff we don't terminate */ + bool helpers = !bi_block_terminates_helpers(block); - bi_foreach_clause_in_block_rev(block, clause) { - bi_foreach_instr_in_clause_rev(block, clause, I) { - helpers |= bi_instr_uses_helpers(I); - } + bi_foreach_clause_in_block_rev(block, clause) { + bi_foreach_instr_in_clause_rev(block, clause, I) { + helpers |= bi_instr_uses_helpers(I); + } - clause->td = !helpers; - } - } + clause->td = !helpers; + } + } } static bool bi_helper_block_update(BITSET_WORD *deps, bi_block *block) { - bool progress = false; + bool progress = false; - bi_foreach_instr_in_block_rev(block, I) { - /* If a destination is required by helper invocation... */ - bi_foreach_dest(I, d) { - if (!BITSET_TEST(deps, I->dest[d].value)) - continue; + bi_foreach_instr_in_block_rev(block, I) { + /* If a destination is required by helper invocation... */ + bi_foreach_dest(I, d) { + if (!BITSET_TEST(deps, I->dest[d].value)) + continue; - /* ...so are the sources */ - bi_foreach_ssa_src(I, s) { - progress |= !BITSET_TEST(deps, I->src[s].value); - BITSET_SET(deps, I->src[s].value); - } + /* ...so are the sources */ + bi_foreach_ssa_src(I, s) { + progress |= !BITSET_TEST(deps, I->src[s].value); + BITSET_SET(deps, I->src[s].value); + } - break; - } - } + break; + } + } - return progress; + return progress; } void bi_analyze_helper_requirements(bi_context *ctx) { - BITSET_WORD *deps = calloc(sizeof(BITSET_WORD), ctx->ssa_alloc); + BITSET_WORD *deps = calloc(sizeof(BITSET_WORD), ctx->ssa_alloc); - /* Initialize with the sources of instructions consuming - * derivatives */ + /* Initialize with the sources of instructions consuming + * derivatives */ - bi_foreach_instr_global(ctx, I) { - if (!bi_instr_uses_helpers(I)) continue; + bi_foreach_instr_global(ctx, I) { + if (!bi_instr_uses_helpers(I)) + continue; - bi_foreach_ssa_src(I, s) - BITSET_SET(deps, I->src[s].value); - } + bi_foreach_ssa_src(I, s) + BITSET_SET(deps, I->src[s].value); + } - /* Propagate that up */ - u_worklist worklist; - bi_worklist_init(ctx, &worklist); + /* Propagate that up */ + u_worklist worklist; + bi_worklist_init(ctx, &worklist); - bi_foreach_block(ctx, block) { - bi_worklist_push_tail(&worklist, block); - } + bi_foreach_block(ctx, block) { + bi_worklist_push_tail(&worklist, block); + } - while (!u_worklist_is_empty(&worklist)) { - bi_block *blk = bi_worklist_pop_tail(&worklist); + while (!u_worklist_is_empty(&worklist)) { + bi_block *blk = bi_worklist_pop_tail(&worklist); - if (bi_helper_block_update(deps, blk)) { - bi_foreach_predecessor(blk, pred) - bi_worklist_push_head(&worklist, *pred); - } - } + if (bi_helper_block_update(deps, blk)) { + bi_foreach_predecessor(blk, pred) + bi_worklist_push_head(&worklist, *pred); + } + } - u_worklist_fini(&worklist); + u_worklist_fini(&worklist); - /* Set the execute bits */ + /* Set the execute bits */ - bi_foreach_instr_global(ctx, I) { - if 
(!bi_has_skip_bit(I->op)) continue; + bi_foreach_instr_global(ctx, I) { + if (!bi_has_skip_bit(I->op)) + continue; - bool exec = false; + bool exec = false; - bi_foreach_dest(I, d) - exec |= BITSET_TEST(deps, I->dest[d].value); + bi_foreach_dest(I, d) + exec |= BITSET_TEST(deps, I->dest[d].value); - I->skip = !exec; - } + I->skip = !exec; + } - free(deps); + free(deps); } diff --git a/src/panfrost/bifrost/bi_layout.c b/src/panfrost/bifrost/bi_layout.c index 7c034cb31be..e90a3603d8d 100644 --- a/src/panfrost/bifrost/bi_layout.c +++ b/src/panfrost/bifrost/bi_layout.c @@ -37,10 +37,8 @@ bool bi_ec0_packed(unsigned tuple_count) { - return (tuple_count == 3) || - (tuple_count == 5) || - (tuple_count == 6) || - (tuple_count == 8); + return (tuple_count == 3) || (tuple_count == 5) || (tuple_count == 6) || + (tuple_count == 8); } /* Helper to calculate the number of quadwords in a clause. This is a function @@ -60,7 +58,7 @@ bi_ec0_packed(unsigned tuple_count) * 6 | 5* * 7 | 5 * 8 | 6* - * + * * Y = { X if X <= 3 * { X - 1 if 4 <= X <= 6 * { X - 2 if 7 <= X <= 8 @@ -72,15 +70,15 @@ bi_ec0_packed(unsigned tuple_count) static unsigned bi_clause_quadwords(bi_clause *clause) { - unsigned X = clause->tuple_count; - unsigned Y = X - ((X >= 7) ? 2 : (X >= 4) ? 1 : 0); + unsigned X = clause->tuple_count; + unsigned Y = X - ((X >= 7) ? 2 : (X >= 4) ? 1 : 0); - unsigned constants = clause->constant_count; + unsigned constants = clause->constant_count; - if ((X != 4) && (X != 7) && (X >= 3) && constants) - constants--; + if ((X != 4) && (X != 7) && (X >= 3) && constants) + constants--; - return Y + DIV_ROUND_UP(constants, 2); + return Y + DIV_ROUND_UP(constants, 2); } /* Measures the number of quadwords a branch jumps. Bifrost relative offsets @@ -90,62 +88,62 @@ bi_clause_quadwords(bi_clause *clause) signed bi_block_offset(bi_context *ctx, bi_clause *start, bi_block *target) { - /* Signed since we might jump backwards */ - signed ret = 0; + /* Signed since we might jump backwards */ + signed ret = 0; - /* Determine if the block we're branching to is strictly greater in - * source order */ - bool forwards = target->index > start->block->index; + /* Determine if the block we're branching to is strictly greater in + * source order */ + bool forwards = target->index > start->block->index; - if (forwards) { - /* We have to jump through this block from the start of this - * clause to the end */ - bi_foreach_clause_in_block_from(start->block, clause, start) { - ret += bi_clause_quadwords(clause); - } + if (forwards) { + /* We have to jump through this block from the start of this + * clause to the end */ + bi_foreach_clause_in_block_from(start->block, clause, start) { + ret += bi_clause_quadwords(clause); + } - /* We then need to jump through every clause of every following - * block until the target */ - bi_foreach_block_from(ctx, start->block, blk) { - /* Don't double-count the first block */ - if (blk == start->block) - continue; + /* We then need to jump through every clause of every following + * block until the target */ + bi_foreach_block_from(ctx, start->block, blk) { + /* Don't double-count the first block */ + if (blk == start->block) + continue; - /* End just before the target */ - if (blk == target) - break; + /* End just before the target */ + if (blk == target) + break; - /* Count every clause in the block */ - bi_foreach_clause_in_block(blk, clause) { - ret += bi_clause_quadwords(clause); - } - } - } else { - /* We start at the beginning of the clause but have to jump - * through the clauses before 
us in the block */ - bi_foreach_clause_in_block_from_rev(start->block, clause, start) { - if (clause == start) - continue; + /* Count every clause in the block */ + bi_foreach_clause_in_block(blk, clause) { + ret += bi_clause_quadwords(clause); + } + } + } else { + /* We start at the beginning of the clause but have to jump + * through the clauses before us in the block */ + bi_foreach_clause_in_block_from_rev(start->block, clause, start) { + if (clause == start) + continue; - ret -= bi_clause_quadwords(clause); - } + ret -= bi_clause_quadwords(clause); + } - /* And jump back every clause of preceding blocks up through - * and including the target to get to the beginning of the - * target */ - bi_foreach_block_from_rev(ctx, start->block, blk) { - if (blk == start->block) - continue; + /* And jump back every clause of preceding blocks up through + * and including the target to get to the beginning of the + * target */ + bi_foreach_block_from_rev(ctx, start->block, blk) { + if (blk == start->block) + continue; - bi_foreach_clause_in_block(blk, clause) { - ret -= bi_clause_quadwords(clause); - } + bi_foreach_clause_in_block(blk, clause) { + ret -= bi_clause_quadwords(clause); + } - /* End just after the target */ - if (blk == target) - break; - } - } + /* End just after the target */ + if (blk == target) + break; + } + } - return ret; + return ret; } diff --git a/src/panfrost/bifrost/bi_liveness.c b/src/panfrost/bifrost/bi_liveness.c index 1dc759e5911..52e0450877b 100644 --- a/src/panfrost/bifrost/bi_liveness.c +++ b/src/panfrost/bifrost/bi_liveness.c @@ -23,98 +23,100 @@ * SOFTWARE. */ -#include "compiler.h" #include "util/u_memory.h" +#include "compiler.h" void bi_liveness_ins_update_ssa(BITSET_WORD *live, const bi_instr *I) { - bi_foreach_dest(I, d) - BITSET_CLEAR(live, I->dest[d].value); + bi_foreach_dest(I, d) + BITSET_CLEAR(live, I->dest[d].value); - bi_foreach_ssa_src(I, s) - BITSET_SET(live, I->src[s].value); + bi_foreach_ssa_src(I, s) + BITSET_SET(live, I->src[s].value); } void bi_compute_liveness_ssa(bi_context *ctx) { - u_worklist worklist; - u_worklist_init(&worklist, ctx->num_blocks, NULL); + u_worklist worklist; + u_worklist_init(&worklist, ctx->num_blocks, NULL); - /* Free any previous liveness, and allocate */ - unsigned words = BITSET_WORDS(ctx->ssa_alloc); + /* Free any previous liveness, and allocate */ + unsigned words = BITSET_WORDS(ctx->ssa_alloc); - bi_foreach_block(ctx, block) { - if (block->ssa_live_in) - ralloc_free(block->ssa_live_in); + bi_foreach_block(ctx, block) { + if (block->ssa_live_in) + ralloc_free(block->ssa_live_in); - if (block->ssa_live_out) - ralloc_free(block->ssa_live_out); + if (block->ssa_live_out) + ralloc_free(block->ssa_live_out); - block->ssa_live_in = rzalloc_array(block, BITSET_WORD, words); - block->ssa_live_out = rzalloc_array(block, BITSET_WORD, words); + block->ssa_live_in = rzalloc_array(block, BITSET_WORD, words); + block->ssa_live_out = rzalloc_array(block, BITSET_WORD, words); - bi_worklist_push_head(&worklist, block); - } + bi_worklist_push_head(&worklist, block); + } - /* Iterate the work list */ - while(!u_worklist_is_empty(&worklist)) { - /* Pop in reverse order since liveness is a backwards pass */ - bi_block *blk = bi_worklist_pop_head(&worklist); + /* Iterate the work list */ + while (!u_worklist_is_empty(&worklist)) { + /* Pop in reverse order since liveness is a backwards pass */ + bi_block *blk = bi_worklist_pop_head(&worklist); - /* Update its liveness information */ - memcpy(blk->ssa_live_in, blk->ssa_live_out, words * 
sizeof(BITSET_WORD)); + /* Update its liveness information */ + memcpy(blk->ssa_live_in, blk->ssa_live_out, words * sizeof(BITSET_WORD)); - bi_foreach_instr_in_block_rev(blk, I) { - /* Phi nodes are handled separately, so we skip them. As phi nodes are - * at the beginning and we're iterating backwards, we stop as soon as - * we hit a phi node. - */ - if (I->op == BI_OPCODE_PHI) - break; + bi_foreach_instr_in_block_rev(blk, I) { + /* Phi nodes are handled separately, so we skip them. As phi nodes are + * at the beginning and we're iterating backwards, we stop as soon as + * we hit a phi node. + */ + if (I->op == BI_OPCODE_PHI) + break; - bi_liveness_ins_update_ssa(blk->ssa_live_in, I); - } + bi_liveness_ins_update_ssa(blk->ssa_live_in, I); + } - /* Propagate the live in of the successor (blk) to the live out of - * predecessors. - * - * Phi nodes are logically on the control flow edge and act in parallel. - * To handle when propagating, we kill writes from phis and make live the - * corresponding sources. - */ - bi_foreach_predecessor(blk, pred) { - BITSET_WORD *live = ralloc_array(blk, BITSET_WORD, words); - memcpy(live, blk->ssa_live_in, words * sizeof(BITSET_WORD)); + /* Propagate the live in of the successor (blk) to the live out of + * predecessors. + * + * Phi nodes are logically on the control flow edge and act in parallel. + * To handle when propagating, we kill writes from phis and make live the + * corresponding sources. + */ + bi_foreach_predecessor(blk, pred) { + BITSET_WORD *live = ralloc_array(blk, BITSET_WORD, words); + memcpy(live, blk->ssa_live_in, words * sizeof(BITSET_WORD)); - /* Kill write */ - bi_foreach_instr_in_block(blk, I) { - if (I->op != BI_OPCODE_PHI) break; + /* Kill write */ + bi_foreach_instr_in_block(blk, I) { + if (I->op != BI_OPCODE_PHI) + break; - BITSET_CLEAR(live, I->dest[0].value); - } + BITSET_CLEAR(live, I->dest[0].value); + } - /* Make live the corresponding source */ - bi_foreach_instr_in_block(blk, I) { - if (I->op != BI_OPCODE_PHI) break; + /* Make live the corresponding source */ + bi_foreach_instr_in_block(blk, I) { + if (I->op != BI_OPCODE_PHI) + break; - bi_index operand = I->src[bi_predecessor_index(blk, *pred)]; - if (bi_is_ssa(operand)) - BITSET_SET(live, operand.value); - } + bi_index operand = I->src[bi_predecessor_index(blk, *pred)]; + if (bi_is_ssa(operand)) + BITSET_SET(live, operand.value); + } - BITSET_WORD progress = 0; + BITSET_WORD progress = 0; - for (unsigned i = 0; i < words; ++i) { - progress |= live[i] & ~((*pred)->ssa_live_out[i]); - (*pred)->ssa_live_out[i] |= live[i]; - } + for (unsigned i = 0; i < words; ++i) { + progress |= live[i] & ~((*pred)->ssa_live_out[i]); + (*pred)->ssa_live_out[i] |= live[i]; + } - if (progress != 0) - bi_worklist_push_tail(&worklist, *pred); - } - } + if (progress != 0) + bi_worklist_push_tail(&worklist, *pred); + } + } - u_worklist_fini(&worklist); + u_worklist_fini(&worklist); } diff --git a/src/panfrost/bifrost/bi_lower_divergent_indirects.c b/src/panfrost/bifrost/bi_lower_divergent_indirects.c index 1b52040608d..e8453baaa79 100644 --- a/src/panfrost/bifrost/bi_lower_divergent_indirects.c +++ b/src/panfrost/bifrost/bi_lower_divergent_indirects.c @@ -21,8 +21,8 @@ * SOFTWARE. */ -#include "compiler.h" #include "compiler/nir/nir_builder.h" +#include "compiler.h" /* Divergent attribute access is undefined behaviour. 
To avoid divergence, * lower to an if-chain like: @@ -40,89 +40,88 @@ static bool bi_lower_divergent_indirects_impl(nir_builder *b, nir_instr *instr, void *data) { - if (instr->type != nir_instr_type_intrinsic) - return false; + if (instr->type != nir_instr_type_intrinsic) + return false; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - gl_shader_stage stage = b->shader->info.stage; - nir_src *offset; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + gl_shader_stage stage = b->shader->info.stage; + nir_src *offset; - /* Not all indirect access needs this workaround */ - switch (intr->intrinsic) { - case nir_intrinsic_load_input: - case nir_intrinsic_load_interpolated_input: - /* Attributes and varyings */ - offset = nir_get_io_offset_src(intr); - break; + /* Not all indirect access needs this workaround */ + switch (intr->intrinsic) { + case nir_intrinsic_load_input: + case nir_intrinsic_load_interpolated_input: + /* Attributes and varyings */ + offset = nir_get_io_offset_src(intr); + break; - case nir_intrinsic_store_output: - /* Varyings only */ - if (stage == MESA_SHADER_FRAGMENT) - return false; + case nir_intrinsic_store_output: + /* Varyings only */ + if (stage == MESA_SHADER_FRAGMENT) + return false; - offset = nir_get_io_offset_src(intr); - break; + offset = nir_get_io_offset_src(intr); + break; - case nir_intrinsic_image_atomic_add: - case nir_intrinsic_image_atomic_imin: - case nir_intrinsic_image_atomic_umin: - case nir_intrinsic_image_atomic_imax: - case nir_intrinsic_image_atomic_umax: - case nir_intrinsic_image_atomic_and: - case nir_intrinsic_image_atomic_or: - case nir_intrinsic_image_atomic_xor: - case nir_intrinsic_image_load: - case nir_intrinsic_image_store: - /* Any image access */ - offset = &intr->src[0]; - break; - default: - return false; - } + case nir_intrinsic_image_atomic_add: + case nir_intrinsic_image_atomic_imin: + case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_image_atomic_imax: + case nir_intrinsic_image_atomic_umax: + case nir_intrinsic_image_atomic_and: + case nir_intrinsic_image_atomic_or: + case nir_intrinsic_image_atomic_xor: + case nir_intrinsic_image_load: + case nir_intrinsic_image_store: + /* Any image access */ + offset = &intr->src[0]; + break; + default: + return false; + } - if (!nir_src_is_divergent(*offset)) - return false; + if (!nir_src_is_divergent(*offset)) + return false; - /* This indirect does need it */ + /* This indirect does need it */ - b->cursor = nir_before_instr(instr); - nir_ssa_def *lane = nir_load_subgroup_invocation(b); - unsigned *lanes = data; + b->cursor = nir_before_instr(instr); + nir_ssa_def *lane = nir_load_subgroup_invocation(b); + unsigned *lanes = data; - /* Write zero in a funny way to bypass lower_load_const_to_scalar */ - bool has_dest = nir_intrinsic_infos[intr->intrinsic].has_dest; - unsigned size = has_dest ? nir_dest_bit_size(intr->dest) : 32; - nir_ssa_def *zero = has_dest ? nir_imm_zero(b, 1, size) : NULL; - nir_ssa_def *zeroes[4] = { zero, zero, zero, zero }; - nir_ssa_def *res = has_dest ? - nir_vec(b, zeroes, nir_dest_num_components(intr->dest)) : NULL; + /* Write zero in a funny way to bypass lower_load_const_to_scalar */ + bool has_dest = nir_intrinsic_infos[intr->intrinsic].has_dest; + unsigned size = has_dest ? nir_dest_bit_size(intr->dest) : 32; + nir_ssa_def *zero = has_dest ? nir_imm_zero(b, 1, size) : NULL; + nir_ssa_def *zeroes[4] = {zero, zero, zero, zero}; + nir_ssa_def *res = + has_dest ? 
nir_vec(b, zeroes, nir_dest_num_components(intr->dest)) : NULL; - for (unsigned i = 0; i < (*lanes); ++i) { - nir_push_if(b, nir_ieq_imm(b, lane, i)); + for (unsigned i = 0; i < (*lanes); ++i) { + nir_push_if(b, nir_ieq_imm(b, lane, i)); - nir_instr *c = nir_instr_clone(b->shader, instr); - nir_intrinsic_instr *c_intr = nir_instr_as_intrinsic(c); - nir_builder_instr_insert(b, c); - nir_pop_if(b, NULL); + nir_instr *c = nir_instr_clone(b->shader, instr); + nir_intrinsic_instr *c_intr = nir_instr_as_intrinsic(c); + nir_builder_instr_insert(b, c); + nir_pop_if(b, NULL); - if (has_dest) { - assert(c_intr->dest.is_ssa); - nir_ssa_def *c_ssa = &c_intr->dest.ssa; - res = nir_if_phi(b, c_ssa, res); - } - } + if (has_dest) { + assert(c_intr->dest.is_ssa); + nir_ssa_def *c_ssa = &c_intr->dest.ssa; + res = nir_if_phi(b, c_ssa, res); + } + } - if (has_dest) - nir_ssa_def_rewrite_uses(&intr->dest.ssa, res); + if (has_dest) + nir_ssa_def_rewrite_uses(&intr->dest.ssa, res); - nir_instr_remove(instr); - return true; + nir_instr_remove(instr); + return true; } bool bi_lower_divergent_indirects(nir_shader *shader, unsigned lanes) { - return nir_shader_instructions_pass(shader, - bi_lower_divergent_indirects_impl, - nir_metadata_none, &lanes); + return nir_shader_instructions_pass( + shader, bi_lower_divergent_indirects_impl, nir_metadata_none, &lanes); } diff --git a/src/panfrost/bifrost/bi_lower_swizzle.c b/src/panfrost/bifrost/bi_lower_swizzle.c index 988618036b3..5cc6ca2a21f 100644 --- a/src/panfrost/bifrost/bi_lower_swizzle.c +++ b/src/panfrost/bifrost/bi_lower_swizzle.c @@ -21,8 +21,8 @@ * SOFTWARE. */ -#include "compiler.h" #include "bi_builder.h" +#include "compiler.h" /* Not all 8-bit and 16-bit instructions support all swizzles on all sources. * These passes, intended to run after NIR->BIR but before scheduling/RA, lower @@ -33,270 +33,269 @@ static bool bi_swizzle_replicates_8(enum bi_swizzle swz) { - switch (swz) { - case BI_SWIZZLE_B0000: - case BI_SWIZZLE_B1111: - case BI_SWIZZLE_B2222: - case BI_SWIZZLE_B3333: - return true; - default: - return false; - } + switch (swz) { + case BI_SWIZZLE_B0000: + case BI_SWIZZLE_B1111: + case BI_SWIZZLE_B2222: + case BI_SWIZZLE_B3333: + return true; + default: + return false; + } } static void lower_swizzle(bi_context *ctx, bi_instr *ins, unsigned src) { - /* TODO: Use the opcode table and be a lot more methodical about this... */ - switch (ins->op) { - /* Some instructions used with 16-bit data never have swizzles */ - case BI_OPCODE_CSEL_V2F16: - case BI_OPCODE_CSEL_V2I16: - case BI_OPCODE_CSEL_V2S16: - case BI_OPCODE_CSEL_V2U16: + /* TODO: Use the opcode table and be a lot more methodical about this... */ + switch (ins->op) { + /* Some instructions used with 16-bit data never have swizzles */ + case BI_OPCODE_CSEL_V2F16: + case BI_OPCODE_CSEL_V2I16: + case BI_OPCODE_CSEL_V2S16: + case BI_OPCODE_CSEL_V2U16: - /* Despite ostensibly being 32-bit instructions, CLPER does not - * inherently interpret the data, so it can be used for v2f16 - * derivatives, which might require swizzle lowering */ - case BI_OPCODE_CLPER_I32: - case BI_OPCODE_CLPER_OLD_I32: + /* Despite ostensibly being 32-bit instructions, CLPER does not + * inherently interpret the data, so it can be used for v2f16 + * derivatives, which might require swizzle lowering */ + case BI_OPCODE_CLPER_I32: + case BI_OPCODE_CLPER_OLD_I32: - /* Similarly, CSEL.i32 consumes a boolean as a 32-bit argument. 
If the - * boolean is implemented as a 16-bit integer, the swizzle is needed - * for correct operation if the instruction producing the 16-bit - * boolean does not replicate to both halves of the containing 32-bit - * register. As such, we may need to lower a swizzle. - * - * This is a silly hack. Ideally, code gen would be smart enough to - * avoid this case (by replicating). In practice, silly hardware design - * decisions force our hand here. - */ - case BI_OPCODE_MUX_I32: - case BI_OPCODE_CSEL_I32: - break; + /* Similarly, CSEL.i32 consumes a boolean as a 32-bit argument. If the + * boolean is implemented as a 16-bit integer, the swizzle is needed + * for correct operation if the instruction producing the 16-bit + * boolean does not replicate to both halves of the containing 32-bit + * register. As such, we may need to lower a swizzle. + * + * This is a silly hack. Ideally, code gen would be smart enough to + * avoid this case (by replicating). In practice, silly hardware design + * decisions force our hand here. + */ + case BI_OPCODE_MUX_I32: + case BI_OPCODE_CSEL_I32: + break; - case BI_OPCODE_IADD_V2S16: - case BI_OPCODE_IADD_V2U16: - case BI_OPCODE_ISUB_V2S16: - case BI_OPCODE_ISUB_V2U16: - if (src == 0 && ins->src[src].swizzle != BI_SWIZZLE_H10) - break; - else - return; - case BI_OPCODE_LSHIFT_AND_V2I16: - case BI_OPCODE_LSHIFT_OR_V2I16: - case BI_OPCODE_LSHIFT_XOR_V2I16: - case BI_OPCODE_RSHIFT_AND_V2I16: - case BI_OPCODE_RSHIFT_OR_V2I16: - case BI_OPCODE_RSHIFT_XOR_V2I16: - if (src == 2) - return; - else - break; + case BI_OPCODE_IADD_V2S16: + case BI_OPCODE_IADD_V2U16: + case BI_OPCODE_ISUB_V2S16: + case BI_OPCODE_ISUB_V2U16: + if (src == 0 && ins->src[src].swizzle != BI_SWIZZLE_H10) + break; + else + return; + case BI_OPCODE_LSHIFT_AND_V2I16: + case BI_OPCODE_LSHIFT_OR_V2I16: + case BI_OPCODE_LSHIFT_XOR_V2I16: + case BI_OPCODE_RSHIFT_AND_V2I16: + case BI_OPCODE_RSHIFT_OR_V2I16: + case BI_OPCODE_RSHIFT_XOR_V2I16: + if (src == 2) + return; + else + break; - /* For some reason MUX.v2i16 allows swaps but not replication */ - case BI_OPCODE_MUX_V2I16: - if (ins->src[src].swizzle == BI_SWIZZLE_H10) - return; - else - break; + /* For some reason MUX.v2i16 allows swaps but not replication */ + case BI_OPCODE_MUX_V2I16: + if (ins->src[src].swizzle == BI_SWIZZLE_H10) + return; + else + break; - /* No swizzles supported */ - case BI_OPCODE_HADD_V4U8: - case BI_OPCODE_HADD_V4S8: - case BI_OPCODE_CLZ_V4U8: - case BI_OPCODE_IDP_V4I8: - case BI_OPCODE_IABS_V4S8: - case BI_OPCODE_ICMP_V4I8: - case BI_OPCODE_ICMP_V4U8: - case BI_OPCODE_MUX_V4I8: - case BI_OPCODE_IADD_IMM_V4I8: - break; + /* No swizzles supported */ + case BI_OPCODE_HADD_V4U8: + case BI_OPCODE_HADD_V4S8: + case BI_OPCODE_CLZ_V4U8: + case BI_OPCODE_IDP_V4I8: + case BI_OPCODE_IABS_V4S8: + case BI_OPCODE_ICMP_V4I8: + case BI_OPCODE_ICMP_V4U8: + case BI_OPCODE_MUX_V4I8: + case BI_OPCODE_IADD_IMM_V4I8: + break; - case BI_OPCODE_LSHIFT_AND_V4I8: - case BI_OPCODE_LSHIFT_OR_V4I8: - case BI_OPCODE_LSHIFT_XOR_V4I8: - case BI_OPCODE_RSHIFT_AND_V4I8: - case BI_OPCODE_RSHIFT_OR_V4I8: - case BI_OPCODE_RSHIFT_XOR_V4I8: - /* Last source allows identity or replication */ - if (src == 2 && bi_swizzle_replicates_8(ins->src[src].swizzle)) - return; + case BI_OPCODE_LSHIFT_AND_V4I8: + case BI_OPCODE_LSHIFT_OR_V4I8: + case BI_OPCODE_LSHIFT_XOR_V4I8: + case BI_OPCODE_RSHIFT_AND_V4I8: + case BI_OPCODE_RSHIFT_OR_V4I8: + case BI_OPCODE_RSHIFT_XOR_V4I8: + /* Last source allows identity or replication */ + if (src == 2 && 
bi_swizzle_replicates_8(ins->src[src].swizzle)) + return; - /* Others do not allow swizzles */ - break; + /* Others do not allow swizzles */ + break; - /* We don't want to deal with reswizzling logic in modifier prop. Move - * the swizzle outside, it's easier for clamp propagation. */ - case BI_OPCODE_FCLAMP_V2F16: - { - bi_builder b = bi_init_builder(ctx, bi_after_instr(ins)); - bi_index dest = ins->dest[0]; - bi_index tmp = bi_temp(ctx); + /* We don't want to deal with reswizzling logic in modifier prop. Move + * the swizzle outside, it's easier for clamp propagation. */ + case BI_OPCODE_FCLAMP_V2F16: { + bi_builder b = bi_init_builder(ctx, bi_after_instr(ins)); + bi_index dest = ins->dest[0]; + bi_index tmp = bi_temp(ctx); - ins->dest[0] = tmp; - bi_swz_v2i16_to(&b, dest, bi_replace_index(ins->src[0], tmp)); - return; - } + ins->dest[0] = tmp; + bi_swz_v2i16_to(&b, dest, bi_replace_index(ins->src[0], tmp)); + return; + } - default: - return; - } + default: + return; + } - /* First, try to apply a given swizzle to a constant to clear the - * runtime swizzle. This is less heavy-handed than ignoring the - * swizzle for scalar destinations, since it maintains - * replication of the destination. - */ - if (ins->src[src].type == BI_INDEX_CONSTANT) { - ins->src[src].value = bi_apply_swizzle(ins->src[src].value, - ins->src[src].swizzle); - ins->src[src].swizzle = BI_SWIZZLE_H01; - return; - } + /* First, try to apply a given swizzle to a constant to clear the + * runtime swizzle. This is less heavy-handed than ignoring the + * swizzle for scalar destinations, since it maintains + * replication of the destination. + */ + if (ins->src[src].type == BI_INDEX_CONSTANT) { + ins->src[src].value = + bi_apply_swizzle(ins->src[src].value, ins->src[src].swizzle); + ins->src[src].swizzle = BI_SWIZZLE_H01; + return; + } - /* Even if the source does not replicate, if the consuming instruction - * produces a 16-bit scalar, we can ignore the other component. - */ - if (ins->dest[0].swizzle == BI_SWIZZLE_H00 && - ins->src[src].swizzle == BI_SWIZZLE_H00) - { - ins->src[src].swizzle = BI_SWIZZLE_H01; - return; - } + /* Even if the source does not replicate, if the consuming instruction + * produces a 16-bit scalar, we can ignore the other component. + */ + if (ins->dest[0].swizzle == BI_SWIZZLE_H00 && + ins->src[src].swizzle == BI_SWIZZLE_H00) { + ins->src[src].swizzle = BI_SWIZZLE_H01; + return; + } - /* Lower it away */ - bi_builder b = bi_init_builder(ctx, bi_before_instr(ins)); + /* Lower it away */ + bi_builder b = bi_init_builder(ctx, bi_before_instr(ins)); - bool is_8 = (bi_opcode_props[ins->op].size == BI_SIZE_8); - bi_index orig = ins->src[src]; - bi_index stripped = bi_replace_index(bi_null(), orig); - stripped.swizzle = ins->src[src].swizzle; + bool is_8 = (bi_opcode_props[ins->op].size == BI_SIZE_8); + bi_index orig = ins->src[src]; + bi_index stripped = bi_replace_index(bi_null(), orig); + stripped.swizzle = ins->src[src].swizzle; - bi_index swz = is_8 ? bi_swz_v4i8(&b, stripped) : bi_swz_v2i16(&b, stripped); + bi_index swz = is_8 ? bi_swz_v4i8(&b, stripped) : bi_swz_v2i16(&b, stripped); - bi_replace_src(ins, src, swz); - ins->src[src].swizzle = BI_SWIZZLE_H01; + bi_replace_src(ins, src, swz); + ins->src[src].swizzle = BI_SWIZZLE_H01; } static bool bi_swizzle_replicates_16(enum bi_swizzle swz) { - switch (swz) { - case BI_SWIZZLE_H00: - case BI_SWIZZLE_H11: - return true; - default: - /* If a swizzle replicates every 8-bits, it also replicates - * every 16-bits, so allow 8-bit replicating swizzles. 
- */ - return bi_swizzle_replicates_8(swz); - } + switch (swz) { + case BI_SWIZZLE_H00: + case BI_SWIZZLE_H11: + return true; + default: + /* If a swizzle replicates every 8-bits, it also replicates + * every 16-bits, so allow 8-bit replicating swizzles. + */ + return bi_swizzle_replicates_8(swz); + } } static bool bi_instr_replicates(bi_instr *I, BITSET_WORD *replicates_16) { - switch (I->op) { + switch (I->op) { - /* Instructions that construct vectors have replicated output if their - * sources are identical. Check this case first. - */ - case BI_OPCODE_MKVEC_V2I16: - case BI_OPCODE_V2F16_TO_V2S16: - case BI_OPCODE_V2F16_TO_V2U16: - case BI_OPCODE_V2F32_TO_V2F16: - case BI_OPCODE_V2S16_TO_V2F16: - case BI_OPCODE_V2S8_TO_V2F16: - case BI_OPCODE_V2S8_TO_V2S16: - case BI_OPCODE_V2U16_TO_V2F16: - case BI_OPCODE_V2U8_TO_V2F16: - case BI_OPCODE_V2U8_TO_V2U16: - return bi_is_value_equiv(I->src[0], I->src[1]); + /* Instructions that construct vectors have replicated output if their + * sources are identical. Check this case first. + */ + case BI_OPCODE_MKVEC_V2I16: + case BI_OPCODE_V2F16_TO_V2S16: + case BI_OPCODE_V2F16_TO_V2U16: + case BI_OPCODE_V2F32_TO_V2F16: + case BI_OPCODE_V2S16_TO_V2F16: + case BI_OPCODE_V2S8_TO_V2F16: + case BI_OPCODE_V2S8_TO_V2S16: + case BI_OPCODE_V2U16_TO_V2F16: + case BI_OPCODE_V2U8_TO_V2F16: + case BI_OPCODE_V2U8_TO_V2U16: + return bi_is_value_equiv(I->src[0], I->src[1]); - /* 16-bit transcendentals are defined to output zero in their - * upper half, so they do not replicate - */ - case BI_OPCODE_FRCP_F16: - case BI_OPCODE_FRSQ_F16: - return false; + /* 16-bit transcendentals are defined to output zero in their + * upper half, so they do not replicate + */ + case BI_OPCODE_FRCP_F16: + case BI_OPCODE_FRSQ_F16: + return false; - /* Not sure, be conservative, we don't use these.. */ - case BI_OPCODE_VN_ASST1_F16: - case BI_OPCODE_FPCLASS_F16: - case BI_OPCODE_FPOW_SC_DET_F16: - return false; + /* Not sure, be conservative, we don't use these.. */ + case BI_OPCODE_VN_ASST1_F16: + case BI_OPCODE_FPCLASS_F16: + case BI_OPCODE_FPOW_SC_DET_F16: + return false; - default: - break; - } + default: + break; + } - /* Replication analysis only makes sense for ALU instructions */ - if (bi_opcode_props[I->op].message != BIFROST_MESSAGE_NONE) - return false; + /* Replication analysis only makes sense for ALU instructions */ + if (bi_opcode_props[I->op].message != BIFROST_MESSAGE_NONE) + return false; - /* We only analyze 16-bit instructions for 16-bit replication. We could - * maybe do better. - */ - if (bi_opcode_props[I->op].size != BI_SIZE_16) - return false; + /* We only analyze 16-bit instructions for 16-bit replication. We could + * maybe do better. 
+ */ + if (bi_opcode_props[I->op].size != BI_SIZE_16) + return false; - bi_foreach_src(I, s) { - if (bi_is_null(I->src[s])) - continue; + bi_foreach_src(I, s) { + if (bi_is_null(I->src[s])) + continue; - /* Replicated swizzles */ - if (bi_swizzle_replicates_16(I->src[s].swizzle)) - continue; + /* Replicated swizzles */ + if (bi_swizzle_replicates_16(I->src[s].swizzle)) + continue; - /* Replicated values */ - if (bi_is_ssa(I->src[s]) && - BITSET_TEST(replicates_16, I->src[s].value)) - continue; + /* Replicated values */ + if (bi_is_ssa(I->src[s]) && BITSET_TEST(replicates_16, I->src[s].value)) + continue; - /* Replicated constants */ - if (I->src[s].type == BI_INDEX_CONSTANT && - (I->src[s].value & 0xFFFF) == (I->src[s].value >> 16)) - continue; + /* Replicated constants */ + if (I->src[s].type == BI_INDEX_CONSTANT && + (I->src[s].value & 0xFFFF) == (I->src[s].value >> 16)) + continue; - return false; - } + return false; + } - return true; + return true; } void bi_lower_swizzle(bi_context *ctx) { - bi_foreach_instr_global_safe(ctx, ins) { - bi_foreach_src(ins, s) { - if (bi_is_null(ins->src[s])) continue; - if (ins->src[s].swizzle == BI_SWIZZLE_H01) continue; + bi_foreach_instr_global_safe(ctx, ins) { + bi_foreach_src(ins, s) { + if (bi_is_null(ins->src[s])) + continue; + if (ins->src[s].swizzle == BI_SWIZZLE_H01) + continue; - lower_swizzle(ctx, ins, s); - } - } + lower_swizzle(ctx, ins, s); + } + } - /* Now that we've lowered swizzles, clean up the mess */ - BITSET_WORD *replicates_16 = calloc(sizeof(bi_index), ctx->ssa_alloc); + /* Now that we've lowered swizzles, clean up the mess */ + BITSET_WORD *replicates_16 = calloc(sizeof(bi_index), ctx->ssa_alloc); - bi_foreach_instr_global(ctx, ins) { - if (ins->nr_dests && bi_instr_replicates(ins, replicates_16)) - BITSET_SET(replicates_16, ins->dest[0].value); + bi_foreach_instr_global(ctx, ins) { + if (ins->nr_dests && bi_instr_replicates(ins, replicates_16)) + BITSET_SET(replicates_16, ins->dest[0].value); - if (ins->op == BI_OPCODE_SWZ_V2I16 && bi_is_ssa(ins->src[0]) && - BITSET_TEST(replicates_16, ins->src[0].value)) { - ins->op = BI_OPCODE_MOV_I32; - ins->src[0].swizzle = BI_SWIZZLE_H01; - } + if (ins->op == BI_OPCODE_SWZ_V2I16 && bi_is_ssa(ins->src[0]) && + BITSET_TEST(replicates_16, ins->src[0].value)) { + ins->op = BI_OPCODE_MOV_I32; + ins->src[0].swizzle = BI_SWIZZLE_H01; + } - /* The above passes rely on replicating destinations. For - * Valhall, we will want to optimize this. For now, default - * to Bifrost compatible behaviour. - */ - if (ins->nr_dests) - ins->dest[0].swizzle = BI_SWIZZLE_H01; - } + /* The above passes rely on replicating destinations. For + * Valhall, we will want to optimize this. For now, default + * to Bifrost compatible behaviour. + */ + if (ins->nr_dests) + ins->dest[0].swizzle = BI_SWIZZLE_H01; + } - free(replicates_16); + free(replicates_16); } diff --git a/src/panfrost/bifrost/bi_opt_constant_fold.c b/src/panfrost/bifrost/bi_opt_constant_fold.c index 03c370e8132..92776fdcbac 100644 --- a/src/panfrost/bifrost/bi_opt_constant_fold.c +++ b/src/panfrost/bifrost/bi_opt_constant_fold.c @@ -21,8 +21,8 @@ * SOFTWARE. */ -#include "compiler.h" #include "bi_builder.h" +#include "compiler.h" /* Dead simple constant folding to cleanup compiler frontend patterns. 
Before * adding a new pattern here, check why you need it and whether we can avoid @@ -31,83 +31,84 @@ static inline uint32_t bi_source_value(const bi_instr *I, unsigned s) { - if (s < I->nr_srcs) - return bi_apply_swizzle(I->src[s].value, I->src[s].swizzle); - else - return 0; + if (s < I->nr_srcs) + return bi_apply_swizzle(I->src[s].value, I->src[s].swizzle); + else + return 0; } uint32_t bi_fold_constant(bi_instr *I, bool *unsupported) { - /* We can only fold instructions where all sources are constant */ - bi_foreach_src(I, s) { - if (I->src[s].type != BI_INDEX_CONSTANT) { - *unsupported = true; - return 0; - } - } + /* We can only fold instructions where all sources are constant */ + bi_foreach_src(I, s) { + if (I->src[s].type != BI_INDEX_CONSTANT) { + *unsupported = true; + return 0; + } + } - /* Grab the sources */ - uint32_t a = bi_source_value(I, 0); - uint32_t b = bi_source_value(I, 1); - uint32_t c = bi_source_value(I, 2); - uint32_t d = bi_source_value(I, 3); + /* Grab the sources */ + uint32_t a = bi_source_value(I, 0); + uint32_t b = bi_source_value(I, 1); + uint32_t c = bi_source_value(I, 2); + uint32_t d = bi_source_value(I, 3); - /* Evaluate the instruction */ - switch (I->op) { - case BI_OPCODE_SWZ_V2I16: - return a; + /* Evaluate the instruction */ + switch (I->op) { + case BI_OPCODE_SWZ_V2I16: + return a; - case BI_OPCODE_MKVEC_V2I16: - return (b << 16) | (a & 0xFFFF); + case BI_OPCODE_MKVEC_V2I16: + return (b << 16) | (a & 0xFFFF); - case BI_OPCODE_MKVEC_V4I8: - return (d << 24) | ((c & 0xFF) << 16) | ((b & 0xFF) << 8) | (a & 0xFF); + case BI_OPCODE_MKVEC_V4I8: + return (d << 24) | ((c & 0xFF) << 16) | ((b & 0xFF) << 8) | (a & 0xFF); - case BI_OPCODE_MKVEC_V2I8: - return (c << 16) | ((b & 0xFF) << 8) | (a & 0xFF); + case BI_OPCODE_MKVEC_V2I8: + return (c << 16) | ((b & 0xFF) << 8) | (a & 0xFF); - case BI_OPCODE_LSHIFT_OR_I32: - if (I->not_result || I->src[0].neg || I->src[1].neg) - break; + case BI_OPCODE_LSHIFT_OR_I32: + if (I->not_result || I->src[0].neg || I->src[1].neg) + break; - return (a << c) | b; + return (a << c) | b; - case BI_OPCODE_F32_TO_U32: - if (I->round == BI_ROUND_NONE) { - /* Explicitly clamp to prevent undefined behaviour and - * match hardware rules */ - float f = uif(a); - return (f >= 0.0) ? (uint32_t) f : 0; - } else - break; + case BI_OPCODE_F32_TO_U32: + if (I->round == BI_ROUND_NONE) { + /* Explicitly clamp to prevent undefined behaviour and + * match hardware rules */ + float f = uif(a); + return (f >= 0.0) ? 
(uint32_t)f : 0; + } else + break; - default: - break; - } + default: + break; + } - *unsupported = true; - return 0; + *unsupported = true; + return 0; } bool bi_opt_constant_fold(bi_context *ctx) { - bool progress = false; + bool progress = false; - bi_foreach_instr_global_safe(ctx, ins) { - bool unsupported = false; - uint32_t replace = bi_fold_constant(ins, &unsupported); - if (unsupported) continue; + bi_foreach_instr_global_safe(ctx, ins) { + bool unsupported = false; + uint32_t replace = bi_fold_constant(ins, &unsupported); + if (unsupported) + continue; - /* Replace with constant move, to be copypropped */ - assert(ins->nr_dests == 1); - bi_builder b = bi_init_builder(ctx, bi_after_instr(ins)); - bi_mov_i32_to(&b, ins->dest[0], bi_imm_u32(replace)); - bi_remove_instruction(ins); - progress = true; - } + /* Replace with constant move, to be copypropped */ + assert(ins->nr_dests == 1); + bi_builder b = bi_init_builder(ctx, bi_after_instr(ins)); + bi_mov_i32_to(&b, ins->dest[0], bi_imm_u32(replace)); + bi_remove_instruction(ins); + progress = true; + } - return progress; + return progress; } diff --git a/src/panfrost/bifrost/bi_opt_copy_prop.c b/src/panfrost/bifrost/bi_opt_copy_prop.c index 13b9b0d2b83..1a3bc5ae042 100644 --- a/src/panfrost/bifrost/bi_opt_copy_prop.c +++ b/src/panfrost/bifrost/bi_opt_copy_prop.c @@ -22,92 +22,95 @@ * SOFTWARE. */ -#include "compiler.h" #include "bi_builder.h" +#include "compiler.h" /* SSA copy propagation */ static bool bi_reads_fau(bi_instr *ins) { - bi_foreach_src(ins, s) { - if (ins->src[s].type == BI_INDEX_FAU) - return true; - } + bi_foreach_src(ins, s) { + if (ins->src[s].type == BI_INDEX_FAU) + return true; + } - return false; + return false; } void bi_opt_copy_prop(bi_context *ctx) { - /* Chase SPLIT of COLLECT. Instruction selection usually avoids this - * pattern (due to the split cache), but it is inevitably generated by - * the UBO pushing pass. - */ - bi_instr **collects = calloc(sizeof(bi_instr *), ctx->ssa_alloc); - bi_foreach_instr_global_safe(ctx, I) { - if (I->op == BI_OPCODE_COLLECT_I32) { - /* Rewrite trivial collects while we're at it */ - if (I->nr_srcs == 1) - I->op = BI_OPCODE_MOV_I32; + /* Chase SPLIT of COLLECT. Instruction selection usually avoids this + * pattern (due to the split cache), but it is inevitably generated by + * the UBO pushing pass. 
+ */ + bi_instr **collects = calloc(sizeof(bi_instr *), ctx->ssa_alloc); + bi_foreach_instr_global_safe(ctx, I) { + if (I->op == BI_OPCODE_COLLECT_I32) { + /* Rewrite trivial collects while we're at it */ + if (I->nr_srcs == 1) + I->op = BI_OPCODE_MOV_I32; - collects[I->dest[0].value] = I; - } else if (I->op == BI_OPCODE_SPLIT_I32) { - /* Rewrite trivial splits while we're at it */ - if (I->nr_dests == 1) - I->op = BI_OPCODE_MOV_I32; + collects[I->dest[0].value] = I; + } else if (I->op == BI_OPCODE_SPLIT_I32) { + /* Rewrite trivial splits while we're at it */ + if (I->nr_dests == 1) + I->op = BI_OPCODE_MOV_I32; - bi_instr *collect = collects[I->src[0].value]; - if (!collect) - continue; + bi_instr *collect = collects[I->src[0].value]; + if (!collect) + continue; - /* Lower the split to moves, copyprop cleans up */ - bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); + /* Lower the split to moves, copyprop cleans up */ + bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); - bi_foreach_dest(I, d) - bi_mov_i32_to(&b, I->dest[d], collect->src[d]); + bi_foreach_dest(I, d) + bi_mov_i32_to(&b, I->dest[d], collect->src[d]); - bi_remove_instruction(I); - } - } + bi_remove_instruction(I); + } + } - free(collects); + free(collects); - bi_index *replacement = calloc(sizeof(bi_index), ctx->ssa_alloc); + bi_index *replacement = calloc(sizeof(bi_index), ctx->ssa_alloc); - bi_foreach_instr_global_safe(ctx, ins) { - if (ins->op == BI_OPCODE_MOV_I32 && ins->src[0].type != BI_INDEX_REGISTER) { - bi_index replace = ins->src[0]; + bi_foreach_instr_global_safe(ctx, ins) { + if (ins->op == BI_OPCODE_MOV_I32 && + ins->src[0].type != BI_INDEX_REGISTER) { + bi_index replace = ins->src[0]; - /* Peek through one layer so copyprop converges in one - * iteration for chained moves */ - if (bi_is_ssa(replace)) { - bi_index chained = replacement[replace.value]; + /* Peek through one layer so copyprop converges in one + * iteration for chained moves */ + if (bi_is_ssa(replace)) { + bi_index chained = replacement[replace.value]; - if (!bi_is_null(chained)) - replace = chained; - } + if (!bi_is_null(chained)) + replace = chained; + } - assert(ins->nr_dests == 1); - replacement[ins->dest[0].value] = replace; - } + assert(ins->nr_dests == 1); + replacement[ins->dest[0].value] = replace; + } - bi_foreach_src(ins, s) { - bi_index use = ins->src[s]; + bi_foreach_src(ins, s) { + bi_index use = ins->src[s]; - if (use.type != BI_INDEX_NORMAL) continue; - if (bi_is_staging_src(ins, s)) continue; + if (use.type != BI_INDEX_NORMAL) + continue; + if (bi_is_staging_src(ins, s)) + continue; - bi_index repl = replacement[use.value]; + bi_index repl = replacement[use.value]; - if (repl.type == BI_INDEX_CONSTANT && bi_reads_fau(ins)) - continue; + if (repl.type == BI_INDEX_CONSTANT && bi_reads_fau(ins)) + continue; - if (!bi_is_null(repl)) - bi_replace_src(ins, s, repl); - } - } + if (!bi_is_null(repl)) + bi_replace_src(ins, s, repl); + } + } - free(replacement); + free(replacement); } diff --git a/src/panfrost/bifrost/bi_opt_cse.c b/src/panfrost/bifrost/bi_opt_cse.c index 4ffc9475e5a..40ef1877ab1 100644 --- a/src/panfrost/bifrost/bi_opt_cse.c +++ b/src/panfrost/bifrost/bi_opt_cse.c @@ -22,8 +22,8 @@ * SOFTWARE. 
*/ -#include "compiler.h" #include "bi_builder.h" +#include "compiler.h" #define XXH_INLINE_ALL #include "util/xxhash.h" @@ -36,85 +36,88 @@ static inline uint32_t HASH(uint32_t hash, unsigned data) { - return XXH32(&data, sizeof(data), hash); + return XXH32(&data, sizeof(data), hash); } static uint32_t hash_index(uint32_t hash, bi_index index) { - hash = HASH(hash, index.value); - hash = HASH(hash, index.abs); - hash = HASH(hash, index.neg); - hash = HASH(hash, index.swizzle); - hash = HASH(hash, index.offset); - hash = HASH(hash, index.type); - return hash; + hash = HASH(hash, index.value); + hash = HASH(hash, index.abs); + hash = HASH(hash, index.neg); + hash = HASH(hash, index.swizzle); + hash = HASH(hash, index.offset); + hash = HASH(hash, index.type); + return hash; } /* Hash an ALU instruction. */ static uint32_t hash_instr(const void *data) { - const bi_instr *I = data; - uint32_t hash = 0; + const bi_instr *I = data; + uint32_t hash = 0; - hash = HASH(hash, I->op); - hash = HASH(hash, I->nr_dests); - hash = HASH(hash, I->nr_srcs); + hash = HASH(hash, I->op); + hash = HASH(hash, I->nr_dests); + hash = HASH(hash, I->nr_srcs); - assert(!I->flow && !I->slot && "CSE must be early"); + assert(!I->flow && !I->slot && "CSE must be early"); - /* Explcitly skip destinations, except for size details */ - bi_foreach_dest(I, d) { - hash = HASH(hash, I->dest[d].swizzle); - } + /* Explcitly skip destinations, except for size details */ + bi_foreach_dest(I, d) { + hash = HASH(hash, I->dest[d].swizzle); + } - bi_foreach_src(I, s) { - hash = hash_index(hash, I->src[s]); - } + bi_foreach_src(I, s) { + hash = hash_index(hash, I->src[s]); + } - /* Explicitly skip branch, regfmt, vecsize, no_spill, tdd, table */ - hash = HASH(hash, I->dest_mod); + /* Explicitly skip branch, regfmt, vecsize, no_spill, tdd, table */ + hash = HASH(hash, I->dest_mod); - /* Explicitly skip other immediates */ - hash = HASH(hash, I->shift); + /* Explicitly skip other immediates */ + hash = HASH(hash, I->shift); - for (unsigned i = 0; i < ARRAY_SIZE(I->flags); ++i) - hash = HASH(hash, I->flags[i]); + for (unsigned i = 0; i < ARRAY_SIZE(I->flags); ++i) + hash = HASH(hash, I->flags[i]); - return hash; + return hash; } static bool instrs_equal(const void *_i1, const void *_i2) { - const bi_instr *i1 = _i1, *i2 = _i2; + const bi_instr *i1 = _i1, *i2 = _i2; - if (i1->op != i2->op) return false; - if (i1->nr_srcs != i2->nr_srcs) return false; - if (i1->nr_dests != i2->nr_dests) return false; + if (i1->op != i2->op) + return false; + if (i1->nr_srcs != i2->nr_srcs) + return false; + if (i1->nr_dests != i2->nr_dests) + return false; - /* Explicitly skip destinations */ + /* Explicitly skip destinations */ - bi_foreach_src(i1, s) { - bi_index s1 = i1->src[s], s2 = i2->src[s]; + bi_foreach_src(i1, s) { + bi_index s1 = i1->src[s], s2 = i2->src[s]; - if (memcmp(&s1, &s2, sizeof(s1)) != 0) - return false; - } + if (memcmp(&s1, &s2, sizeof(s1)) != 0) + return false; + } - if (i1->dest_mod != i2->dest_mod) - return false; + if (i1->dest_mod != i2->dest_mod) + return false; - if (i1->shift != i2->shift) - return false; + if (i1->shift != i2->shift) + return false; - for (unsigned i = 0; i < ARRAY_SIZE(i1->flags); ++i) { - if (i1->flags[i] != i2->flags[i]) - return false; - } + for (unsigned i = 0; i < ARRAY_SIZE(i1->flags); ++i) { + if (i1->flags[i] != i2->flags[i]) + return false; + } - return true; + return true; } /* Determines what instructions the above routines have to handle */ @@ -122,64 +125,64 @@ instrs_equal(const void *_i1, const 
void *_i2) static bool instr_can_cse(const bi_instr *I) { - switch (I->op) { - case BI_OPCODE_DTSEL_IMM: - case BI_OPCODE_DISCARD_F32: - return false; - default: - break; - } + switch (I->op) { + case BI_OPCODE_DTSEL_IMM: + case BI_OPCODE_DISCARD_F32: + return false; + default: + break; + } - /* Be conservative about which message-passing instructions we CSE, - * since most are not pure even within a thread. - */ - if (bi_opcode_props[I->op].message && I->op != BI_OPCODE_LEA_BUF_IMM) - return false; + /* Be conservative about which message-passing instructions we CSE, + * since most are not pure even within a thread. + */ + if (bi_opcode_props[I->op].message && I->op != BI_OPCODE_LEA_BUF_IMM) + return false; - if (I->branch_target) - return false; + if (I->branch_target) + return false; - return true; + return true; } void bi_opt_cse(bi_context *ctx) { - struct set *instr_set = _mesa_set_create(NULL, hash_instr, instrs_equal); + struct set *instr_set = _mesa_set_create(NULL, hash_instr, instrs_equal); - bi_foreach_block(ctx, block) { - bi_index *replacement = calloc(sizeof(bi_index), ctx->ssa_alloc); - _mesa_set_clear(instr_set, NULL); + bi_foreach_block(ctx, block) { + bi_index *replacement = calloc(sizeof(bi_index), ctx->ssa_alloc); + _mesa_set_clear(instr_set, NULL); - bi_foreach_instr_in_block(block, instr) { - /* Rewrite before trying to CSE anything so we converge - * locally in one iteration */ - bi_foreach_ssa_src(instr, s) { - if (bi_is_staging_src(instr, s)) - continue; + bi_foreach_instr_in_block(block, instr) { + /* Rewrite before trying to CSE anything so we converge + * locally in one iteration */ + bi_foreach_ssa_src(instr, s) { + if (bi_is_staging_src(instr, s)) + continue; - bi_index repl = replacement[instr->src[s].value]; - if (!bi_is_null(repl)) - bi_replace_src(instr, s, repl); - } + bi_index repl = replacement[instr->src[s].value]; + if (!bi_is_null(repl)) + bi_replace_src(instr, s, repl); + } - if (!instr_can_cse(instr)) - continue; + if (!instr_can_cse(instr)) + continue; - bool found; - struct set_entry *entry = - _mesa_set_search_or_add(instr_set, instr, &found); - if (found) { - const bi_instr *match = entry->key; + bool found; + struct set_entry *entry = + _mesa_set_search_or_add(instr_set, instr, &found); + if (found) { + const bi_instr *match = entry->key; - bi_foreach_dest(instr, d) { - replacement[instr->dest[d].value] = match->dest[d]; - } - } - } + bi_foreach_dest(instr, d) { + replacement[instr->dest[d].value] = match->dest[d]; + } + } + } - free(replacement); - } + free(replacement); + } - _mesa_set_destroy(instr_set, NULL); + _mesa_set_destroy(instr_set, NULL); } diff --git a/src/panfrost/bifrost/bi_opt_dce.c b/src/panfrost/bifrost/bi_opt_dce.c index d9668f207a9..e8e12dd9525 100644 --- a/src/panfrost/bifrost/bi_opt_dce.c +++ b/src/panfrost/bifrost/bi_opt_dce.c @@ -22,66 +22,67 @@ * SOFTWARE. */ -#include "compiler.h" #include "util/u_memory.h" +#include "compiler.h" /* A simple SSA-based mark-and-sweep dead code elimination pass. 
*/ void bi_opt_dead_code_eliminate(bi_context *ctx) { - /* Mark live values */ - BITSET_WORD *mark = calloc(sizeof(BITSET_WORD), BITSET_WORDS(ctx->ssa_alloc)); + /* Mark live values */ + BITSET_WORD *mark = + calloc(sizeof(BITSET_WORD), BITSET_WORDS(ctx->ssa_alloc)); - u_worklist worklist; - u_worklist_init(&worklist, ctx->num_blocks, NULL); + u_worklist worklist; + u_worklist_init(&worklist, ctx->num_blocks, NULL); - bi_foreach_block(ctx, block) { - bi_worklist_push_head(&worklist, block); - } + bi_foreach_block(ctx, block) { + bi_worklist_push_head(&worklist, block); + } - while(!u_worklist_is_empty(&worklist)) { - /* Pop in reverse order for backwards pass */ - bi_block *blk = bi_worklist_pop_head(&worklist); + while (!u_worklist_is_empty(&worklist)) { + /* Pop in reverse order for backwards pass */ + bi_block *blk = bi_worklist_pop_head(&worklist); - bool progress = false; + bool progress = false; - bi_foreach_instr_in_block_rev(blk, I) { - bool needed = bi_side_effects(I); + bi_foreach_instr_in_block_rev(blk, I) { + bool needed = bi_side_effects(I); - bi_foreach_dest(I, d) - needed |= BITSET_TEST(mark, I->dest[d].value); + bi_foreach_dest(I, d) + needed |= BITSET_TEST(mark, I->dest[d].value); - if (!needed) - continue; + if (!needed) + continue; - bi_foreach_ssa_src(I, s) { - progress |= !BITSET_TEST(mark, I->src[s].value); - BITSET_SET(mark, I->src[s].value); - } - } + bi_foreach_ssa_src(I, s) { + progress |= !BITSET_TEST(mark, I->src[s].value); + BITSET_SET(mark, I->src[s].value); + } + } - /* XXX: slow */ - if (progress) { - bi_foreach_block(ctx, block) - bi_worklist_push_head(&worklist, block); - } - } + /* XXX: slow */ + if (progress) { + bi_foreach_block(ctx, block) + bi_worklist_push_head(&worklist, block); + } + } - u_worklist_fini(&worklist); + u_worklist_fini(&worklist); - /* Sweep */ - bi_foreach_instr_global_safe(ctx, I) { - bool needed = bi_side_effects(I); + /* Sweep */ + bi_foreach_instr_global_safe(ctx, I) { + bool needed = bi_side_effects(I); - bi_foreach_dest(I, d) - needed |= BITSET_TEST(mark, I->dest[d].value); + bi_foreach_dest(I, d) + needed |= BITSET_TEST(mark, I->dest[d].value); - if (!needed) - bi_remove_instruction(I); - } + if (!needed) + bi_remove_instruction(I); + } - free(mark); + free(mark); } /* Post-RA liveness-based dead code analysis to clean up results of bundling */ @@ -89,39 +90,39 @@ bi_opt_dead_code_eliminate(bi_context *ctx) uint64_t MUST_CHECK bi_postra_liveness_ins(uint64_t live, bi_instr *ins) { - bi_foreach_dest(ins, d) { - if (ins->dest[d].type == BI_INDEX_REGISTER) { - unsigned nr = bi_count_write_registers(ins, d); - unsigned reg = ins->dest[d].value; - live &= ~(BITFIELD64_MASK(nr) << reg); - } - } + bi_foreach_dest(ins, d) { + if (ins->dest[d].type == BI_INDEX_REGISTER) { + unsigned nr = bi_count_write_registers(ins, d); + unsigned reg = ins->dest[d].value; + live &= ~(BITFIELD64_MASK(nr) << reg); + } + } - bi_foreach_src(ins, s) { - if (ins->src[s].type == BI_INDEX_REGISTER) { - unsigned nr = bi_count_read_registers(ins, s); - unsigned reg = ins->src[s].value; - live |= (BITFIELD64_MASK(nr) << reg); - } - } + bi_foreach_src(ins, s) { + if (ins->src[s].type == BI_INDEX_REGISTER) { + unsigned nr = bi_count_read_registers(ins, s); + unsigned reg = ins->src[s].value; + live |= (BITFIELD64_MASK(nr) << reg); + } + } - return live; + return live; } static bool bi_postra_liveness_block(bi_block *blk) { - bi_foreach_successor(blk, succ) - blk->reg_live_out |= succ->reg_live_in; + bi_foreach_successor(blk, succ) + blk->reg_live_out |= 
succ->reg_live_in; - uint64_t live = blk->reg_live_out; + uint64_t live = blk->reg_live_out; - bi_foreach_instr_in_block_rev(blk, ins) - live = bi_postra_liveness_ins(live, ins); + bi_foreach_instr_in_block_rev(blk, ins) + live = bi_postra_liveness_ins(live, ins); - bool progress = blk->reg_live_in != live; - blk->reg_live_in = live; - return progress; + bool progress = blk->reg_live_in != live; + blk->reg_live_in = live; + return progress; } /* Globally, liveness analysis uses a fixed-point algorithm based on a @@ -133,58 +134,58 @@ bi_postra_liveness_block(bi_block *blk) void bi_postra_liveness(bi_context *ctx) { - u_worklist worklist; - bi_worklist_init(ctx, &worklist); + u_worklist worklist; + bi_worklist_init(ctx, &worklist); - bi_foreach_block(ctx, block) { - block->reg_live_out = block->reg_live_in = 0; + bi_foreach_block(ctx, block) { + block->reg_live_out = block->reg_live_in = 0; - bi_worklist_push_tail(&worklist, block); - } + bi_worklist_push_tail(&worklist, block); + } - while (!u_worklist_is_empty(&worklist)) { - /* Pop off in reverse order since liveness is backwards */ - bi_block *blk = bi_worklist_pop_tail(&worklist); + while (!u_worklist_is_empty(&worklist)) { + /* Pop off in reverse order since liveness is backwards */ + bi_block *blk = bi_worklist_pop_tail(&worklist); - /* Update liveness information. If we made progress, we need to - * reprocess the predecessors - */ - if (bi_postra_liveness_block(blk)) { - bi_foreach_predecessor(blk, pred) - bi_worklist_push_head(&worklist, *pred); - } - } + /* Update liveness information. If we made progress, we need to + * reprocess the predecessors + */ + if (bi_postra_liveness_block(blk)) { + bi_foreach_predecessor(blk, pred) + bi_worklist_push_head(&worklist, *pred); + } + } - u_worklist_fini(&worklist); + u_worklist_fini(&worklist); } void bi_opt_dce_post_ra(bi_context *ctx) { - bi_postra_liveness(ctx); + bi_postra_liveness(ctx); - bi_foreach_block_rev(ctx, block) { - uint64_t live = block->reg_live_out; + bi_foreach_block_rev(ctx, block) { + uint64_t live = block->reg_live_out; - bi_foreach_instr_in_block_rev(block, ins) { - if (ins->op == BI_OPCODE_DTSEL_IMM) - ins->dest[0] = bi_null(); + bi_foreach_instr_in_block_rev(block, ins) { + if (ins->op == BI_OPCODE_DTSEL_IMM) + ins->dest[0] = bi_null(); - bi_foreach_dest(ins, d) { - if (ins->dest[d].type != BI_INDEX_REGISTER) - continue; + bi_foreach_dest(ins, d) { + if (ins->dest[d].type != BI_INDEX_REGISTER) + continue; - unsigned nr = bi_count_write_registers(ins, d); - unsigned reg = ins->dest[d].value; - uint64_t mask = (BITFIELD64_MASK(nr) << reg); - bool cullable = (ins->op != BI_OPCODE_BLEND); - cullable &= !bi_opcode_props[ins->op].sr_write; + unsigned nr = bi_count_write_registers(ins, d); + unsigned reg = ins->dest[d].value; + uint64_t mask = (BITFIELD64_MASK(nr) << reg); + bool cullable = (ins->op != BI_OPCODE_BLEND); + cullable &= !bi_opcode_props[ins->op].sr_write; - if (!(live & mask) && cullable) - ins->dest[d] = bi_null(); - } + if (!(live & mask) && cullable) + ins->dest[d] = bi_null(); + } - live = bi_postra_liveness_ins(live, ins); - } - } + live = bi_postra_liveness_ins(live, ins); + } + } } diff --git a/src/panfrost/bifrost/bi_opt_dual_tex.c b/src/panfrost/bifrost/bi_opt_dual_tex.c index 65fbd355949..44f4dddffba 100644 --- a/src/panfrost/bifrost/bi_opt_dual_tex.c +++ b/src/panfrost/bifrost/bi_opt_dual_tex.c @@ -21,8 +21,8 @@ * SOFTWARE. 
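/*
 * A minimal, self-contained sketch of the post-RA liveness transfer function
 * implemented by bi_postra_liveness_ins above, assuming a toy instruction
 * type instead of the driver's bi_instr (toy_ins and toy_liveness_ins are
 * illustrative names only). Walking bottom-up, registers written by an
 * instruction die above it and registers it reads become live; clearing the
 * writes before setting the reads keeps a register live when the same
 * instruction both reads and writes it.
 */
#include <stdint.h>

struct toy_ins {
   unsigned dest_reg, dest_count; /* writes registers [dest_reg, dest_reg + dest_count) */
   unsigned src_reg, src_count;   /* reads registers [src_reg, src_reg + src_count) */
};

static uint64_t
toy_liveness_ins(uint64_t live, const struct toy_ins *ins)
{
   /* Assumes the per-access register counts are small (< 64), as with
    * staging register groups */
   live &= ~(((1ull << ins->dest_count) - 1) << ins->dest_reg);
   live |= ((1ull << ins->src_count) - 1) << ins->src_reg;
   return live;
}

/* Folding this over a block in reverse turns reg_live_out into reg_live_in,
 * which is what bi_postra_liveness_block iterates to a fixed point. */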
*/ -#include "compiler.h" #include "bi_builder.h" +#include "compiler.h" #define XXH_INLINE_ALL #include "util/xxhash.h" @@ -51,58 +51,60 @@ static inline bool bi_can_fuse_dual_tex(bi_instr *I, bool fuse_zero_lod) { - return (I->op == BI_OPCODE_TEXS_2D_F32 || I->op == BI_OPCODE_TEXS_2D_F16) && - (I->texture_index < 4 && I->sampler_index < 4) && - (I->lod_mode == fuse_zero_lod); + return (I->op == BI_OPCODE_TEXS_2D_F32 || I->op == BI_OPCODE_TEXS_2D_F16) && + (I->texture_index < 4 && I->sampler_index < 4) && + (I->lod_mode == fuse_zero_lod); } static enum bifrost_texture_format bi_format_for_texs_2d(enum bi_opcode op) { - switch (op) { - case BI_OPCODE_TEXS_2D_F32: return BIFROST_TEXTURE_FORMAT_F32; - case BI_OPCODE_TEXS_2D_F16: return BIFROST_TEXTURE_FORMAT_F16; - default: unreachable("Invalid TEXS_2D instruction"); - } + switch (op) { + case BI_OPCODE_TEXS_2D_F32: + return BIFROST_TEXTURE_FORMAT_F32; + case BI_OPCODE_TEXS_2D_F16: + return BIFROST_TEXTURE_FORMAT_F16; + default: + unreachable("Invalid TEXS_2D instruction"); + } } static void bi_fuse_dual(bi_context *ctx, bi_instr *I1, bi_instr *I2) { - /* Construct a texture operation descriptor for the dual texture */ - struct bifrost_dual_texture_operation desc = { - .mode = BIFROST_TEXTURE_OPERATION_DUAL, + /* Construct a texture operation descriptor for the dual texture */ + struct bifrost_dual_texture_operation desc = { + .mode = BIFROST_TEXTURE_OPERATION_DUAL, - .primary_texture_index = I1->texture_index, - .primary_sampler_index = I1->sampler_index, - .primary_format = bi_format_for_texs_2d(I1->op), - .primary_mask = 0xF, + .primary_texture_index = I1->texture_index, + .primary_sampler_index = I1->sampler_index, + .primary_format = bi_format_for_texs_2d(I1->op), + .primary_mask = 0xF, - .secondary_texture_index = I2->texture_index, - .secondary_sampler_index = I2->sampler_index, - .secondary_format = bi_format_for_texs_2d(I2->op), - .secondary_mask = 0xF, - }; + .secondary_texture_index = I2->texture_index, + .secondary_sampler_index = I2->sampler_index, + .secondary_format = bi_format_for_texs_2d(I2->op), + .secondary_mask = 0xF, + }; - /* LOD mode is implied in a shader stage */ - assert(I1->lod_mode == I2->lod_mode); + /* LOD mode is implied in a shader stage */ + assert(I1->lod_mode == I2->lod_mode); - /* Insert before the earlier instruction in case its result is consumed - * before the later instruction - */ - bi_builder b = bi_init_builder(ctx, bi_before_instr(I1)); + /* Insert before the earlier instruction in case its result is consumed + * before the later instruction + */ + bi_builder b = bi_init_builder(ctx, bi_before_instr(I1)); - bi_instr *I = bi_texc_dual_to(&b, - I1->dest[0], I2->dest[0], bi_null(), /* staging */ - I1->src[0], I1->src[1], /* coordinates */ - bi_imm_u32(bi_dual_tex_as_u32(desc)), I1->lod_mode, - bi_count_write_registers(I1, 0), - bi_count_write_registers(I2, 0)); + bi_instr *I = bi_texc_dual_to( + &b, I1->dest[0], I2->dest[0], bi_null(), /* staging */ + I1->src[0], I1->src[1], /* coordinates */ + bi_imm_u32(bi_dual_tex_as_u32(desc)), I1->lod_mode, + bi_count_write_registers(I1, 0), bi_count_write_registers(I2, 0)); - I->skip = I1->skip && I2->skip; + I->skip = I1->skip && I2->skip; - bi_remove_instruction(I1); - bi_remove_instruction(I2); + bi_remove_instruction(I1); + bi_remove_instruction(I2); } #define HASH(hash, data) XXH32(&(data), sizeof(data), hash) @@ -110,45 +112,45 @@ bi_fuse_dual(bi_context *ctx, bi_instr *I1, bi_instr *I2) static uint32_t coord_hash(const void *key) { - const bi_instr *I = 
key; + const bi_instr *I = key; - return XXH32(&I->src[0], sizeof(I->src[0]) + sizeof(I->src[1]), 0); + return XXH32(&I->src[0], sizeof(I->src[0]) + sizeof(I->src[1]), 0); } static bool coord_equal(const void *key1, const void *key2) { - const bi_instr *I = key1; - const bi_instr *J = key2; + const bi_instr *I = key1; + const bi_instr *J = key2; - return memcmp(&I->src[0], &J->src[0], - sizeof(I->src[0]) + sizeof(I->src[1])) == 0; + return memcmp(&I->src[0], &J->src[0], + sizeof(I->src[0]) + sizeof(I->src[1])) == 0; } static void bi_opt_fuse_dual_texture_block(bi_context *ctx, bi_block *block) { - struct set *set = _mesa_set_create(ctx, coord_hash, coord_equal); - bool fuse_zero_lod = (ctx->stage != MESA_SHADER_FRAGMENT); - bool found = false; + struct set *set = _mesa_set_create(ctx, coord_hash, coord_equal); + bool fuse_zero_lod = (ctx->stage != MESA_SHADER_FRAGMENT); + bool found = false; - bi_foreach_instr_in_block_safe(block, I) { - if (!bi_can_fuse_dual_tex(I, fuse_zero_lod)) continue; + bi_foreach_instr_in_block_safe(block, I) { + if (!bi_can_fuse_dual_tex(I, fuse_zero_lod)) + continue; - struct set_entry *ent = _mesa_set_search_or_add(set, I, &found); + struct set_entry *ent = _mesa_set_search_or_add(set, I, &found); - if (found) { - bi_fuse_dual(ctx, (bi_instr *) ent->key, I); - _mesa_set_remove(set, ent); - } - } + if (found) { + bi_fuse_dual(ctx, (bi_instr *)ent->key, I); + _mesa_set_remove(set, ent); + } + } } void bi_opt_fuse_dual_texture(bi_context *ctx) { - bi_foreach_block(ctx, block) { - bi_opt_fuse_dual_texture_block(ctx, block); - } - + bi_foreach_block(ctx, block) { + bi_opt_fuse_dual_texture_block(ctx, block); + } } diff --git a/src/panfrost/bifrost/bi_opt_message_preload.c b/src/panfrost/bifrost/bi_opt_message_preload.c index 1ca283db1c0..80ebc57ff31 100644 --- a/src/panfrost/bifrost/bi_opt_message_preload.c +++ b/src/panfrost/bifrost/bi_opt_message_preload.c @@ -21,8 +21,8 @@ * SOFTWARE. */ -#include "compiler.h" #include "bi_builder.h" +#include "compiler.h" /* Bifrost v7 can preload up to two messages of the form: * @@ -35,8 +35,8 @@ static bool bi_is_regfmt_float(enum bi_register_format regfmt) { - return (regfmt == BI_REGISTER_FORMAT_F32) || - (regfmt == BI_REGISTER_FORMAT_F16); + return (regfmt == BI_REGISTER_FORMAT_F32) || + (regfmt == BI_REGISTER_FORMAT_F16); } /* @@ -46,107 +46,107 @@ bi_is_regfmt_float(enum bi_register_format regfmt) static bool bi_can_interp_at_sample(bi_instr *I) { - /* .sample mode with r61 corresponds to per-sample interpolation */ - if (I->sample == BI_SAMPLE_SAMPLE) - return bi_is_value_equiv(I->src[0], bi_register(61)); + /* .sample mode with r61 corresponds to per-sample interpolation */ + if (I->sample == BI_SAMPLE_SAMPLE) + return bi_is_value_equiv(I->src[0], bi_register(61)); - /* If the shader runs with pixel-frequency shading, .sample is - * equivalent to .center, so allow .center - * - * If the shader runs with sample-frequency shading, .sample and .center - * are not equivalent. However, the ESSL 3.20 specification - * stipulates in section 4.5 ("Interpolation Qualifiers"): - * - * for fragment shader input variables qualified with neither - * centroid nor sample, the value of the assigned variable may be - * interpolated anywhere within the pixel and a single value may be - * assigned to each sample within the pixel, to the extent permitted - * by the OpenGL ES Specification. - * - * We only produce .center for variables qualified with neither centroid - * nor sample, so if .center is specified this section applies. 
This - * suggests that, although per-pixel interpolation is allowed, it is not - * mandated ("may" rather than "must" or "should"). Therefore it appears - * safe to substitute sample. - */ - return (I->sample == BI_SAMPLE_CENTER); + /* If the shader runs with pixel-frequency shading, .sample is + * equivalent to .center, so allow .center + * + * If the shader runs with sample-frequency shading, .sample and .center + * are not equivalent. However, the ESSL 3.20 specification + * stipulates in section 4.5 ("Interpolation Qualifiers"): + * + * for fragment shader input variables qualified with neither + * centroid nor sample, the value of the assigned variable may be + * interpolated anywhere within the pixel and a single value may be + * assigned to each sample within the pixel, to the extent permitted + * by the OpenGL ES Specification. + * + * We only produce .center for variables qualified with neither centroid + * nor sample, so if .center is specified this section applies. This + * suggests that, although per-pixel interpolation is allowed, it is not + * mandated ("may" rather than "must" or "should"). Therefore it appears + * safe to substitute sample. + */ + return (I->sample == BI_SAMPLE_CENTER); } static bool bi_can_preload_ld_var(bi_instr *I) { - return (I->op == BI_OPCODE_LD_VAR_IMM) && - bi_can_interp_at_sample(I) && - bi_is_regfmt_float(I->register_format); + return (I->op == BI_OPCODE_LD_VAR_IMM) && bi_can_interp_at_sample(I) && + bi_is_regfmt_float(I->register_format); } static bool bi_is_var_tex(enum bi_opcode op) { - return (op == BI_OPCODE_VAR_TEX_F32) || (op == BI_OPCODE_VAR_TEX_F16); + return (op == BI_OPCODE_VAR_TEX_F32) || (op == BI_OPCODE_VAR_TEX_F16); } void bi_opt_message_preload(bi_context *ctx) { - unsigned nr_preload = 0; + unsigned nr_preload = 0; - /* We only preload from the first block */ - bi_block *block = bi_start_block(&ctx->blocks); - bi_builder b = bi_init_builder(ctx, bi_before_nonempty_block(block)); + /* We only preload from the first block */ + bi_block *block = bi_start_block(&ctx->blocks); + bi_builder b = bi_init_builder(ctx, bi_before_nonempty_block(block)); - bi_foreach_instr_in_block_safe(block, I) { - if (I->nr_dests != 1) continue; + bi_foreach_instr_in_block_safe(block, I) { + if (I->nr_dests != 1) + continue; - struct bifrost_message_preload msg; + struct bifrost_message_preload msg; - if (bi_can_preload_ld_var(I)) { - msg = (struct bifrost_message_preload) { - .enabled = true, - .varying_index = I->varying_index, - .fp16 = (I->register_format == BI_REGISTER_FORMAT_F16), - .num_components = I->vecsize + 1, - }; - } else if (bi_is_var_tex(I->op)) { - msg = (struct bifrost_message_preload) { - .enabled = true, - .texture = true, - .varying_index = I->varying_index, - .texture_index = I->texture_index, - .fp16 = (I->op == BI_OPCODE_VAR_TEX_F16), - .skip = I->skip, - .zero_lod = I->lod_mode, - }; - } else { - continue; - } + if (bi_can_preload_ld_var(I)) { + msg = (struct bifrost_message_preload){ + .enabled = true, + .varying_index = I->varying_index, + .fp16 = (I->register_format == BI_REGISTER_FORMAT_F16), + .num_components = I->vecsize + 1, + }; + } else if (bi_is_var_tex(I->op)) { + msg = (struct bifrost_message_preload){ + .enabled = true, + .texture = true, + .varying_index = I->varying_index, + .texture_index = I->texture_index, + .fp16 = (I->op == BI_OPCODE_VAR_TEX_F16), + .skip = I->skip, + .zero_lod = I->lod_mode, + }; + } else { + continue; + } - /* Report the preloading */ - ctx->info.bifrost->messages[nr_preload] = msg; + /* 
Report the preloading */ + ctx->info.bifrost->messages[nr_preload] = msg; - /* Replace with a collect of preloaded registers. The collect - * kills the moves, so the collect is free (it is coalesced). - */ - b.cursor = bi_before_instr(I); + /* Replace with a collect of preloaded registers. The collect + * kills the moves, so the collect is free (it is coalesced). + */ + b.cursor = bi_before_instr(I); - unsigned nr = bi_count_write_registers(I, 0); - bi_instr *collect = bi_collect_i32_to(&b, I->dest[0], nr); + unsigned nr = bi_count_write_registers(I, 0); + bi_instr *collect = bi_collect_i32_to(&b, I->dest[0], nr); - /* The registers themselves must be preloaded at the start of - * the program. Preloaded registers are coalesced, so these - * moves are free. - */ - b.cursor = bi_before_block(block); - bi_foreach_src(collect, i) { - unsigned reg = (nr_preload * 4) + i; + /* The registers themselves must be preloaded at the start of + * the program. Preloaded registers are coalesced, so these + * moves are free. + */ + b.cursor = bi_before_block(block); + bi_foreach_src(collect, i) { + unsigned reg = (nr_preload * 4) + i; - collect->src[i] = bi_mov_i32(&b, bi_register(reg)); - } + collect->src[i] = bi_mov_i32(&b, bi_register(reg)); + } - bi_remove_instruction(I); + bi_remove_instruction(I); - /* Maximum number of preloaded messages */ - if ((++nr_preload) == 2) - break; - } + /* Maximum number of preloaded messages */ + if ((++nr_preload) == 2) + break; + } } diff --git a/src/panfrost/bifrost/bi_opt_mod_props.c b/src/panfrost/bifrost/bi_opt_mod_props.c index 4888972353b..2784a326feb 100644 --- a/src/panfrost/bifrost/bi_opt_mod_props.c +++ b/src/panfrost/bifrost/bi_opt_mod_props.c @@ -22,8 +22,8 @@ * SOFTWARE. */ -#include "compiler.h" #include "bi_builder.h" +#include "compiler.h" /* * Due to a Bifrost encoding restriction, some instructions cannot have an abs @@ -33,76 +33,76 @@ static bool bi_would_impact_abs(unsigned arch, bi_instr *I, bi_index repl, unsigned s) { - return (arch <= 8) && I->src[1 - s].abs && - bi_is_word_equiv(I->src[1 - s], repl); + return (arch <= 8) && I->src[1 - s].abs && + bi_is_word_equiv(I->src[1 - s], repl); } static bool bi_takes_fabs(unsigned arch, bi_instr *I, bi_index repl, unsigned s) { - switch (I->op) { - case BI_OPCODE_FCMP_V2F16: - case BI_OPCODE_FMAX_V2F16: - case BI_OPCODE_FMIN_V2F16: - return !bi_would_impact_abs(arch, I, repl, s); - case BI_OPCODE_FADD_V2F16: - /* - * For FADD.v2f16, the FMA pipe has the abs encoding hazard, - * while the FADD pipe cannot encode a clamp. Either case in - * isolation can be worked around in the scheduler, but both - * together is impossible to encode. Avoid the hazard. - */ - return !(I->clamp && bi_would_impact_abs(arch, I, repl, s)); - case BI_OPCODE_V2F32_TO_V2F16: - /* TODO: Needs both match or lower */ - return false; - case BI_OPCODE_FLOG_TABLE_F32: - /* TODO: Need to check mode */ - return false; - default: - return bi_opcode_props[I->op].abs & BITFIELD_BIT(s); - } + switch (I->op) { + case BI_OPCODE_FCMP_V2F16: + case BI_OPCODE_FMAX_V2F16: + case BI_OPCODE_FMIN_V2F16: + return !bi_would_impact_abs(arch, I, repl, s); + case BI_OPCODE_FADD_V2F16: + /* + * For FADD.v2f16, the FMA pipe has the abs encoding hazard, + * while the FADD pipe cannot encode a clamp. Either case in + * isolation can be worked around in the scheduler, but both + * together is impossible to encode. Avoid the hazard. 
+ */ + return !(I->clamp && bi_would_impact_abs(arch, I, repl, s)); + case BI_OPCODE_V2F32_TO_V2F16: + /* TODO: Needs both match or lower */ + return false; + case BI_OPCODE_FLOG_TABLE_F32: + /* TODO: Need to check mode */ + return false; + default: + return bi_opcode_props[I->op].abs & BITFIELD_BIT(s); + } } static bool bi_takes_fneg(unsigned arch, bi_instr *I, unsigned s) { - switch (I->op) { - case BI_OPCODE_CUBE_SSEL: - case BI_OPCODE_CUBE_TSEL: - case BI_OPCODE_CUBEFACE: - /* TODO: Bifrost encoding restriction: need to match or lower */ - return arch >= 9; - case BI_OPCODE_FREXPE_F32: - case BI_OPCODE_FREXPE_V2F16: - case BI_OPCODE_FLOG_TABLE_F32: - /* TODO: Need to check mode */ - return false; - default: - return bi_opcode_props[I->op].neg & BITFIELD_BIT(s); - } + switch (I->op) { + case BI_OPCODE_CUBE_SSEL: + case BI_OPCODE_CUBE_TSEL: + case BI_OPCODE_CUBEFACE: + /* TODO: Bifrost encoding restriction: need to match or lower */ + return arch >= 9; + case BI_OPCODE_FREXPE_F32: + case BI_OPCODE_FREXPE_V2F16: + case BI_OPCODE_FLOG_TABLE_F32: + /* TODO: Need to check mode */ + return false; + default: + return bi_opcode_props[I->op].neg & BITFIELD_BIT(s); + } } static bool bi_is_fabsneg(enum bi_opcode op, enum bi_size size) { - return (size == BI_SIZE_32 && op == BI_OPCODE_FABSNEG_F32) || - (size == BI_SIZE_16 && op == BI_OPCODE_FABSNEG_V2F16); + return (size == BI_SIZE_32 && op == BI_OPCODE_FABSNEG_F32) || + (size == BI_SIZE_16 && op == BI_OPCODE_FABSNEG_V2F16); } static enum bi_swizzle bi_compose_swizzle_16(enum bi_swizzle a, enum bi_swizzle b) { - assert(a <= BI_SWIZZLE_H11); - assert(b <= BI_SWIZZLE_H11); + assert(a <= BI_SWIZZLE_H11); + assert(b <= BI_SWIZZLE_H11); - bool al = (a & BI_SWIZZLE_H10); - bool ar = (a & BI_SWIZZLE_H01); - bool bl = (b & BI_SWIZZLE_H10); - bool br = (b & BI_SWIZZLE_H01); + bool al = (a & BI_SWIZZLE_H10); + bool ar = (a & BI_SWIZZLE_H01); + bool bl = (b & BI_SWIZZLE_H10); + bool br = (b & BI_SWIZZLE_H01); - return ((al ? br : bl) ? BI_SWIZZLE_H10 : 0) | - ((ar ? br : bl) ? BI_SWIZZLE_H01 : 0); + return ((al ? br : bl) ? BI_SWIZZLE_H10 : 0) | + ((ar ? br : bl) ? 
BI_SWIZZLE_H01 : 0); } /* Like bi_replace_index, but composes instead of overwrites */ @@ -110,17 +110,17 @@ bi_compose_swizzle_16(enum bi_swizzle a, enum bi_swizzle b) static inline bi_index bi_compose_float_index(bi_index old, bi_index repl) { - /* abs(-x) = abs(+x) so ignore repl.neg if old.abs is set, otherwise - * -(-x) = x but -(+x) = +(-x) so need to exclusive-or the negates */ - repl.neg = old.neg ^ (repl.neg && !old.abs); + /* abs(-x) = abs(+x) so ignore repl.neg if old.abs is set, otherwise + * -(-x) = x but -(+x) = +(-x) so need to exclusive-or the negates */ + repl.neg = old.neg ^ (repl.neg && !old.abs); - /* +/- abs(+/- abs(x)) = +/- abs(x), etc so just or the two */ - repl.abs |= old.abs; + /* +/- abs(+/- abs(x)) = +/- abs(x), etc so just or the two */ + repl.abs |= old.abs; - /* Use the old swizzle to select from the replacement swizzle */ - repl.swizzle = bi_compose_swizzle_16(old.swizzle, repl.swizzle); + /* Use the old swizzle to select from the replacement swizzle */ + repl.swizzle = bi_compose_swizzle_16(old.swizzle, repl.swizzle); - return repl; + return repl; } /* DISCARD.b32(FCMP.f(x, y)) --> DISCARD.f(x, y) */ @@ -128,30 +128,35 @@ bi_compose_float_index(bi_index old, bi_index repl) static inline bool bi_fuse_discard_fcmp(bi_context *ctx, bi_instr *I, bi_instr *mod) { - if (!mod) return false; - if (I->op != BI_OPCODE_DISCARD_B32) return false; - if (mod->op != BI_OPCODE_FCMP_F32 && mod->op != BI_OPCODE_FCMP_V2F16) return false; - if (mod->cmpf >= BI_CMPF_GTLT) return false; + if (!mod) + return false; + if (I->op != BI_OPCODE_DISCARD_B32) + return false; + if (mod->op != BI_OPCODE_FCMP_F32 && mod->op != BI_OPCODE_FCMP_V2F16) + return false; + if (mod->cmpf >= BI_CMPF_GTLT) + return false; - /* result_type doesn't matter */ + /* result_type doesn't matter */ - /* .abs and .neg modifiers allowed on Valhall DISCARD but not Bifrost */ - bool absneg = mod->src[0].neg || mod->src[0].abs; - absneg |= mod->src[1].neg || mod->src[1].abs; + /* .abs and .neg modifiers allowed on Valhall DISCARD but not Bifrost */ + bool absneg = mod->src[0].neg || mod->src[0].abs; + absneg |= mod->src[1].neg || mod->src[1].abs; - if (ctx->arch <= 8 && absneg) return false; + if (ctx->arch <= 8 && absneg) + return false; - enum bi_swizzle r = I->src[0].swizzle; + enum bi_swizzle r = I->src[0].swizzle; - bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); - I = bi_discard_f32(&b, mod->src[0], mod->src[1], mod->cmpf); + bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); + I = bi_discard_f32(&b, mod->src[0], mod->src[1], mod->cmpf); - if (mod->op == BI_OPCODE_FCMP_V2F16) { - I->src[0].swizzle = bi_compose_swizzle_16(r, I->src[0].swizzle); - I->src[1].swizzle = bi_compose_swizzle_16(r, I->src[1].swizzle); - } + if (mod->op == BI_OPCODE_FCMP_V2F16) { + I->src[0].swizzle = bi_compose_swizzle_16(r, I->src[0].swizzle); + I->src[1].swizzle = bi_compose_swizzle_16(r, I->src[1].swizzle); + } - return true; + return true; } /* @@ -159,80 +164,80 @@ bi_fuse_discard_fcmp(bi_context *ctx, bi_instr *I, bi_instr *mod) * because all 8-bit and 16-bit integers may be represented exactly as fp32. 
*/ struct { - enum bi_opcode inner; - enum bi_opcode outer; - enum bi_opcode replacement; + enum bi_opcode inner; + enum bi_opcode outer; + enum bi_opcode replacement; } bi_small_int_patterns[] = { - { BI_OPCODE_S8_TO_S32, BI_OPCODE_S32_TO_F32, BI_OPCODE_S8_TO_F32 }, - { BI_OPCODE_U8_TO_U32, BI_OPCODE_U32_TO_F32, BI_OPCODE_U8_TO_F32 }, - { BI_OPCODE_U8_TO_U32, BI_OPCODE_S32_TO_F32, BI_OPCODE_U8_TO_F32 }, - { BI_OPCODE_S16_TO_S32, BI_OPCODE_S32_TO_F32, BI_OPCODE_S16_TO_F32 }, - { BI_OPCODE_U16_TO_U32, BI_OPCODE_U32_TO_F32, BI_OPCODE_U16_TO_F32 }, - { BI_OPCODE_U16_TO_U32, BI_OPCODE_S32_TO_F32, BI_OPCODE_U16_TO_F32 }, + {BI_OPCODE_S8_TO_S32, BI_OPCODE_S32_TO_F32, BI_OPCODE_S8_TO_F32}, + {BI_OPCODE_U8_TO_U32, BI_OPCODE_U32_TO_F32, BI_OPCODE_U8_TO_F32}, + {BI_OPCODE_U8_TO_U32, BI_OPCODE_S32_TO_F32, BI_OPCODE_U8_TO_F32}, + {BI_OPCODE_S16_TO_S32, BI_OPCODE_S32_TO_F32, BI_OPCODE_S16_TO_F32}, + {BI_OPCODE_U16_TO_U32, BI_OPCODE_U32_TO_F32, BI_OPCODE_U16_TO_F32}, + {BI_OPCODE_U16_TO_U32, BI_OPCODE_S32_TO_F32, BI_OPCODE_U16_TO_F32}, }; static inline void bi_fuse_small_int_to_f32(bi_instr *I, bi_instr *mod) { - for (unsigned i = 0; i < ARRAY_SIZE(bi_small_int_patterns); ++i) { - if (I->op != bi_small_int_patterns[i].outer) - continue; - if (mod->op != bi_small_int_patterns[i].inner) - continue; + for (unsigned i = 0; i < ARRAY_SIZE(bi_small_int_patterns); ++i) { + if (I->op != bi_small_int_patterns[i].outer) + continue; + if (mod->op != bi_small_int_patterns[i].inner) + continue; - assert(I->src[0].swizzle == BI_SWIZZLE_H01); - I->src[0] = mod->src[0]; - I->round = BI_ROUND_NONE; - I->op = bi_small_int_patterns[i].replacement; - } + assert(I->src[0].swizzle == BI_SWIZZLE_H01); + I->src[0] = mod->src[0]; + I->round = BI_ROUND_NONE; + I->op = bi_small_int_patterns[i].replacement; + } } void bi_opt_mod_prop_forward(bi_context *ctx) { - bi_instr **lut = calloc(sizeof(bi_instr *), ctx->ssa_alloc); + bi_instr **lut = calloc(sizeof(bi_instr *), ctx->ssa_alloc); - bi_foreach_instr_global_safe(ctx, I) { - /* Try fusing FCMP into DISCARD.b32, building a new DISCARD.f32 - * instruction. As this is the only optimization DISCARD is - * involved in, this shortcircuits other processing. - */ - if (I->op == BI_OPCODE_DISCARD_B32) { - if (bi_is_ssa(I->src[0]) && - bi_fuse_discard_fcmp(ctx, I, lut[I->src[0].value])) { - bi_remove_instruction(I); - } + bi_foreach_instr_global_safe(ctx, I) { + /* Try fusing FCMP into DISCARD.b32, building a new DISCARD.f32 + * instruction. As this is the only optimization DISCARD is + * involved in, this shortcircuits other processing. 
+ */ + if (I->op == BI_OPCODE_DISCARD_B32) { + if (bi_is_ssa(I->src[0]) && + bi_fuse_discard_fcmp(ctx, I, lut[I->src[0].value])) { + bi_remove_instruction(I); + } - continue; - } + continue; + } - bi_foreach_dest(I, d) { - lut[I->dest[d].value] = I; - } + bi_foreach_dest(I, d) { + lut[I->dest[d].value] = I; + } - bi_foreach_ssa_src(I, s) { - bi_instr *mod = lut[I->src[s].value]; + bi_foreach_ssa_src(I, s) { + bi_instr *mod = lut[I->src[s].value]; - if (!mod) - continue; + if (!mod) + continue; - unsigned size = bi_opcode_props[I->op].size; + unsigned size = bi_opcode_props[I->op].size; - bi_fuse_small_int_to_f32(I, mod); + bi_fuse_small_int_to_f32(I, mod); - if (bi_is_fabsneg(mod->op, size)) { - if (mod->src[0].abs && !bi_takes_fabs(ctx->arch, I, mod->src[0], s)) - continue; + if (bi_is_fabsneg(mod->op, size)) { + if (mod->src[0].abs && !bi_takes_fabs(ctx->arch, I, mod->src[0], s)) + continue; - if (mod->src[0].neg && !bi_takes_fneg(ctx->arch, I, s)) - continue; + if (mod->src[0].neg && !bi_takes_fneg(ctx->arch, I, s)) + continue; - I->src[s] = bi_compose_float_index(I->src[s], mod->src[0]); - } - } - } + I->src[s] = bi_compose_float_index(I->src[s], mod->src[0]); + } + } + } - free(lut); + free(lut); } /* RSCALE has restrictions on how the clamp may be used, only used for @@ -241,199 +246,207 @@ bi_opt_mod_prop_forward(bi_context *ctx) static bool bi_takes_clamp(bi_instr *I) { - switch (I->op) { - case BI_OPCODE_FMA_RSCALE_F32: - case BI_OPCODE_FMA_RSCALE_V2F16: - case BI_OPCODE_FADD_RSCALE_F32: - return false; - case BI_OPCODE_FADD_V2F16: - /* Encoding restriction */ - return !(I->src[0].abs && I->src[1].abs && - bi_is_word_equiv(I->src[0], I->src[1])); - default: - return bi_opcode_props[I->op].clamp; - } + switch (I->op) { + case BI_OPCODE_FMA_RSCALE_F32: + case BI_OPCODE_FMA_RSCALE_V2F16: + case BI_OPCODE_FADD_RSCALE_F32: + return false; + case BI_OPCODE_FADD_V2F16: + /* Encoding restriction */ + return !(I->src[0].abs && I->src[1].abs && + bi_is_word_equiv(I->src[0], I->src[1])); + default: + return bi_opcode_props[I->op].clamp; + } } static bool bi_is_fclamp(enum bi_opcode op, enum bi_size size) { - return (size == BI_SIZE_32 && op == BI_OPCODE_FCLAMP_F32) || - (size == BI_SIZE_16 && op == BI_OPCODE_FCLAMP_V2F16); + return (size == BI_SIZE_32 && op == BI_OPCODE_FCLAMP_F32) || + (size == BI_SIZE_16 && op == BI_OPCODE_FCLAMP_V2F16); } static bool bi_optimizer_clamp(bi_instr *I, bi_instr *use) { - if (!bi_is_fclamp(use->op, bi_opcode_props[I->op].size)) return false; - if (!bi_takes_clamp(I)) return false; + if (!bi_is_fclamp(use->op, bi_opcode_props[I->op].size)) + return false; + if (!bi_takes_clamp(I)) + return false; - /* Clamps are bitfields (clamp_m1_1/clamp_0_inf) so composition is OR */ - I->clamp |= use->clamp; - I->dest[0] = use->dest[0]; - return true; + /* Clamps are bitfields (clamp_m1_1/clamp_0_inf) so composition is OR */ + I->clamp |= use->clamp; + I->dest[0] = use->dest[0]; + return true; } static enum bi_opcode bi_sized_mux_op(unsigned size) { - switch (size) { - case 8: return BI_OPCODE_MUX_V4I8; - case 16: return BI_OPCODE_MUX_V2I16; - case 32: return BI_OPCODE_MUX_I32; - default: unreachable("invalid size"); - } + switch (size) { + case 8: + return BI_OPCODE_MUX_V4I8; + case 16: + return BI_OPCODE_MUX_V2I16; + case 32: + return BI_OPCODE_MUX_I32; + default: + unreachable("invalid size"); + } } static bool bi_is_fixed_mux(bi_instr *I, unsigned size, bi_index v1) { - return I->op == bi_sized_mux_op(size) && - bi_is_value_equiv(I->src[0], bi_zero()) && - 
bi_is_value_equiv(I->src[1], v1); + return I->op == bi_sized_mux_op(size) && + bi_is_value_equiv(I->src[0], bi_zero()) && + bi_is_value_equiv(I->src[1], v1); } static bool bi_takes_int_result_type(enum bi_opcode op) { - switch (op) { - case BI_OPCODE_ICMP_I32: - case BI_OPCODE_ICMP_S32: - case BI_OPCODE_ICMP_U32: - case BI_OPCODE_ICMP_V2I16: - case BI_OPCODE_ICMP_V2S16: - case BI_OPCODE_ICMP_V2U16: - case BI_OPCODE_ICMP_V4I8: - case BI_OPCODE_ICMP_V4S8: - case BI_OPCODE_ICMP_V4U8: - case BI_OPCODE_FCMP_F32: - case BI_OPCODE_FCMP_V2F16: - return true; - default: - return false; - } + switch (op) { + case BI_OPCODE_ICMP_I32: + case BI_OPCODE_ICMP_S32: + case BI_OPCODE_ICMP_U32: + case BI_OPCODE_ICMP_V2I16: + case BI_OPCODE_ICMP_V2S16: + case BI_OPCODE_ICMP_V2U16: + case BI_OPCODE_ICMP_V4I8: + case BI_OPCODE_ICMP_V4S8: + case BI_OPCODE_ICMP_V4U8: + case BI_OPCODE_FCMP_F32: + case BI_OPCODE_FCMP_V2F16: + return true; + default: + return false; + } } static bool bi_takes_float_result_type(enum bi_opcode op) { - return (op == BI_OPCODE_FCMP_F32) || - (op == BI_OPCODE_FCMP_V2F16); + return (op == BI_OPCODE_FCMP_F32) || (op == BI_OPCODE_FCMP_V2F16); } /* CMP+MUX -> CMP with result type */ static bool bi_optimizer_result_type(bi_instr *I, bi_instr *mux) { - if (bi_opcode_props[I->op].size != bi_opcode_props[mux->op].size) - return false; + if (bi_opcode_props[I->op].size != bi_opcode_props[mux->op].size) + return false; - if (bi_is_fixed_mux(mux, 32, bi_imm_f32(1.0)) || - bi_is_fixed_mux(mux, 16, bi_imm_f16(1.0))) { + if (bi_is_fixed_mux(mux, 32, bi_imm_f32(1.0)) || + bi_is_fixed_mux(mux, 16, bi_imm_f16(1.0))) { - if (!bi_takes_float_result_type(I->op)) - return false; + if (!bi_takes_float_result_type(I->op)) + return false; - I->result_type = BI_RESULT_TYPE_F1; - } else if (bi_is_fixed_mux(mux, 32, bi_imm_u32(1)) || - bi_is_fixed_mux(mux, 16, bi_imm_u16(1)) || - bi_is_fixed_mux(mux, 8, bi_imm_u8(1))) { + I->result_type = BI_RESULT_TYPE_F1; + } else if (bi_is_fixed_mux(mux, 32, bi_imm_u32(1)) || + bi_is_fixed_mux(mux, 16, bi_imm_u16(1)) || + bi_is_fixed_mux(mux, 8, bi_imm_u8(1))) { - if (!bi_takes_int_result_type(I->op)) - return false; + if (!bi_takes_int_result_type(I->op)) + return false; - I->result_type = BI_RESULT_TYPE_I1; - } else { - return false; - } + I->result_type = BI_RESULT_TYPE_I1; + } else { + return false; + } - I->dest[0] = mux->dest[0]; - return true; + I->dest[0] = mux->dest[0]; + return true; } static bool bi_is_var_tex(bi_instr *var, bi_instr *tex) { - return (var->op == BI_OPCODE_LD_VAR_IMM) && - (tex->op == BI_OPCODE_TEXS_2D_F16 || tex->op == BI_OPCODE_TEXS_2D_F32) && - (var->register_format == BI_REGISTER_FORMAT_F32) && - ((var->sample == BI_SAMPLE_CENTER && var->update == BI_UPDATE_STORE) || - (var->sample == BI_SAMPLE_NONE && var->update == BI_UPDATE_RETRIEVE)) && - (tex->texture_index == tex->sampler_index) && - (tex->texture_index < 4) && - (var->index < 8); + return (var->op == BI_OPCODE_LD_VAR_IMM) && + (tex->op == BI_OPCODE_TEXS_2D_F16 || + tex->op == BI_OPCODE_TEXS_2D_F32) && + (var->register_format == BI_REGISTER_FORMAT_F32) && + ((var->sample == BI_SAMPLE_CENTER && + var->update == BI_UPDATE_STORE) || + (var->sample == BI_SAMPLE_NONE && + var->update == BI_UPDATE_RETRIEVE)) && + (tex->texture_index == tex->sampler_index) && + (tex->texture_index < 4) && (var->index < 8); } static bool bi_optimizer_var_tex(bi_context *ctx, bi_instr *var, bi_instr *tex) { - if (!bi_is_var_tex(var, tex)) return false; + if (!bi_is_var_tex(var, tex)) + return false; - /* Construct 
the corresponding VAR_TEX intruction */ - bi_builder b = bi_init_builder(ctx, bi_after_instr(var)); + /* Construct the corresponding VAR_TEX intruction */ + bi_builder b = bi_init_builder(ctx, bi_after_instr(var)); - bi_instr *I = bi_var_tex_f32_to(&b, tex->dest[0], tex->lod_mode, - var->sample, var->update, tex->texture_index, var->index); - I->skip = tex->skip; + bi_instr *I = bi_var_tex_f32_to(&b, tex->dest[0], tex->lod_mode, var->sample, + var->update, tex->texture_index, var->index); + I->skip = tex->skip; - if (tex->op == BI_OPCODE_TEXS_2D_F16) - I->op = BI_OPCODE_VAR_TEX_F16; + if (tex->op == BI_OPCODE_TEXS_2D_F16) + I->op = BI_OPCODE_VAR_TEX_F16; - /* Dead code elimination will clean up for us */ - return true; + /* Dead code elimination will clean up for us */ + return true; } void bi_opt_mod_prop_backward(bi_context *ctx) { - unsigned count = ctx->ssa_alloc; - bi_instr **uses = calloc(count, sizeof(*uses)); - BITSET_WORD *multiple = calloc(BITSET_WORDS(count), sizeof(*multiple)); + unsigned count = ctx->ssa_alloc; + bi_instr **uses = calloc(count, sizeof(*uses)); + BITSET_WORD *multiple = calloc(BITSET_WORDS(count), sizeof(*multiple)); - bi_foreach_instr_global_rev(ctx, I) { - bi_foreach_ssa_src(I, s) { - unsigned v = I->src[s].value; + bi_foreach_instr_global_rev(ctx, I) { + bi_foreach_ssa_src(I, s) { + unsigned v = I->src[s].value; - if (uses[v] && uses[v] != I) - BITSET_SET(multiple, v); - else - uses[v] = I; - } + if (uses[v] && uses[v] != I) + BITSET_SET(multiple, v); + else + uses[v] = I; + } - if (!I->nr_dests) - continue; + if (!I->nr_dests) + continue; - bi_instr *use = uses[I->dest[0].value]; + bi_instr *use = uses[I->dest[0].value]; - if (!use || BITSET_TEST(multiple, I->dest[0].value)) - continue; + if (!use || BITSET_TEST(multiple, I->dest[0].value)) + continue; - /* Destination has a single use, try to propagate */ - bool propagated = - bi_optimizer_clamp(I, use) || - bi_optimizer_result_type(I, use); + /* Destination has a single use, try to propagate */ + bool propagated = + bi_optimizer_clamp(I, use) || bi_optimizer_result_type(I, use); - if (!propagated && I->op == BI_OPCODE_LD_VAR_IMM && use->op == BI_OPCODE_SPLIT_I32) { - /* Need to see through the split in a - * ld_var_imm/split/var_tex sequence - */ - bi_instr *tex = uses[use->dest[0].value]; + if (!propagated && I->op == BI_OPCODE_LD_VAR_IMM && + use->op == BI_OPCODE_SPLIT_I32) { + /* Need to see through the split in a + * ld_var_imm/split/var_tex sequence + */ + bi_instr *tex = uses[use->dest[0].value]; - if (!tex || BITSET_TEST(multiple, use->dest[0].value)) - continue; + if (!tex || BITSET_TEST(multiple, use->dest[0].value)) + continue; - use = tex; - propagated = bi_optimizer_var_tex(ctx, I, use); - } + use = tex; + propagated = bi_optimizer_var_tex(ctx, I, use); + } - if (propagated) { - bi_remove_instruction(use); - continue; - } - } + if (propagated) { + bi_remove_instruction(use); + continue; + } + } - free(uses); - free(multiple); + free(uses); + free(multiple); } /* @@ -443,37 +456,37 @@ bi_opt_mod_prop_backward(bi_context *ctx) static bool bi_lower_opt_instruction_helper(bi_builder *b, bi_instr *I) { - bi_instr *repl; + bi_instr *repl; - switch (I->op) { - case BI_OPCODE_FABSNEG_F32: - case BI_OPCODE_FCLAMP_F32: - repl = bi_fadd_f32_to(b, I->dest[0], I->src[0], bi_negzero()); - repl->clamp = I->clamp; - return true; + switch (I->op) { + case BI_OPCODE_FABSNEG_F32: + case BI_OPCODE_FCLAMP_F32: + repl = bi_fadd_f32_to(b, I->dest[0], I->src[0], bi_negzero()); + repl->clamp = I->clamp; + return true; 
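/*
 * Why the lowering above can use bi_negzero() (-0.0) as the second FADD
 * operand: under IEEE 754 addition with the default rounding mode,
 * x + (-0.0) == x for every non-NaN x, including both signed zeros (and NaN
 * still propagates unchanged), so the FADD is a pure identity that only
 * exists to carry the clamp and the abs/neg source modifiers. With +0.0
 * instead, (-0.0) + (+0.0) would round to +0.0 and lose the sign of zero.
 * A standalone check of that identity, independent of the compiler IR
 * (check_negzero_identity is an illustrative helper, not a driver function):
 */
#include <assert.h>
#include <math.h>

static void
check_negzero_identity(void)
{
   const float values[] = {0.0f, -0.0f, 1.5f, -3.25f, INFINITY, -INFINITY};

   for (unsigned i = 0; i < sizeof(values) / sizeof(values[0]); ++i) {
      float x = values[i];
      float sum = x + (-0.0f);

      /* Same value and same sign of zero */
      assert(sum == x && !!signbit(sum) == !!signbit(x));
   }
}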
- case BI_OPCODE_FABSNEG_V2F16: - case BI_OPCODE_FCLAMP_V2F16: - repl = bi_fadd_v2f16_to(b, I->dest[0], I->src[0], bi_negzero()); - repl->clamp = I->clamp; - return true; + case BI_OPCODE_FABSNEG_V2F16: + case BI_OPCODE_FCLAMP_V2F16: + repl = bi_fadd_v2f16_to(b, I->dest[0], I->src[0], bi_negzero()); + repl->clamp = I->clamp; + return true; - case BI_OPCODE_DISCARD_B32: - bi_discard_f32(b, I->src[0], bi_zero(), BI_CMPF_NE); - return true; + case BI_OPCODE_DISCARD_B32: + bi_discard_f32(b, I->src[0], bi_zero(), BI_CMPF_NE); + return true; - default: - return false; - } + default: + return false; + } } void bi_lower_opt_instructions(bi_context *ctx) { - bi_foreach_instr_global_safe(ctx, I) { - bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); + bi_foreach_instr_global_safe(ctx, I) { + bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); - if (bi_lower_opt_instruction_helper(&b, I)) - bi_remove_instruction(I); - } + if (bi_lower_opt_instruction_helper(&b, I)) + bi_remove_instruction(I); + } } diff --git a/src/panfrost/bifrost/bi_opt_push_ubo.c b/src/panfrost/bifrost/bi_opt_push_ubo.c index 941993d55fb..01f08635076 100644 --- a/src/panfrost/bifrost/bi_opt_push_ubo.c +++ b/src/panfrost/bifrost/bi_opt_push_ubo.c @@ -21,8 +21,8 @@ * SOFTWARE. */ -#include "compiler.h" #include "bi_builder.h" +#include "compiler.h" /* This optimization pass, intended to run once after code emission but before * copy propagation, analyzes direct word-aligned UBO reads and promotes a @@ -32,17 +32,16 @@ static bool bi_is_ubo(bi_instr *ins) { - return (bi_opcode_props[ins->op].message == BIFROST_MESSAGE_LOAD) && - (ins->seg == BI_SEG_UBO); + return (bi_opcode_props[ins->op].message == BIFROST_MESSAGE_LOAD) && + (ins->seg == BI_SEG_UBO); } static bool bi_is_direct_aligned_ubo(bi_instr *ins) { - return bi_is_ubo(ins) && - (ins->src[0].type == BI_INDEX_CONSTANT) && - (ins->src[1].type == BI_INDEX_CONSTANT) && - ((ins->src[0].value & 0x3) == 0); + return bi_is_ubo(ins) && (ins->src[0].type == BI_INDEX_CONSTANT) && + (ins->src[1].type == BI_INDEX_CONSTANT) && + ((ins->src[0].value & 0x3) == 0); } /* Represents use data for a single UBO */ @@ -50,44 +49,46 @@ bi_is_direct_aligned_ubo(bi_instr *ins) #define MAX_UBO_WORDS (65536 / 16) struct bi_ubo_block { - BITSET_DECLARE(pushed, MAX_UBO_WORDS); - uint8_t range[MAX_UBO_WORDS]; + BITSET_DECLARE(pushed, MAX_UBO_WORDS); + uint8_t range[MAX_UBO_WORDS]; }; struct bi_ubo_analysis { - /* Per block analysis */ - unsigned nr_blocks; - struct bi_ubo_block *blocks; + /* Per block analysis */ + unsigned nr_blocks; + struct bi_ubo_block *blocks; }; static struct bi_ubo_analysis bi_analyze_ranges(bi_context *ctx) { - struct bi_ubo_analysis res = { - .nr_blocks = ctx->nir->info.num_ubos + 1, - }; + struct bi_ubo_analysis res = { + .nr_blocks = ctx->nir->info.num_ubos + 1, + }; - res.blocks = calloc(res.nr_blocks, sizeof(struct bi_ubo_block)); + res.blocks = calloc(res.nr_blocks, sizeof(struct bi_ubo_block)); - bi_foreach_instr_global(ctx, ins) { - if (!bi_is_direct_aligned_ubo(ins)) continue; + bi_foreach_instr_global(ctx, ins) { + if (!bi_is_direct_aligned_ubo(ins)) + continue; - unsigned ubo = ins->src[1].value; - unsigned word = ins->src[0].value / 4; - unsigned channels = bi_opcode_props[ins->op].sr_count; + unsigned ubo = ins->src[1].value; + unsigned word = ins->src[0].value / 4; + unsigned channels = bi_opcode_props[ins->op].sr_count; - assert(ubo < res.nr_blocks); - assert(channels > 0 && channels <= 4); + assert(ubo < res.nr_blocks); + assert(channels > 0 && channels <= 
4); - if (word >= MAX_UBO_WORDS) continue; + if (word >= MAX_UBO_WORDS) + continue; - /* Must use max if the same base is read with different channel - * counts, which is possible with nir_opt_shrink_vectors */ - uint8_t *range = res.blocks[ubo].range; - range[word] = MAX2(range[word], channels); - } + /* Must use max if the same base is read with different channel + * counts, which is possible with nir_opt_shrink_vectors */ + uint8_t *range = res.blocks[ubo].range; + range[word] = MAX2(range[word], channels); + } - return res; + return res; } /* Select UBO words to push. A sophisticated implementation would consider the @@ -97,92 +98,93 @@ bi_analyze_ranges(bi_context *ctx) static void bi_pick_ubo(struct panfrost_ubo_push *push, struct bi_ubo_analysis *analysis) { - for (signed ubo = analysis->nr_blocks - 1; ubo >= 0; --ubo) { - struct bi_ubo_block *block = &analysis->blocks[ubo]; + for (signed ubo = analysis->nr_blocks - 1; ubo >= 0; --ubo) { + struct bi_ubo_block *block = &analysis->blocks[ubo]; - for (unsigned r = 0; r < MAX_UBO_WORDS; ++r) { - unsigned range = block->range[r]; + for (unsigned r = 0; r < MAX_UBO_WORDS; ++r) { + unsigned range = block->range[r]; - /* Don't push something we don't access */ - if (range == 0) continue; + /* Don't push something we don't access */ + if (range == 0) + continue; - /* Don't push more than possible */ - if (push->count > PAN_MAX_PUSH - range) - return; + /* Don't push more than possible */ + if (push->count > PAN_MAX_PUSH - range) + return; - for (unsigned offs = 0; offs < range; ++offs) { - struct panfrost_ubo_word word = { - .ubo = ubo, - .offset = (r + offs) * 4, - }; + for (unsigned offs = 0; offs < range; ++offs) { + struct panfrost_ubo_word word = { + .ubo = ubo, + .offset = (r + offs) * 4, + }; - push->words[push->count++] = word; - } + push->words[push->count++] = word; + } - /* Mark it as pushed so we can rewrite */ - BITSET_SET(block->pushed, r); - } - } + /* Mark it as pushed so we can rewrite */ + BITSET_SET(block->pushed, r); + } + } } void bi_opt_push_ubo(bi_context *ctx) { - struct bi_ubo_analysis analysis = bi_analyze_ranges(ctx); - bi_pick_ubo(ctx->info.push, &analysis); + struct bi_ubo_analysis analysis = bi_analyze_ranges(ctx); + bi_pick_ubo(ctx->info.push, &analysis); - ctx->ubo_mask = 0; + ctx->ubo_mask = 0; - bi_foreach_instr_global_safe(ctx, ins) { - if (!bi_is_ubo(ins)) continue; + bi_foreach_instr_global_safe(ctx, ins) { + if (!bi_is_ubo(ins)) + continue; - unsigned ubo = ins->src[1].value; - unsigned offset = ins->src[0].value; + unsigned ubo = ins->src[1].value; + unsigned offset = ins->src[0].value; - if (!bi_is_direct_aligned_ubo(ins)) { - /* The load can't be pushed, so this UBO needs to be - * uploaded conventionally */ - if (ins->src[1].type == BI_INDEX_CONSTANT) - ctx->ubo_mask |= BITSET_BIT(ubo); - else - ctx->ubo_mask = ~0; + if (!bi_is_direct_aligned_ubo(ins)) { + /* The load can't be pushed, so this UBO needs to be + * uploaded conventionally */ + if (ins->src[1].type == BI_INDEX_CONSTANT) + ctx->ubo_mask |= BITSET_BIT(ubo); + else + ctx->ubo_mask = ~0; - continue; - } + continue; + } - /* Check if we decided to push this */ - assert(ubo < analysis.nr_blocks); - if (!BITSET_TEST(analysis.blocks[ubo].pushed, offset / 4)) { - ctx->ubo_mask |= BITSET_BIT(ubo); - continue; - } + /* Check if we decided to push this */ + assert(ubo < analysis.nr_blocks); + if (!BITSET_TEST(analysis.blocks[ubo].pushed, offset / 4)) { + ctx->ubo_mask |= BITSET_BIT(ubo); + continue; + } - /* Replace the UBO load with moves from FAU 
*/ - bi_builder b = bi_init_builder(ctx, bi_after_instr(ins)); + /* Replace the UBO load with moves from FAU */ + bi_builder b = bi_init_builder(ctx, bi_after_instr(ins)); - unsigned nr = bi_opcode_props[ins->op].sr_count; - bi_instr *vec = bi_collect_i32_to(&b, ins->dest[0], nr); + unsigned nr = bi_opcode_props[ins->op].sr_count; + bi_instr *vec = bi_collect_i32_to(&b, ins->dest[0], nr); - bi_foreach_src(vec, w) { - /* FAU is grouped in pairs (2 x 4-byte) */ - unsigned base = - pan_lookup_pushed_ubo(ctx->info.push, ubo, - (offset + 4 * w)); + bi_foreach_src(vec, w) { + /* FAU is grouped in pairs (2 x 4-byte) */ + unsigned base = + pan_lookup_pushed_ubo(ctx->info.push, ubo, (offset + 4 * w)); - unsigned fau_idx = (base >> 1); - unsigned fau_hi = (base & 1); + unsigned fau_idx = (base >> 1); + unsigned fau_hi = (base & 1); - vec->src[w] = bi_fau(BIR_FAU_UNIFORM | fau_idx, fau_hi); - } + vec->src[w] = bi_fau(BIR_FAU_UNIFORM | fau_idx, fau_hi); + } - bi_remove_instruction(ins); - } + bi_remove_instruction(ins); + } - free(analysis.blocks); + free(analysis.blocks); } typedef struct { - BITSET_DECLARE(row, PAN_MAX_PUSH); + BITSET_DECLARE(row, PAN_MAX_PUSH); } adjacency_row; /* Find the connected component containing `node` with depth-first search */ @@ -190,33 +192,32 @@ static void bi_find_component(adjacency_row *adjacency, BITSET_WORD *visited, unsigned *component, unsigned *size, unsigned node) { - unsigned neighbour; + unsigned neighbour; - BITSET_SET(visited, node); - component[(*size)++] = node; + BITSET_SET(visited, node); + component[(*size)++] = node; - BITSET_FOREACH_SET(neighbour, adjacency[node].row, PAN_MAX_PUSH) { - if (!BITSET_TEST(visited, neighbour)) { - bi_find_component(adjacency, visited, component, size, - neighbour); - } - } + BITSET_FOREACH_SET(neighbour, adjacency[node].row, PAN_MAX_PUSH) { + if (!BITSET_TEST(visited, neighbour)) { + bi_find_component(adjacency, visited, component, size, neighbour); + } + } } static bool bi_is_uniform(bi_index idx) { - return (idx.type == BI_INDEX_FAU) && (idx.value & BIR_FAU_UNIFORM); + return (idx.type == BI_INDEX_FAU) && (idx.value & BIR_FAU_UNIFORM); } /* Get the index of a uniform in 32-bit words from the start of FAU-RAM */ static unsigned bi_uniform_word(bi_index idx) { - assert(bi_is_uniform(idx)); - assert(idx.offset <= 1); + assert(bi_is_uniform(idx)); + assert(idx.offset <= 1); - return ((idx.value & ~BIR_FAU_UNIFORM) << 1) | idx.offset; + return ((idx.value & ~BIR_FAU_UNIFORM) << 1) | idx.offset; } /* @@ -228,35 +229,35 @@ bi_uniform_word(bi_index idx) static void bi_create_fau_interference_graph(bi_context *ctx, adjacency_row *adjacency) { - bi_foreach_instr_global(ctx, I) { - unsigned nodes[BI_MAX_SRCS] = {}; - unsigned node_count = 0; + bi_foreach_instr_global(ctx, I) { + unsigned nodes[BI_MAX_SRCS] = {}; + unsigned node_count = 0; - /* Set nodes[] to 32-bit uniforms accessed */ - bi_foreach_src(I, s) { - if (bi_is_uniform(I->src[s])) { - unsigned word = bi_uniform_word(I->src[s]); + /* Set nodes[] to 32-bit uniforms accessed */ + bi_foreach_src(I, s) { + if (bi_is_uniform(I->src[s])) { + unsigned word = bi_uniform_word(I->src[s]); - if (word >= ctx->info.push_offset) - nodes[node_count++] = word; - } - } + if (word >= ctx->info.push_offset) + nodes[node_count++] = word; + } + } - /* Create clique connecting nodes[] */ - for (unsigned i = 0; i < node_count; ++i) { - for (unsigned j = 0; j < node_count; ++j) { - if (i == j) - continue; + /* Create clique connecting nodes[] */ + for (unsigned i = 0; i < node_count; ++i) { + 
for (unsigned j = 0; j < node_count; ++j) { + if (i == j) + continue; - unsigned x = nodes[i], y = nodes[j]; - assert(MAX2(x, y) < ctx->info.push->count); + unsigned x = nodes[i], y = nodes[j]; + assert(MAX2(x, y) < ctx->info.push->count); - /* Add undirected edge between the nodes */ - BITSET_SET(adjacency[x].row, y); - BITSET_SET(adjacency[y].row, x); - } - } - } + /* Add undirected edge between the nodes */ + BITSET_SET(adjacency[x].row, y); + BITSET_SET(adjacency[y].row, x); + } + } + } } /* @@ -278,71 +279,72 @@ bi_create_fau_interference_graph(bi_context *ctx, adjacency_row *adjacency) void bi_opt_reorder_push(bi_context *ctx) { - adjacency_row adjacency[PAN_MAX_PUSH] = { 0 }; - BITSET_DECLARE(visited, PAN_MAX_PUSH) = { 0 }; + adjacency_row adjacency[PAN_MAX_PUSH] = {0}; + BITSET_DECLARE(visited, PAN_MAX_PUSH) = {0}; - unsigned ordering[PAN_MAX_PUSH] = { 0 }; - unsigned unpaired[PAN_MAX_PUSH] = { 0 }; - unsigned pushed = 0, unpaired_count = 0; + unsigned ordering[PAN_MAX_PUSH] = {0}; + unsigned unpaired[PAN_MAX_PUSH] = {0}; + unsigned pushed = 0, unpaired_count = 0; - struct panfrost_ubo_push *push = ctx->info.push; - unsigned push_offset = ctx->info.push_offset; + struct panfrost_ubo_push *push = ctx->info.push; + unsigned push_offset = ctx->info.push_offset; - bi_create_fau_interference_graph(ctx, adjacency); + bi_create_fau_interference_graph(ctx, adjacency); - for (unsigned i = push_offset; i < push->count; ++i) { - if (BITSET_TEST(visited, i)) continue; + for (unsigned i = push_offset; i < push->count; ++i) { + if (BITSET_TEST(visited, i)) + continue; - unsigned component[PAN_MAX_PUSH] = { 0 }; - unsigned size = 0; - bi_find_component(adjacency, visited, component, &size, i); + unsigned component[PAN_MAX_PUSH] = {0}; + unsigned size = 0; + bi_find_component(adjacency, visited, component, &size, i); - /* If there is an odd number of uses, at least one use must be - * unpaired. Arbitrarily take the last one. - */ - if (size % 2) - unpaired[unpaired_count++] = component[--size]; + /* If there is an odd number of uses, at least one use must be + * unpaired. Arbitrarily take the last one. + */ + if (size % 2) + unpaired[unpaired_count++] = component[--size]; - /* The rest of uses are paired */ - assert((size % 2) == 0); + /* The rest of uses are paired */ + assert((size % 2) == 0); - /* Push the paired uses */ - memcpy(ordering + pushed, component, sizeof(unsigned) * size); - pushed += size; - } + /* Push the paired uses */ + memcpy(ordering + pushed, component, sizeof(unsigned) * size); + pushed += size; + } - /* Push unpaired nodes at the end */ - memcpy(ordering + pushed, unpaired, sizeof(unsigned) * unpaired_count); - pushed += unpaired_count; + /* Push unpaired nodes at the end */ + memcpy(ordering + pushed, unpaired, sizeof(unsigned) * unpaired_count); + pushed += unpaired_count; - /* Ordering is a permutation. Invert it for O(1) lookup. */ - unsigned old_to_new[PAN_MAX_PUSH] = { 0 }; + /* Ordering is a permutation. Invert it for O(1) lookup. 
*/ + unsigned old_to_new[PAN_MAX_PUSH] = {0}; - for (unsigned i = 0; i < push_offset; ++i) { - old_to_new[i] = i; - } + for (unsigned i = 0; i < push_offset; ++i) { + old_to_new[i] = i; + } - for (unsigned i = 0; i < pushed; ++i) { - assert(ordering[i] >= push_offset); - old_to_new[ordering[i]] = push_offset + i; - } + for (unsigned i = 0; i < pushed; ++i) { + assert(ordering[i] >= push_offset); + old_to_new[ordering[i]] = push_offset + i; + } - /* Use new ordering throughout the program */ - bi_foreach_instr_global(ctx, I) { - bi_foreach_src(I, s) { - if (bi_is_uniform(I->src[s])) { - unsigned node = bi_uniform_word(I->src[s]); - unsigned new_node = old_to_new[node]; - I->src[s].value = BIR_FAU_UNIFORM | (new_node >> 1); - I->src[s].offset = new_node & 1; - } - } - } + /* Use new ordering throughout the program */ + bi_foreach_instr_global(ctx, I) { + bi_foreach_src(I, s) { + if (bi_is_uniform(I->src[s])) { + unsigned node = bi_uniform_word(I->src[s]); + unsigned new_node = old_to_new[node]; + I->src[s].value = BIR_FAU_UNIFORM | (new_node >> 1); + I->src[s].offset = new_node & 1; + } + } + } - /* Use new ordering for push */ - struct panfrost_ubo_push old = *push; - for (unsigned i = 0; i < pushed; ++i) - push->words[push_offset + i] = old.words[ordering[i]]; + /* Use new ordering for push */ + struct panfrost_ubo_push old = *push; + for (unsigned i = 0; i < pushed; ++i) + push->words[push_offset + i] = old.words[ordering[i]]; - push->count = push_offset + pushed; + push->count = push_offset + pushed; } diff --git a/src/panfrost/bifrost/bi_pack.c b/src/panfrost/bifrost/bi_pack.c index da27a315cbd..7782c07ecd6 100644 --- a/src/panfrost/bifrost/bi_pack.c +++ b/src/panfrost/bifrost/bi_pack.c @@ -21,8 +21,8 @@ * SOFTWARE. */ -#include "compiler.h" #include "bi_quirks.h" +#include "compiler.h" /* This file contains the final passes of the compiler. Running after * scheduling and RA, the IR is now finalized, so we need to emit it to actual @@ -31,39 +31,38 @@ static uint64_t bi_pack_header(bi_clause *clause, bi_clause *next_1, bi_clause *next_2) { - /* next_dependencies are the union of the dependencies of successors' - * dependencies */ + /* next_dependencies are the union of the dependencies of successors' + * dependencies */ - unsigned dependency_wait = next_1 ? next_1->dependencies : 0; - dependency_wait |= next_2 ? next_2->dependencies : 0; + unsigned dependency_wait = next_1 ? next_1->dependencies : 0; + dependency_wait |= next_2 ? next_2->dependencies : 0; - /* Signal barriers (slot #7) immediately. This is not optimal but good - * enough. Doing better requires extending the IR and scheduler. - */ - if (clause->message_type == BIFROST_MESSAGE_BARRIER) - dependency_wait |= BITFIELD_BIT(7); + /* Signal barriers (slot #7) immediately. This is not optimal but good + * enough. Doing better requires extending the IR and scheduler. + */ + if (clause->message_type == BIFROST_MESSAGE_BARRIER) + dependency_wait |= BITFIELD_BIT(7); - bool staging_barrier = next_1 ? next_1->staging_barrier : false; - staging_barrier |= next_2 ? next_2->staging_barrier : 0; + bool staging_barrier = next_1 ? next_1->staging_barrier : false; + staging_barrier |= next_2 ? next_2->staging_barrier : 0; - struct bifrost_header header = { - .flow_control = - (next_1 == NULL && next_2 == NULL) ? 
- BIFROST_FLOW_END : clause->flow_control, - .terminate_discarded_threads = clause->td, - .next_clause_prefetch = clause->next_clause_prefetch && next_1, - .staging_barrier = staging_barrier, - .staging_register = clause->staging_register, - .dependency_wait = dependency_wait, - .dependency_slot = clause->scoreboard_id, - .message_type = clause->message_type, - .next_message_type = next_1 ? next_1->message_type : 0, - .flush_to_zero = clause->ftz ? BIFROST_FTZ_ALWAYS : BIFROST_FTZ_DISABLE, - }; + struct bifrost_header header = { + .flow_control = (next_1 == NULL && next_2 == NULL) ? BIFROST_FLOW_END + : clause->flow_control, + .terminate_discarded_threads = clause->td, + .next_clause_prefetch = clause->next_clause_prefetch && next_1, + .staging_barrier = staging_barrier, + .staging_register = clause->staging_register, + .dependency_wait = dependency_wait, + .dependency_slot = clause->scoreboard_id, + .message_type = clause->message_type, + .next_message_type = next_1 ? next_1->message_type : 0, + .flush_to_zero = clause->ftz ? BIFROST_FTZ_ALWAYS : BIFROST_FTZ_DISABLE, + }; - uint64_t u = 0; - memcpy(&u, &header, sizeof(header)); - return u; + uint64_t u = 0; + memcpy(&u, &header, sizeof(header)); + return u; } /* Assigns a slot for reading, before anything is written */ @@ -71,205 +70,207 @@ bi_pack_header(bi_clause *clause, bi_clause *next_1, bi_clause *next_2) static void bi_assign_slot_read(bi_registers *regs, bi_index src) { - /* We only assign for registers */ - if (src.type != BI_INDEX_REGISTER) - return; + /* We only assign for registers */ + if (src.type != BI_INDEX_REGISTER) + return; - /* Check if we already assigned the slot */ - for (unsigned i = 0; i <= 1; ++i) { - if (regs->slot[i] == src.value && regs->enabled[i]) - return; - } + /* Check if we already assigned the slot */ + for (unsigned i = 0; i <= 1; ++i) { + if (regs->slot[i] == src.value && regs->enabled[i]) + return; + } - if (regs->slot[2] == src.value && regs->slot23.slot2 == BIFROST_OP_READ) - return; + if (regs->slot[2] == src.value && regs->slot23.slot2 == BIFROST_OP_READ) + return; - /* Assign it now */ + /* Assign it now */ - for (unsigned i = 0; i <= 1; ++i) { - if (!regs->enabled[i]) { - regs->slot[i] = src.value; - regs->enabled[i] = true; - return; - } - } + for (unsigned i = 0; i <= 1; ++i) { + if (!regs->enabled[i]) { + regs->slot[i] = src.value; + regs->enabled[i] = true; + return; + } + } - if (!regs->slot23.slot3) { - regs->slot[2] = src.value; - regs->slot23.slot2 = BIFROST_OP_READ; - return; - } + if (!regs->slot23.slot3) { + regs->slot[2] = src.value; + regs->slot23.slot2 = BIFROST_OP_READ; + return; + } - bi_print_slots(regs, stderr); - unreachable("Failed to find a free slot for src"); + bi_print_slots(regs, stderr); + unreachable("Failed to find a free slot for src"); } static bi_registers bi_assign_slots(bi_tuple *now, bi_tuple *prev) { - /* We assign slots for the main register mechanism. Special ops - * use the data registers, which has its own mechanism entirely - * and thus gets skipped over here. */ + /* We assign slots for the main register mechanism. Special ops + * use the data registers, which has its own mechanism entirely + * and thus gets skipped over here. 
*/ - bool read_dreg = now->add && bi_opcode_props[now->add->op].sr_read; - bool write_dreg = prev->add && bi_opcode_props[prev->add->op].sr_write; + bool read_dreg = now->add && bi_opcode_props[now->add->op].sr_read; + bool write_dreg = prev->add && bi_opcode_props[prev->add->op].sr_write; - /* First, assign reads */ + /* First, assign reads */ - if (now->fma) - bi_foreach_src(now->fma, src) - bi_assign_slot_read(&now->regs, (now->fma)->src[src]); + if (now->fma) + bi_foreach_src(now->fma, src) + bi_assign_slot_read(&now->regs, (now->fma)->src[src]); - if (now->add) { - bi_foreach_src(now->add, src) { - /* This is not a real source, we shouldn't assign a - * slot for it. - */ - if (now->add->op == BI_OPCODE_BLEND && src == 4) - continue; + if (now->add) { + bi_foreach_src(now->add, src) { + /* This is not a real source, we shouldn't assign a + * slot for it. + */ + if (now->add->op == BI_OPCODE_BLEND && src == 4) + continue; - if (!(src == 0 && read_dreg)) - bi_assign_slot_read(&now->regs, (now->add)->src[src]); - } - } + if (!(src == 0 && read_dreg)) + bi_assign_slot_read(&now->regs, (now->add)->src[src]); + } + } - /* Next, assign writes. Staging writes are assigned separately, but - * +ATEST wants its destination written to both a staging register - * _and_ a regular write, because it may not generate a message */ + /* Next, assign writes. Staging writes are assigned separately, but + * +ATEST wants its destination written to both a staging register + * _and_ a regular write, because it may not generate a message */ - if (prev->add && prev->add->nr_dests && (!write_dreg || prev->add->op == BI_OPCODE_ATEST)) { - bi_index idx = prev->add->dest[0]; + if (prev->add && prev->add->nr_dests && + (!write_dreg || prev->add->op == BI_OPCODE_ATEST)) { + bi_index idx = prev->add->dest[0]; - if (idx.type == BI_INDEX_REGISTER) { - now->regs.slot[3] = idx.value; - now->regs.slot23.slot3 = BIFROST_OP_WRITE; - } - } + if (idx.type == BI_INDEX_REGISTER) { + now->regs.slot[3] = idx.value; + now->regs.slot23.slot3 = BIFROST_OP_WRITE; + } + } - if (prev->fma && prev->fma->nr_dests) { - bi_index idx = prev->fma->dest[0]; + if (prev->fma && prev->fma->nr_dests) { + bi_index idx = prev->fma->dest[0]; - if (idx.type == BI_INDEX_REGISTER) { - if (now->regs.slot23.slot3) { - /* Scheduler constraint: cannot read 3 and write 2 */ - assert(!now->regs.slot23.slot2); - now->regs.slot[2] = idx.value; - now->regs.slot23.slot2 = BIFROST_OP_WRITE; - } else { - now->regs.slot[3] = idx.value; - now->regs.slot23.slot3 = BIFROST_OP_WRITE; - now->regs.slot23.slot3_fma = true; - } - } - } + if (idx.type == BI_INDEX_REGISTER) { + if (now->regs.slot23.slot3) { + /* Scheduler constraint: cannot read 3 and write 2 */ + assert(!now->regs.slot23.slot2); + now->regs.slot[2] = idx.value; + now->regs.slot23.slot2 = BIFROST_OP_WRITE; + } else { + now->regs.slot[3] = idx.value; + now->regs.slot23.slot3 = BIFROST_OP_WRITE; + now->regs.slot23.slot3_fma = true; + } + } + } - return now->regs; + return now->regs; } static enum bifrost_reg_mode bi_pack_register_mode(bi_registers r) { - /* Handle idle as a special case */ - if (!(r.slot23.slot2 | r.slot23.slot3)) - return r.first_instruction ? BIFROST_IDLE_1 : BIFROST_IDLE; + /* Handle idle as a special case */ + if (!(r.slot23.slot2 | r.slot23.slot3)) + return r.first_instruction ? 
BIFROST_IDLE_1 : BIFROST_IDLE; - /* Otherwise, use the LUT */ - for (unsigned i = 0; i < ARRAY_SIZE(bifrost_reg_ctrl_lut); ++i) { - if (memcmp(bifrost_reg_ctrl_lut + i, &r.slot23, sizeof(r.slot23)) == 0) - return i; - } + /* Otherwise, use the LUT */ + for (unsigned i = 0; i < ARRAY_SIZE(bifrost_reg_ctrl_lut); ++i) { + if (memcmp(bifrost_reg_ctrl_lut + i, &r.slot23, sizeof(r.slot23)) == 0) + return i; + } - bi_print_slots(&r, stderr); - unreachable("Invalid slot assignment"); + bi_print_slots(&r, stderr); + unreachable("Invalid slot assignment"); } static uint64_t bi_pack_registers(bi_registers regs) { - enum bifrost_reg_mode mode = bi_pack_register_mode(regs); - struct bifrost_regs s = { 0 }; - uint64_t packed = 0; + enum bifrost_reg_mode mode = bi_pack_register_mode(regs); + struct bifrost_regs s = {0}; + uint64_t packed = 0; - /* Need to pack 5-bit mode as a 4-bit field. The decoder moves bit 3 to bit 4 for - * first instruction and adds 16 when reg 2 == reg 3 */ + /* Need to pack 5-bit mode as a 4-bit field. The decoder moves bit 3 to bit 4 + * for first instruction and adds 16 when reg 2 == reg 3 */ - unsigned ctrl; - bool r2_equals_r3 = false; + unsigned ctrl; + bool r2_equals_r3 = false; - if (regs.first_instruction) { - /* Bit 3 implicitly must be clear for first instructions. - * The affected patterns all write both ADD/FMA, but that - * is forbidden for the last instruction (whose writes are - * encoded by the first), so this does not add additional - * encoding constraints */ - assert(!(mode & 0x8)); + if (regs.first_instruction) { + /* Bit 3 implicitly must be clear for first instructions. + * The affected patterns all write both ADD/FMA, but that + * is forbidden for the last instruction (whose writes are + * encoded by the first), so this does not add additional + * encoding constraints */ + assert(!(mode & 0x8)); - /* Move bit 4 to bit 3, since bit 3 is clear */ - ctrl = (mode & 0x7) | ((mode & 0x10) >> 1); + /* Move bit 4 to bit 3, since bit 3 is clear */ + ctrl = (mode & 0x7) | ((mode & 0x10) >> 1); - /* If we can let r2 equal r3, we have to or the hardware raises - * INSTR_INVALID_ENC (it's unclear why). */ - if (!(regs.slot23.slot2 && regs.slot23.slot3)) - r2_equals_r3 = true; - } else { - /* We force r2=r3 or not for the upper bit */ - ctrl = (mode & 0xF); - r2_equals_r3 = (mode & 0x10); - } + /* If we can let r2 equal r3, we have to or the hardware raises + * INSTR_INVALID_ENC (it's unclear why). 
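+    * (For the non-first case below, r2 == r3 is what carries bit 4 of the
+    * mode: per the comment above, the decoder adds 16 when the two match.)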
*/ + if (!(regs.slot23.slot2 && regs.slot23.slot3)) + r2_equals_r3 = true; + } else { + /* We force r2=r3 or not for the upper bit */ + ctrl = (mode & 0xF); + r2_equals_r3 = (mode & 0x10); + } - if (regs.enabled[1]) { - /* Gotta save that bit!~ Required by the 63-x trick */ - assert(regs.slot[1] > regs.slot[0]); - assert(regs.enabled[0]); + if (regs.enabled[1]) { + /* Gotta save that bit!~ Required by the 63-x trick */ + assert(regs.slot[1] > regs.slot[0]); + assert(regs.enabled[0]); - /* Do the 63-x trick, see docs/disasm */ - if (regs.slot[0] > 31) { - regs.slot[0] = 63 - regs.slot[0]; - regs.slot[1] = 63 - regs.slot[1]; - } + /* Do the 63-x trick, see docs/disasm */ + if (regs.slot[0] > 31) { + regs.slot[0] = 63 - regs.slot[0]; + regs.slot[1] = 63 - regs.slot[1]; + } - assert(regs.slot[0] <= 31); - assert(regs.slot[1] <= 63); + assert(regs.slot[0] <= 31); + assert(regs.slot[1] <= 63); - s.ctrl = ctrl; - s.reg1 = regs.slot[1]; - s.reg0 = regs.slot[0]; - } else { - /* slot 1 disabled, so set to zero and use slot 1 for ctrl */ - s.ctrl = 0; - s.reg1 = ctrl << 2; + s.ctrl = ctrl; + s.reg1 = regs.slot[1]; + s.reg0 = regs.slot[0]; + } else { + /* slot 1 disabled, so set to zero and use slot 1 for ctrl */ + s.ctrl = 0; + s.reg1 = ctrl << 2; - if (regs.enabled[0]) { - /* Bit 0 upper bit of slot 0 */ - s.reg1 |= (regs.slot[0] >> 5); + if (regs.enabled[0]) { + /* Bit 0 upper bit of slot 0 */ + s.reg1 |= (regs.slot[0] >> 5); - /* Rest of slot 0 in usual spot */ - s.reg0 = (regs.slot[0] & 0b11111); - } else { - /* Bit 1 set if slot 0 also disabled */ - s.reg1 |= (1 << 1); - } - } + /* Rest of slot 0 in usual spot */ + s.reg0 = (regs.slot[0] & 0b11111); + } else { + /* Bit 1 set if slot 0 also disabled */ + s.reg1 |= (1 << 1); + } + } - /* Force r2 =/!= r3 as needed */ - if (r2_equals_r3) { - assert(regs.slot[3] == regs.slot[2] || !(regs.slot23.slot2 && regs.slot23.slot3)); + /* Force r2 =/!= r3 as needed */ + if (r2_equals_r3) { + assert(regs.slot[3] == regs.slot[2] || + !(regs.slot23.slot2 && regs.slot23.slot3)); - if (regs.slot23.slot2) - regs.slot[3] = regs.slot[2]; - else - regs.slot[2] = regs.slot[3]; - } else if (!regs.first_instruction) { - /* Enforced by the encoding anyway */ - assert(regs.slot[2] != regs.slot[3]); - } + if (regs.slot23.slot2) + regs.slot[3] = regs.slot[2]; + else + regs.slot[2] = regs.slot[3]; + } else if (!regs.first_instruction) { + /* Enforced by the encoding anyway */ + assert(regs.slot[2] != regs.slot[3]); + } - s.reg2 = regs.slot[2]; - s.reg3 = regs.slot[3]; - s.fau_idx = regs.fau_idx; + s.reg2 = regs.slot[2]; + s.reg3 = regs.slot[3]; + s.fau_idx = regs.fau_idx; - memcpy(&packed, &s, sizeof(s)); - return packed; + memcpy(&packed, &s, sizeof(s)); + return packed; } /* We must ensure slot 1 > slot 0 for the 63-x trick to function, so we fix @@ -278,94 +279,92 @@ bi_pack_registers(bi_registers regs) static void bi_flip_slots(bi_registers *regs) { - if (regs->enabled[0] && regs->enabled[1] && regs->slot[1] < regs->slot[0]) { - unsigned temp = regs->slot[0]; - regs->slot[0] = regs->slot[1]; - regs->slot[1] = temp; - } - + if (regs->enabled[0] && regs->enabled[1] && regs->slot[1] < regs->slot[0]) { + unsigned temp = regs->slot[0]; + regs->slot[0] = regs->slot[1]; + regs->slot[1] = temp; + } } static inline enum bifrost_packed_src bi_get_src_slot(bi_registers *regs, unsigned reg) { - if (regs->slot[0] == reg && regs->enabled[0]) - return BIFROST_SRC_PORT0; - else if (regs->slot[1] == reg && regs->enabled[1]) - return BIFROST_SRC_PORT1; - else if (regs->slot[2] == reg && 
regs->slot23.slot2 == BIFROST_OP_READ) - return BIFROST_SRC_PORT2; - else - unreachable("Tried to access register with no port"); + if (regs->slot[0] == reg && regs->enabled[0]) + return BIFROST_SRC_PORT0; + else if (regs->slot[1] == reg && regs->enabled[1]) + return BIFROST_SRC_PORT1; + else if (regs->slot[2] == reg && regs->slot23.slot2 == BIFROST_OP_READ) + return BIFROST_SRC_PORT2; + else + unreachable("Tried to access register with no port"); } static inline enum bifrost_packed_src bi_get_src_new(bi_instr *ins, bi_registers *regs, unsigned s) { - if (!ins || s >= ins->nr_srcs) - return 0; + if (!ins || s >= ins->nr_srcs) + return 0; - bi_index src = ins->src[s]; + bi_index src = ins->src[s]; - if (src.type == BI_INDEX_REGISTER) - return bi_get_src_slot(regs, src.value); - else if (src.type == BI_INDEX_PASS) - return src.value; - else { - /* TODO make safer */ - return BIFROST_SRC_STAGE; - } + if (src.type == BI_INDEX_REGISTER) + return bi_get_src_slot(regs, src.value); + else if (src.type == BI_INDEX_PASS) + return src.value; + else { + /* TODO make safer */ + return BIFROST_SRC_STAGE; + } } static struct bi_packed_tuple -bi_pack_tuple(bi_clause *clause, bi_tuple *tuple, bi_tuple *prev, bool first_tuple, gl_shader_stage stage) +bi_pack_tuple(bi_clause *clause, bi_tuple *tuple, bi_tuple *prev, + bool first_tuple, gl_shader_stage stage) { - bi_assign_slots(tuple, prev); - tuple->regs.fau_idx = tuple->fau_idx; - tuple->regs.first_instruction = first_tuple; + bi_assign_slots(tuple, prev); + tuple->regs.fau_idx = tuple->fau_idx; + tuple->regs.first_instruction = first_tuple; - bi_flip_slots(&tuple->regs); + bi_flip_slots(&tuple->regs); - bool sr_read = tuple->add && - bi_opcode_props[(tuple->add)->op].sr_read; + bool sr_read = tuple->add && bi_opcode_props[(tuple->add)->op].sr_read; - uint64_t reg = bi_pack_registers(tuple->regs); - uint64_t fma = bi_pack_fma(tuple->fma, - bi_get_src_new(tuple->fma, &tuple->regs, 0), - bi_get_src_new(tuple->fma, &tuple->regs, 1), - bi_get_src_new(tuple->fma, &tuple->regs, 2), - bi_get_src_new(tuple->fma, &tuple->regs, 3)); + uint64_t reg = bi_pack_registers(tuple->regs); + uint64_t fma = + bi_pack_fma(tuple->fma, bi_get_src_new(tuple->fma, &tuple->regs, 0), + bi_get_src_new(tuple->fma, &tuple->regs, 1), + bi_get_src_new(tuple->fma, &tuple->regs, 2), + bi_get_src_new(tuple->fma, &tuple->regs, 3)); - uint64_t add = bi_pack_add(tuple->add, - bi_get_src_new(tuple->add, &tuple->regs, sr_read + 0), - bi_get_src_new(tuple->add, &tuple->regs, sr_read + 1), - bi_get_src_new(tuple->add, &tuple->regs, sr_read + 2), - 0); + uint64_t add = bi_pack_add( + tuple->add, bi_get_src_new(tuple->add, &tuple->regs, sr_read + 0), + bi_get_src_new(tuple->add, &tuple->regs, sr_read + 1), + bi_get_src_new(tuple->add, &tuple->regs, sr_read + 2), 0); - if (tuple->add) { - bi_instr *add = tuple->add; + if (tuple->add) { + bi_instr *add = tuple->add; - bool sr_write = bi_opcode_props[add->op].sr_write && - !bi_is_null(add->dest[0]); + bool sr_write = + bi_opcode_props[add->op].sr_write && !bi_is_null(add->dest[0]); - if (sr_read && !bi_is_null(add->src[0])) { - assert(add->src[0].type == BI_INDEX_REGISTER); - clause->staging_register = add->src[0].value; + if (sr_read && !bi_is_null(add->src[0])) { + assert(add->src[0].type == BI_INDEX_REGISTER); + clause->staging_register = add->src[0].value; - if (sr_write) - assert(bi_is_equiv(add->src[0], add->dest[0])); - } else if (sr_write) { - assert(add->dest[0].type == BI_INDEX_REGISTER); - clause->staging_register = add->dest[0].value; - } - 
} + if (sr_write) + assert(bi_is_equiv(add->src[0], add->dest[0])); + } else if (sr_write) { + assert(add->dest[0].type == BI_INDEX_REGISTER); + clause->staging_register = add->dest[0].value; + } + } - struct bi_packed_tuple packed = { - .lo = reg | (fma << 35) | ((add & 0b111111) << 58), - .hi = add >> 6, - }; + struct bi_packed_tuple packed = { + .lo = reg | (fma << 35) | ((add & 0b111111) << 58), + .hi = add >> 6, + }; - return packed; + return packed; } /* A block contains at most one PC-relative constant, from a terminal branch. @@ -378,357 +377,328 @@ bi_pack_tuple(bi_clause *clause, bi_tuple *tuple, bi_tuple *prev, bool first_tup static void bi_assign_branch_offset(bi_context *ctx, bi_block *block) { - if (list_is_empty(&block->clauses)) - return; + if (list_is_empty(&block->clauses)) + return; - bi_clause *clause = list_last_entry(&block->clauses, bi_clause, link); - bi_instr *br = bi_last_instr_in_clause(clause); + bi_clause *clause = list_last_entry(&block->clauses, bi_clause, link); + bi_instr *br = bi_last_instr_in_clause(clause); - if (!br->branch_target) - return; + if (!br->branch_target) + return; - /* Put it in the high place */ - int32_t qwords = bi_block_offset(ctx, clause, br->branch_target); - int32_t bytes = qwords * 16; + /* Put it in the high place */ + int32_t qwords = bi_block_offset(ctx, clause, br->branch_target); + int32_t bytes = qwords * 16; - /* Copy so we can toy with the sign without undefined behaviour */ - uint32_t raw = 0; - memcpy(&raw, &bytes, sizeof(raw)); + /* Copy so we can toy with the sign without undefined behaviour */ + uint32_t raw = 0; + memcpy(&raw, &bytes, sizeof(raw)); - /* Clear off top bits for A1/B1 bits */ - raw &= ~0xF0000000; + /* Clear off top bits for A1/B1 bits */ + raw &= ~0xF0000000; - /* Put in top 32-bits */ - assert(clause->pcrel_idx < 8); - clause->constants[clause->pcrel_idx] |= ((uint64_t) raw) << 32ull; + /* Put in top 32-bits */ + assert(clause->pcrel_idx < 8); + clause->constants[clause->pcrel_idx] |= ((uint64_t)raw) << 32ull; } static void -bi_pack_constants(unsigned tuple_count, uint64_t *constants, - unsigned word_idx, unsigned constant_words, bool ec0_packed, - struct util_dynarray *emission) +bi_pack_constants(unsigned tuple_count, uint64_t *constants, unsigned word_idx, + unsigned constant_words, bool ec0_packed, + struct util_dynarray *emission) { - unsigned index = (word_idx << 1) + ec0_packed; + unsigned index = (word_idx << 1) + ec0_packed; - /* Do more constants follow */ - bool more = (word_idx + 1) < constant_words; + /* Do more constants follow */ + bool more = (word_idx + 1) < constant_words; - /* Indexed first by tuple count and second by constant word number, - * indicates the position in the clause */ - unsigned pos_lookup[8][3] = { - { 0 }, - { 1 }, - { 3 }, - { 2, 5 }, - { 4, 8 }, - { 7, 11, 14 }, - { 6, 10, 13 }, - { 9, 12 }, - }; + /* Indexed first by tuple count and second by constant word number, + * indicates the position in the clause */ + unsigned pos_lookup[8][3] = { + {0}, {1}, {3}, {2, 5}, {4, 8}, {7, 11, 14}, {6, 10, 13}, {9, 12}, + }; - /* Compute the pos, and check everything is reasonable */ - assert((tuple_count - 1) < 8); - assert(word_idx < 3); - unsigned pos = pos_lookup[tuple_count - 1][word_idx]; - assert(pos != 0 || (tuple_count == 1 && word_idx == 0)); + /* Compute the pos, and check everything is reasonable */ + assert((tuple_count - 1) < 8); + assert(word_idx < 3); + unsigned pos = pos_lookup[tuple_count - 1][word_idx]; + assert(pos != 0 || (tuple_count == 1 && word_idx == 
0)); - struct bifrost_fmt_constant quad = { - .pos = pos, - .tag = more ? BIFROST_FMTC_CONSTANTS : BIFROST_FMTC_FINAL, - .imm_1 = constants[index + 0] >> 4, - .imm_2 = constants[index + 1] >> 4, - }; + struct bifrost_fmt_constant quad = { + .pos = pos, + .tag = more ? BIFROST_FMTC_CONSTANTS : BIFROST_FMTC_FINAL, + .imm_1 = constants[index + 0] >> 4, + .imm_2 = constants[index + 1] >> 4, + }; - util_dynarray_append(emission, struct bifrost_fmt_constant, quad); + util_dynarray_append(emission, struct bifrost_fmt_constant, quad); } uint8_t bi_pack_literal(enum bi_clause_subword literal) { - assert(literal >= BI_CLAUSE_SUBWORD_LITERAL_0); - assert(literal <= BI_CLAUSE_SUBWORD_LITERAL_7); + assert(literal >= BI_CLAUSE_SUBWORD_LITERAL_0); + assert(literal <= BI_CLAUSE_SUBWORD_LITERAL_7); - return (literal - BI_CLAUSE_SUBWORD_LITERAL_0); + return (literal - BI_CLAUSE_SUBWORD_LITERAL_0); } static inline uint8_t -bi_clause_upper(unsigned val, - struct bi_packed_tuple *tuples, +bi_clause_upper(unsigned val, struct bi_packed_tuple *tuples, ASSERTED unsigned tuple_count) { - assert(val < tuple_count); + assert(val < tuple_count); - /* top 3-bits of 78-bits is tuple >> 75 == (tuple >> 64) >> 11 */ - struct bi_packed_tuple tuple = tuples[val]; - return (tuple.hi >> 11); + /* top 3-bits of 78-bits is tuple >> 75 == (tuple >> 64) >> 11 */ + struct bi_packed_tuple tuple = tuples[val]; + return (tuple.hi >> 11); } uint8_t -bi_pack_upper(enum bi_clause_subword upper, - struct bi_packed_tuple *tuples, - ASSERTED unsigned tuple_count) +bi_pack_upper(enum bi_clause_subword upper, struct bi_packed_tuple *tuples, + ASSERTED unsigned tuple_count) { - assert(upper >= BI_CLAUSE_SUBWORD_UPPER_0); - assert(upper <= BI_CLAUSE_SUBWORD_UPPER_7); + assert(upper >= BI_CLAUSE_SUBWORD_UPPER_0); + assert(upper <= BI_CLAUSE_SUBWORD_UPPER_7); - return bi_clause_upper(upper - BI_CLAUSE_SUBWORD_UPPER_0, tuples, - tuple_count); + return bi_clause_upper(upper - BI_CLAUSE_SUBWORD_UPPER_0, tuples, + tuple_count); } uint64_t -bi_pack_tuple_bits(enum bi_clause_subword idx, - struct bi_packed_tuple *tuples, - ASSERTED unsigned tuple_count, - unsigned offset, unsigned nbits) +bi_pack_tuple_bits(enum bi_clause_subword idx, struct bi_packed_tuple *tuples, + ASSERTED unsigned tuple_count, unsigned offset, + unsigned nbits) { - assert(idx >= BI_CLAUSE_SUBWORD_TUPLE_0); - assert(idx <= BI_CLAUSE_SUBWORD_TUPLE_7); + assert(idx >= BI_CLAUSE_SUBWORD_TUPLE_0); + assert(idx <= BI_CLAUSE_SUBWORD_TUPLE_7); - unsigned val = (idx - BI_CLAUSE_SUBWORD_TUPLE_0); - assert(val < tuple_count); + unsigned val = (idx - BI_CLAUSE_SUBWORD_TUPLE_0); + assert(val < tuple_count); - struct bi_packed_tuple tuple = tuples[val]; + struct bi_packed_tuple tuple = tuples[val]; - assert(offset + nbits < 78); - assert(nbits <= 64); + assert(offset + nbits < 78); + assert(nbits <= 64); - /* (X >> start) & m - * = (((hi << 64) | lo) >> start) & m - * = (((hi << 64) >> start) | (lo >> start)) & m - * = { ((hi << (64 - start)) | (lo >> start)) & m if start <= 64 - * { ((hi >> (start - 64)) | (lo >> start)) & m if start >= 64 - * = { ((hi << (64 - start)) & m) | ((lo >> start) & m) if start <= 64 - * { ((hi >> (start - 64)) & m) | ((lo >> start) & m) if start >= 64 - * - * By setting m = 2^64 - 1, we justify doing the respective shifts as - * 64-bit integers. Zero special cased to avoid undefined behaviour. 
- */ + /* (X >> start) & m + * = (((hi << 64) | lo) >> start) & m + * = (((hi << 64) >> start) | (lo >> start)) & m + * = { ((hi << (64 - start)) | (lo >> start)) & m if start <= 64 + * { ((hi >> (start - 64)) | (lo >> start)) & m if start >= 64 + * = { ((hi << (64 - start)) & m) | ((lo >> start) & m) if start <= 64 + * { ((hi >> (start - 64)) & m) | ((lo >> start) & m) if start >= 64 + * + * By setting m = 2^64 - 1, we justify doing the respective shifts as + * 64-bit integers. Zero special cased to avoid undefined behaviour. + */ - uint64_t lo = (tuple.lo >> offset); - uint64_t hi = (offset == 0) ? 0 - : (offset > 64) ? (tuple.hi >> (offset - 64)) - : (tuple.hi << (64 - offset)); + uint64_t lo = (tuple.lo >> offset); + uint64_t hi = (offset == 0) ? 0 + : (offset > 64) ? (tuple.hi >> (offset - 64)) + : (tuple.hi << (64 - offset)); - return (lo | hi) & ((1ULL << nbits) - 1); + return (lo | hi) & ((1ULL << nbits) - 1); } static inline uint16_t -bi_pack_lu(enum bi_clause_subword word, - struct bi_packed_tuple *tuples, - ASSERTED unsigned tuple_count) +bi_pack_lu(enum bi_clause_subword word, struct bi_packed_tuple *tuples, + ASSERTED unsigned tuple_count) { - return (word >= BI_CLAUSE_SUBWORD_UPPER_0) ? - bi_pack_upper(word, tuples, tuple_count) : - bi_pack_literal(word); + return (word >= BI_CLAUSE_SUBWORD_UPPER_0) + ? bi_pack_upper(word, tuples, tuple_count) + : bi_pack_literal(word); } uint8_t -bi_pack_sync(enum bi_clause_subword t1, - enum bi_clause_subword t2, - enum bi_clause_subword t3, - struct bi_packed_tuple *tuples, - ASSERTED unsigned tuple_count, - bool z) +bi_pack_sync(enum bi_clause_subword t1, enum bi_clause_subword t2, + enum bi_clause_subword t3, struct bi_packed_tuple *tuples, + ASSERTED unsigned tuple_count, bool z) { - uint8_t sync = - (bi_pack_lu(t3, tuples, tuple_count) << 0) | - (bi_pack_lu(t2, tuples, tuple_count) << 3); + uint8_t sync = (bi_pack_lu(t3, tuples, tuple_count) << 0) | + (bi_pack_lu(t2, tuples, tuple_count) << 3); - if (t1 == BI_CLAUSE_SUBWORD_Z) - sync |= z << 6; - else - sync |= bi_pack_literal(t1) << 6; + if (t1 == BI_CLAUSE_SUBWORD_Z) + sync |= z << 6; + else + sync |= bi_pack_literal(t1) << 6; - return sync; + return sync; } static inline uint64_t -bi_pack_t_ec(enum bi_clause_subword word, - struct bi_packed_tuple *tuples, - ASSERTED unsigned tuple_count, - uint64_t ec0) +bi_pack_t_ec(enum bi_clause_subword word, struct bi_packed_tuple *tuples, + ASSERTED unsigned tuple_count, uint64_t ec0) { - if (word == BI_CLAUSE_SUBWORD_CONSTANT) - return ec0; - else - return bi_pack_tuple_bits(word, tuples, tuple_count, 0, 60); + if (word == BI_CLAUSE_SUBWORD_CONSTANT) + return ec0; + else + return bi_pack_tuple_bits(word, tuples, tuple_count, 0, 60); } static uint32_t -bi_pack_subwords_56(enum bi_clause_subword t, - struct bi_packed_tuple *tuples, - ASSERTED unsigned tuple_count, - uint64_t header, uint64_t ec0, - unsigned tuple_subword) +bi_pack_subwords_56(enum bi_clause_subword t, struct bi_packed_tuple *tuples, + ASSERTED unsigned tuple_count, uint64_t header, + uint64_t ec0, unsigned tuple_subword) { - switch (t) { - case BI_CLAUSE_SUBWORD_HEADER: - return (header & ((1 << 30) - 1)); - case BI_CLAUSE_SUBWORD_RESERVED: - return 0; - case BI_CLAUSE_SUBWORD_CONSTANT: - return (ec0 >> 15) & ((1 << 30) - 1); - default: - return bi_pack_tuple_bits(t, tuples, tuple_count, tuple_subword * 15, 30); - } + switch (t) { + case BI_CLAUSE_SUBWORD_HEADER: + return (header & ((1 << 30) - 1)); + case BI_CLAUSE_SUBWORD_RESERVED: + return 0; + case 
BI_CLAUSE_SUBWORD_CONSTANT: + return (ec0 >> 15) & ((1 << 30) - 1); + default: + return bi_pack_tuple_bits(t, tuples, tuple_count, tuple_subword * 15, 30); + } } static uint16_t bi_pack_subword(enum bi_clause_subword t, unsigned format, - struct bi_packed_tuple *tuples, - ASSERTED unsigned tuple_count, + struct bi_packed_tuple *tuples, ASSERTED unsigned tuple_count, uint64_t header, uint64_t ec0, unsigned m0, unsigned tuple_subword) { - switch (t) { - case BI_CLAUSE_SUBWORD_HEADER: - return header >> 30; - case BI_CLAUSE_SUBWORD_M: - return m0; - case BI_CLAUSE_SUBWORD_CONSTANT: - return (format == 5 || format == 10) ? - (ec0 & ((1 << 15) - 1)) : - (ec0 >> (15 + 30)); - case BI_CLAUSE_SUBWORD_UPPER_23: - return (bi_clause_upper(2, tuples, tuple_count) << 12) | - (bi_clause_upper(3, tuples, tuple_count) << 9); - case BI_CLAUSE_SUBWORD_UPPER_56: - return (bi_clause_upper(5, tuples, tuple_count) << 12) | - (bi_clause_upper(6, tuples, tuple_count) << 9); - case BI_CLAUSE_SUBWORD_UPPER_0 ... BI_CLAUSE_SUBWORD_UPPER_7: - return bi_pack_upper(t, tuples, tuple_count) << 12; - default: - return bi_pack_tuple_bits(t, tuples, tuple_count, tuple_subword * 15, 15); - } + switch (t) { + case BI_CLAUSE_SUBWORD_HEADER: + return header >> 30; + case BI_CLAUSE_SUBWORD_M: + return m0; + case BI_CLAUSE_SUBWORD_CONSTANT: + return (format == 5 || format == 10) ? (ec0 & ((1 << 15) - 1)) + : (ec0 >> (15 + 30)); + case BI_CLAUSE_SUBWORD_UPPER_23: + return (bi_clause_upper(2, tuples, tuple_count) << 12) | + (bi_clause_upper(3, tuples, tuple_count) << 9); + case BI_CLAUSE_SUBWORD_UPPER_56: + return (bi_clause_upper(5, tuples, tuple_count) << 12) | + (bi_clause_upper(6, tuples, tuple_count) << 9); + case BI_CLAUSE_SUBWORD_UPPER_0 ... BI_CLAUSE_SUBWORD_UPPER_7: + return bi_pack_upper(t, tuples, tuple_count) << 12; + default: + return bi_pack_tuple_bits(t, tuples, tuple_count, tuple_subword * 15, 15); + } } /* EC0 is 60-bits (bottom 4 already shifted off) */ void -bi_pack_format(struct util_dynarray *emission, - unsigned index, - struct bi_packed_tuple *tuples, - ASSERTED unsigned tuple_count, - uint64_t header, uint64_t ec0, - unsigned m0, bool z) +bi_pack_format(struct util_dynarray *emission, unsigned index, + struct bi_packed_tuple *tuples, ASSERTED unsigned tuple_count, + uint64_t header, uint64_t ec0, unsigned m0, bool z) { - struct bi_clause_format format = bi_clause_formats[index]; + struct bi_clause_format format = bi_clause_formats[index]; - uint8_t sync = bi_pack_sync(format.tag_1, format.tag_2, format.tag_3, - tuples, tuple_count, z); + uint8_t sync = bi_pack_sync(format.tag_1, format.tag_2, format.tag_3, tuples, + tuple_count, z); - uint64_t s0_s3 = bi_pack_t_ec(format.s0_s3, tuples, tuple_count, ec0); + uint64_t s0_s3 = bi_pack_t_ec(format.s0_s3, tuples, tuple_count, ec0); - uint16_t s4 = bi_pack_subword(format.s4, format.format, tuples, tuple_count, header, ec0, m0, 4); + uint16_t s4 = bi_pack_subword(format.s4, format.format, tuples, tuple_count, + header, ec0, m0, 4); - uint32_t s5_s6 = bi_pack_subwords_56(format.s5_s6, - tuples, tuple_count, header, ec0, - (format.format == 2 || format.format == 7) ? 0 : 3); + uint32_t s5_s6 = + bi_pack_subwords_56(format.s5_s6, tuples, tuple_count, header, ec0, + (format.format == 2 || format.format == 7) ? 
0 : 3); - uint64_t s7 = bi_pack_subword(format.s7, format.format, tuples, tuple_count, header, ec0, m0, 2); + uint64_t s7 = bi_pack_subword(format.s7, format.format, tuples, tuple_count, + header, ec0, m0, 2); - /* Now that subwords are packed, split into 64-bit halves and emit */ - uint64_t lo = sync | ((s0_s3 & ((1ull << 56) - 1)) << 8); - uint64_t hi = (s0_s3 >> 56) | ((uint64_t) s4 << 4) | ((uint64_t) s5_s6 << 19) | ((uint64_t) s7 << 49); + /* Now that subwords are packed, split into 64-bit halves and emit */ + uint64_t lo = sync | ((s0_s3 & ((1ull << 56) - 1)) << 8); + uint64_t hi = (s0_s3 >> 56) | ((uint64_t)s4 << 4) | ((uint64_t)s5_s6 << 19) | + ((uint64_t)s7 << 49); - util_dynarray_append(emission, uint64_t, lo); - util_dynarray_append(emission, uint64_t, hi); + util_dynarray_append(emission, uint64_t, lo); + util_dynarray_append(emission, uint64_t, hi); } static void -bi_pack_clause(bi_context *ctx, bi_clause *clause, - bi_clause *next_1, bi_clause *next_2, - struct util_dynarray *emission, gl_shader_stage stage) +bi_pack_clause(bi_context *ctx, bi_clause *clause, bi_clause *next_1, + bi_clause *next_2, struct util_dynarray *emission, + gl_shader_stage stage) { - struct bi_packed_tuple ins[8] = { 0 }; + struct bi_packed_tuple ins[8] = {0}; - for (unsigned i = 0; i < clause->tuple_count; ++i) { - unsigned prev = ((i == 0) ? clause->tuple_count : i) - 1; - ins[i] = bi_pack_tuple(clause, &clause->tuples[i], - &clause->tuples[prev], i == 0, stage); + for (unsigned i = 0; i < clause->tuple_count; ++i) { + unsigned prev = ((i == 0) ? clause->tuple_count : i) - 1; + ins[i] = bi_pack_tuple(clause, &clause->tuples[i], &clause->tuples[prev], + i == 0, stage); - bi_instr *add = clause->tuples[i].add; + bi_instr *add = clause->tuples[i].add; - /* Different GPUs support different forms of the CLPER.i32 - * instruction. Check we use the right one for the target. - */ - if (add && add->op == BI_OPCODE_CLPER_OLD_I32) - assert(ctx->quirks & BIFROST_LIMITED_CLPER); - else if (add && add->op == BI_OPCODE_CLPER_I32) - assert(!(ctx->quirks & BIFROST_LIMITED_CLPER)); - } + /* Different GPUs support different forms of the CLPER.i32 + * instruction. Check we use the right one for the target. + */ + if (add && add->op == BI_OPCODE_CLPER_OLD_I32) + assert(ctx->quirks & BIFROST_LIMITED_CLPER); + else if (add && add->op == BI_OPCODE_CLPER_I32) + assert(!(ctx->quirks & BIFROST_LIMITED_CLPER)); + } - bool ec0_packed = bi_ec0_packed(clause->tuple_count); + bool ec0_packed = bi_ec0_packed(clause->tuple_count); - if (ec0_packed) - clause->constant_count = MAX2(clause->constant_count, 1); + if (ec0_packed) + clause->constant_count = MAX2(clause->constant_count, 1); - unsigned constant_quads = - DIV_ROUND_UP(clause->constant_count - (ec0_packed ? 1 : 0), 2); + unsigned constant_quads = + DIV_ROUND_UP(clause->constant_count - (ec0_packed ? 1 : 0), 2); - uint64_t header = bi_pack_header(clause, next_1, next_2); - uint64_t ec0 = (clause->constants[0] >> 4); - unsigned m0 = (clause->pcrel_idx == 0) ? 4 : 0; + uint64_t header = bi_pack_header(clause, next_1, next_2); + uint64_t ec0 = (clause->constants[0] >> 4); + unsigned m0 = (clause->pcrel_idx == 0) ? 
4 : 0; - unsigned counts[8] = { - 1, 2, 3, 3, 4, 5, 5, 6, - }; + unsigned counts[8] = { + 1, 2, 3, 3, 4, 5, 5, 6, + }; - unsigned indices[8][6] = { - { 1 }, - { 0, 2 }, - { 0, 3, 4 }, - { 0, 3, 6 }, - { 0, 3, 7, 8 }, - { 0, 3, 5, 9, 10 }, - { 0, 3, 5, 9, 11 }, - { 0, 3, 5, 9, 12, 13 }, - }; + unsigned indices[8][6] = { + {1}, {0, 2}, {0, 3, 4}, {0, 3, 6}, + {0, 3, 7, 8}, {0, 3, 5, 9, 10}, {0, 3, 5, 9, 11}, {0, 3, 5, 9, 12, 13}, + }; - unsigned count = counts[clause->tuple_count - 1]; + unsigned count = counts[clause->tuple_count - 1]; - for (unsigned pos = 0; pos < count; ++pos) { - ASSERTED unsigned idx = indices[clause->tuple_count - 1][pos]; - assert(bi_clause_formats[idx].pos == pos); - assert((bi_clause_formats[idx].tag_1 == BI_CLAUSE_SUBWORD_Z) == - (pos == count - 1)); + for (unsigned pos = 0; pos < count; ++pos) { + ASSERTED unsigned idx = indices[clause->tuple_count - 1][pos]; + assert(bi_clause_formats[idx].pos == pos); + assert((bi_clause_formats[idx].tag_1 == BI_CLAUSE_SUBWORD_Z) == + (pos == count - 1)); - /* Whether to end the clause immediately after the last tuple */ - bool z = (constant_quads == 0); + /* Whether to end the clause immediately after the last tuple */ + bool z = (constant_quads == 0); - bi_pack_format(emission, indices[clause->tuple_count - 1][pos], - ins, clause->tuple_count, header, ec0, m0, - z); - } + bi_pack_format(emission, indices[clause->tuple_count - 1][pos], ins, + clause->tuple_count, header, ec0, m0, z); + } - /* Pack the remaining constants */ + /* Pack the remaining constants */ - for (unsigned pos = 0; pos < constant_quads; ++pos) { - bi_pack_constants(clause->tuple_count, clause->constants, - pos, constant_quads, ec0_packed, emission); - } + for (unsigned pos = 0; pos < constant_quads; ++pos) { + bi_pack_constants(clause->tuple_count, clause->constants, pos, + constant_quads, ec0_packed, emission); + } } static void bi_collect_blend_ret_addr(bi_context *ctx, struct util_dynarray *emission, const bi_clause *clause) { - /* No need to collect return addresses when we're in a blend shader. */ - if (ctx->inputs->is_blend) - return; + /* No need to collect return addresses when we're in a blend shader. 
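    * (Otherwise, the offset just past each clause whose last tuple ends in
    * BLEND is recorded per render target in ctx->info.bifrost->blend[] below.)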
*/ + if (ctx->inputs->is_blend) + return; - const bi_tuple *tuple = &clause->tuples[clause->tuple_count - 1]; - const bi_instr *ins = tuple->add; + const bi_tuple *tuple = &clause->tuples[clause->tuple_count - 1]; + const bi_instr *ins = tuple->add; - if (!ins || ins->op != BI_OPCODE_BLEND) - return; + if (!ins || ins->op != BI_OPCODE_BLEND) + return; - - unsigned loc = tuple->regs.fau_idx - BIR_FAU_BLEND_0; - assert(loc < ARRAY_SIZE(ctx->info.bifrost->blend)); - assert(!ctx->info.bifrost->blend[loc].return_offset); - ctx->info.bifrost->blend[loc].return_offset = - util_dynarray_num_elements(emission, uint8_t); - assert(!(ctx->info.bifrost->blend[loc].return_offset & 0x7)); + unsigned loc = tuple->regs.fau_idx - BIR_FAU_BLEND_0; + assert(loc < ARRAY_SIZE(ctx->info.bifrost->blend)); + assert(!ctx->info.bifrost->blend[loc].return_offset); + ctx->info.bifrost->blend[loc].return_offset = + util_dynarray_num_elements(emission, uint8_t); + assert(!(ctx->info.bifrost->blend[loc].return_offset & 0x7)); } /* @@ -740,50 +710,49 @@ bi_collect_blend_ret_addr(bi_context *ctx, struct util_dynarray *emission, static void bi_lower_texc_dual(bi_context *ctx) { - bi_foreach_instr_global(ctx, I) { - if (I->op == BI_OPCODE_TEXC_DUAL) { - /* In hardware, TEXC has 1 destination */ - I->op = BI_OPCODE_TEXC; - bi_drop_dests(I, 1); - } - } + bi_foreach_instr_global(ctx, I) { + if (I->op == BI_OPCODE_TEXC_DUAL) { + /* In hardware, TEXC has 1 destination */ + I->op = BI_OPCODE_TEXC; + bi_drop_dests(I, 1); + } + } } unsigned bi_pack(bi_context *ctx, struct util_dynarray *emission) { - unsigned previous_size = emission->size; + unsigned previous_size = emission->size; - bi_lower_texc_dual(ctx); + bi_lower_texc_dual(ctx); - bi_foreach_block(ctx, block) { - bi_assign_branch_offset(ctx, block); + bi_foreach_block(ctx, block) { + bi_assign_branch_offset(ctx, block); - bi_foreach_clause_in_block(block, clause) { - bool is_last = (clause->link.next == &block->clauses); + bi_foreach_clause_in_block(block, clause) { + bool is_last = (clause->link.next == &block->clauses); - /* Get the succeeding clauses, either two successors of - * the block for the last clause in the block or just - * the next clause within the block */ + /* Get the succeeding clauses, either two successors of + * the block for the last clause in the block or just + * the next clause within the block */ - bi_clause *next = NULL, *next_2 = NULL; + bi_clause *next = NULL, *next_2 = NULL; - if (is_last) { - next = bi_next_clause(ctx, block->successors[0], NULL); - next_2 = bi_next_clause(ctx, block->successors[1], NULL); - } else { - next = bi_next_clause(ctx, block, clause); - } + if (is_last) { + next = bi_next_clause(ctx, block->successors[0], NULL); + next_2 = bi_next_clause(ctx, block->successors[1], NULL); + } else { + next = bi_next_clause(ctx, block, clause); + } + previous_size = emission->size; - previous_size = emission->size; + bi_pack_clause(ctx, clause, next, next_2, emission, ctx->stage); - bi_pack_clause(ctx, clause, next, next_2, emission, ctx->stage); + if (!is_last) + bi_collect_blend_ret_addr(ctx, emission, clause); + } + } - if (!is_last) - bi_collect_blend_ret_addr(ctx, emission, clause); - } - } - - return emission->size - previous_size; + return emission->size - previous_size; } diff --git a/src/panfrost/bifrost/bi_pressure_schedule.c b/src/panfrost/bifrost/bi_pressure_schedule.c index a4748e8bed6..1fa3134fecf 100644 --- a/src/panfrost/bifrost/bi_pressure_schedule.c +++ b/src/panfrost/bifrost/bi_pressure_schedule.c @@ -26,149 +26,148 @@ 
/* Bottom-up local scheduler to reduce register pressure */ -#include "compiler.h" #include "util/dag.h" +#include "compiler.h" struct sched_ctx { - /* Dependency graph */ - struct dag *dag; + /* Dependency graph */ + struct dag *dag; - /* Live set */ - BITSET_WORD *live; + /* Live set */ + BITSET_WORD *live; }; struct sched_node { - struct dag_node dag; + struct dag_node dag; - /* Instruction this node represents */ - bi_instr *instr; + /* Instruction this node represents */ + bi_instr *instr; }; static void add_dep(struct sched_node *a, struct sched_node *b) { - if (a && b) - dag_add_edge(&a->dag, &b->dag, 0); + if (a && b) + dag_add_edge(&a->dag, &b->dag, 0); } static struct dag * create_dag(bi_context *ctx, bi_block *block, void *memctx) { - struct dag *dag = dag_create(ctx); + struct dag *dag = dag_create(ctx); - struct sched_node **last_write = - calloc(ctx->ssa_alloc, sizeof(struct sched_node *)); - struct sched_node *coverage = NULL; - struct sched_node *preload = NULL; + struct sched_node **last_write = + calloc(ctx->ssa_alloc, sizeof(struct sched_node *)); + struct sched_node *coverage = NULL; + struct sched_node *preload = NULL; - /* Last memory load, to serialize stores against */ - struct sched_node *memory_load = NULL; + /* Last memory load, to serialize stores against */ + struct sched_node *memory_load = NULL; - /* Last memory store, to serialize loads and stores against */ - struct sched_node *memory_store = NULL; + /* Last memory store, to serialize loads and stores against */ + struct sched_node *memory_store = NULL; - bi_foreach_instr_in_block(block, I) { - /* Leave branches at the end */ - if (I->op == BI_OPCODE_JUMP || bi_opcode_props[I->op].branch) - break; + bi_foreach_instr_in_block(block, I) { + /* Leave branches at the end */ + if (I->op == BI_OPCODE_JUMP || bi_opcode_props[I->op].branch) + break; - assert(I->branch_target == NULL); + assert(I->branch_target == NULL); - struct sched_node *node = rzalloc(memctx, struct sched_node); - node->instr = I; - dag_init_node(dag, &node->dag); + struct sched_node *node = rzalloc(memctx, struct sched_node); + node->instr = I; + dag_init_node(dag, &node->dag); - /* Reads depend on writes, no other hazards in SSA */ - bi_foreach_ssa_src(I, s) - add_dep(node, last_write[I->src[s].value]); + /* Reads depend on writes, no other hazards in SSA */ + bi_foreach_ssa_src(I, s) + add_dep(node, last_write[I->src[s].value]); - bi_foreach_dest(I, d) - last_write[I->dest[d].value] = node; + bi_foreach_dest(I, d) + last_write[I->dest[d].value] = node; - switch (bi_opcode_props[I->op].message) { - case BIFROST_MESSAGE_LOAD: - /* Regular memory loads needs to be serialized against - * other memory access. However, UBO memory is read-only - * so it can be moved around freely. - */ - if (I->seg != BI_SEG_UBO) { - add_dep(node, memory_store); - memory_load = node; - } + switch (bi_opcode_props[I->op].message) { + case BIFROST_MESSAGE_LOAD: + /* Regular memory loads needs to be serialized against + * other memory access. However, UBO memory is read-only + * so it can be moved around freely. + */ + if (I->seg != BI_SEG_UBO) { + add_dep(node, memory_store); + memory_load = node; + } - break; + break; - case BIFROST_MESSAGE_ATTRIBUTE: - /* Regular attribute loads can be reordered, but - * writeable attributes can't be. Our one use of - * writeable attributes are images. 
- */ - if ((I->op == BI_OPCODE_LD_TEX) || - (I->op == BI_OPCODE_LD_TEX_IMM) || - (I->op == BI_OPCODE_LD_ATTR_TEX)) { - add_dep(node, memory_store); - memory_load = node; - } + case BIFROST_MESSAGE_ATTRIBUTE: + /* Regular attribute loads can be reordered, but + * writeable attributes can't be. Our one use of + * writeable attributes are images. + */ + if ((I->op == BI_OPCODE_LD_TEX) || (I->op == BI_OPCODE_LD_TEX_IMM) || + (I->op == BI_OPCODE_LD_ATTR_TEX)) { + add_dep(node, memory_store); + memory_load = node; + } - break; + break; - case BIFROST_MESSAGE_STORE: - assert(I->seg != BI_SEG_UBO); - add_dep(node, memory_load); - add_dep(node, memory_store); - memory_store = node; - break; + case BIFROST_MESSAGE_STORE: + assert(I->seg != BI_SEG_UBO); + add_dep(node, memory_load); + add_dep(node, memory_store); + memory_store = node; + break; - case BIFROST_MESSAGE_ATOMIC: - case BIFROST_MESSAGE_BARRIER: - add_dep(node, memory_load); - add_dep(node, memory_store); - memory_load = node; - memory_store = node; - break; + case BIFROST_MESSAGE_ATOMIC: + case BIFROST_MESSAGE_BARRIER: + add_dep(node, memory_load); + add_dep(node, memory_store); + memory_load = node; + memory_store = node; + break; - case BIFROST_MESSAGE_BLEND: - case BIFROST_MESSAGE_Z_STENCIL: - case BIFROST_MESSAGE_TILE: - add_dep(node, coverage); - coverage = node; - break; + case BIFROST_MESSAGE_BLEND: + case BIFROST_MESSAGE_Z_STENCIL: + case BIFROST_MESSAGE_TILE: + add_dep(node, coverage); + coverage = node; + break; - case BIFROST_MESSAGE_ATEST: - /* ATEST signals the end of shader side effects */ - add_dep(node, memory_store); - memory_store = node; + case BIFROST_MESSAGE_ATEST: + /* ATEST signals the end of shader side effects */ + add_dep(node, memory_store); + memory_store = node; - /* ATEST also updates coverage */ - add_dep(node, coverage); - coverage = node; - break; - default: - break; - } + /* ATEST also updates coverage */ + add_dep(node, coverage); + coverage = node; + break; + default: + break; + } - add_dep(node, preload); + add_dep(node, preload); - if (I->op == BI_OPCODE_DISCARD_F32) { - /* Serialize against ATEST */ - add_dep(node, coverage); - coverage = node; + if (I->op == BI_OPCODE_DISCARD_F32) { + /* Serialize against ATEST */ + add_dep(node, coverage); + coverage = node; - /* Also serialize against memory and barriers */ - add_dep(node, memory_load); - add_dep(node, memory_store); - memory_load = node; - memory_store = node; - } else if ((I->op == BI_OPCODE_PHI) || - (I->op == BI_OPCODE_MOV_I32 && - I->src[0].type == BI_INDEX_REGISTER)) { - preload = node; - } - } + /* Also serialize against memory and barriers */ + add_dep(node, memory_load); + add_dep(node, memory_store); + memory_load = node; + memory_store = node; + } else if ((I->op == BI_OPCODE_PHI) || + (I->op == BI_OPCODE_MOV_I32 && + I->src[0].type == BI_INDEX_REGISTER)) { + preload = node; + } + } - free(last_write); + free(last_write); - return dag; + return dag; } /* @@ -183,30 +182,30 @@ create_dag(bi_context *ctx, bi_block *block, void *memctx) static signed calculate_pressure_delta(bi_instr *I, BITSET_WORD *live) { - signed delta = 0; + signed delta = 0; - /* Destinations must be unique */ - bi_foreach_dest(I, d) { - if (BITSET_TEST(live, I->dest[d].value)) - delta -= bi_count_write_registers(I, d); - } + /* Destinations must be unique */ + bi_foreach_dest(I, d) { + if (BITSET_TEST(live, I->dest[d].value)) + delta -= bi_count_write_registers(I, d); + } - bi_foreach_ssa_src(I, src) { - /* Filter duplicates */ - bool dupe = false; + 
bi_foreach_ssa_src(I, src) { + /* Filter duplicates */ + bool dupe = false; - for (unsigned i = 0; i < src; ++i) { - if (bi_is_equiv(I->src[i], I->src[src])) { - dupe = true; - break; - } - } + for (unsigned i = 0; i < src; ++i) { + if (bi_is_equiv(I->src[i], I->src[src])) { + dupe = true; + break; + } + } - if (!dupe && !BITSET_TEST(live, I->src[src].value)) - delta += bi_count_read_registers(I, src); - } + if (!dupe && !BITSET_TEST(live, I->src[src].value)) + delta += bi_count_read_registers(I, src); + } - return delta; + return delta; } /* @@ -216,87 +215,88 @@ calculate_pressure_delta(bi_instr *I, BITSET_WORD *live) static struct sched_node * choose_instr(struct sched_ctx *s) { - int32_t min_delta = INT32_MAX; - struct sched_node *best = NULL; + int32_t min_delta = INT32_MAX; + struct sched_node *best = NULL; - list_for_each_entry(struct sched_node, n, &s->dag->heads, dag.link) { - int32_t delta = calculate_pressure_delta(n->instr, s->live); + list_for_each_entry(struct sched_node, n, &s->dag->heads, dag.link) { + int32_t delta = calculate_pressure_delta(n->instr, s->live); - if (delta < min_delta) { - best = n; - min_delta = delta; - } - } + if (delta < min_delta) { + best = n; + min_delta = delta; + } + } - return best; + return best; } static void pressure_schedule_block(bi_context *ctx, bi_block *block, struct sched_ctx *s) { - /* off by a constant, that's ok */ - signed pressure = 0; - signed orig_max_pressure = 0; - unsigned nr_ins = 0; + /* off by a constant, that's ok */ + signed pressure = 0; + signed orig_max_pressure = 0; + unsigned nr_ins = 0; - memcpy(s->live, block->ssa_live_out, BITSET_WORDS(ctx->ssa_alloc) * sizeof(BITSET_WORD)); + memcpy(s->live, block->ssa_live_out, + BITSET_WORDS(ctx->ssa_alloc) * sizeof(BITSET_WORD)); - bi_foreach_instr_in_block_rev(block, I) { - pressure += calculate_pressure_delta(I, s->live); - orig_max_pressure = MAX2(pressure, orig_max_pressure); - bi_liveness_ins_update_ssa(s->live, I); - nr_ins++; - } + bi_foreach_instr_in_block_rev(block, I) { + pressure += calculate_pressure_delta(I, s->live); + orig_max_pressure = MAX2(pressure, orig_max_pressure); + bi_liveness_ins_update_ssa(s->live, I); + nr_ins++; + } - memcpy(s->live, block->ssa_live_out, BITSET_WORDS(ctx->ssa_alloc) * sizeof(BITSET_WORD)); + memcpy(s->live, block->ssa_live_out, + BITSET_WORDS(ctx->ssa_alloc) * sizeof(BITSET_WORD)); - /* off by a constant, that's ok */ - signed max_pressure = 0; - pressure = 0; + /* off by a constant, that's ok */ + signed max_pressure = 0; + pressure = 0; - struct sched_node **schedule = calloc(nr_ins, sizeof(struct sched_node *)); - nr_ins = 0; + struct sched_node **schedule = calloc(nr_ins, sizeof(struct sched_node *)); + nr_ins = 0; - while (!list_is_empty(&s->dag->heads)) { - struct sched_node *node = choose_instr(s); - pressure += calculate_pressure_delta(node->instr, s->live); - max_pressure = MAX2(pressure, max_pressure); - dag_prune_head(s->dag, &node->dag); + while (!list_is_empty(&s->dag->heads)) { + struct sched_node *node = choose_instr(s); + pressure += calculate_pressure_delta(node->instr, s->live); + max_pressure = MAX2(pressure, max_pressure); + dag_prune_head(s->dag, &node->dag); - schedule[nr_ins++] = node; - bi_liveness_ins_update_ssa(s->live, node->instr); - } + schedule[nr_ins++] = node; + bi_liveness_ins_update_ssa(s->live, node->instr); + } - /* Bail if it looks like it's worse */ - if (max_pressure >= orig_max_pressure) { - free(schedule); - return; - } + /* Bail if it looks like it's worse */ + if (max_pressure >= 
orig_max_pressure) { + free(schedule); + return; + } - /* Apply the schedule */ - for (unsigned i = 0; i < nr_ins; ++i) { - bi_remove_instruction(schedule[i]->instr); - list_add(&schedule[i]->instr->link, &block->instructions); - } + /* Apply the schedule */ + for (unsigned i = 0; i < nr_ins; ++i) { + bi_remove_instruction(schedule[i]->instr); + list_add(&schedule[i]->instr->link, &block->instructions); + } - free(schedule); + free(schedule); } void bi_pressure_schedule(bi_context *ctx) { - bi_compute_liveness_ssa(ctx); - void *memctx = ralloc_context(ctx); - BITSET_WORD *live = ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(ctx->ssa_alloc)); + bi_compute_liveness_ssa(ctx); + void *memctx = ralloc_context(ctx); + BITSET_WORD *live = + ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(ctx->ssa_alloc)); - bi_foreach_block(ctx, block) { - struct sched_ctx sctx = { - .dag = create_dag(ctx, block, memctx), - .live = live - }; + bi_foreach_block(ctx, block) { + struct sched_ctx sctx = {.dag = create_dag(ctx, block, memctx), + .live = live}; - pressure_schedule_block(ctx, block, &sctx); - } + pressure_schedule_block(ctx, block, &sctx); + } - ralloc_free(memctx); + ralloc_free(memctx); } diff --git a/src/panfrost/bifrost/bi_print.c b/src/panfrost/bifrost/bi_print.c index 8aa0293dfe1..5d997c79dd3 100644 --- a/src/panfrost/bifrost/bi_print.c +++ b/src/panfrost/bifrost/bi_print.c @@ -24,177 +24,179 @@ * SOFTWARE. */ -#include "compiler.h" #include "bi_print_common.h" +#include "compiler.h" static const char * bi_reg_op_name(enum bifrost_reg_op op) { - switch (op) { - case BIFROST_OP_IDLE: return "idle"; - case BIFROST_OP_READ: return "read"; - case BIFROST_OP_WRITE: return "write"; - case BIFROST_OP_WRITE_LO: return "write lo"; - case BIFROST_OP_WRITE_HI: return "write hi"; - default: return "invalid"; - } + switch (op) { + case BIFROST_OP_IDLE: + return "idle"; + case BIFROST_OP_READ: + return "read"; + case BIFROST_OP_WRITE: + return "write"; + case BIFROST_OP_WRITE_LO: + return "write lo"; + case BIFROST_OP_WRITE_HI: + return "write hi"; + default: + return "invalid"; + } } void bi_print_slots(bi_registers *regs, FILE *fp) { - for (unsigned i = 0; i < 2; ++i) { - if (regs->enabled[i]) - fprintf(fp, "slot %u: %u\n", i, regs->slot[i]); - } + for (unsigned i = 0; i < 2; ++i) { + if (regs->enabled[i]) + fprintf(fp, "slot %u: %u\n", i, regs->slot[i]); + } - if (regs->slot23.slot2) { - fprintf(fp, "slot 2 (%s%s): %u\n", - bi_reg_op_name(regs->slot23.slot2), - regs->slot23.slot2 >= BIFROST_OP_WRITE ? - " FMA": "", - regs->slot[2]); - } + if (regs->slot23.slot2) { + fprintf(fp, "slot 2 (%s%s): %u\n", bi_reg_op_name(regs->slot23.slot2), + regs->slot23.slot2 >= BIFROST_OP_WRITE ? " FMA" : "", + regs->slot[2]); + } - if (regs->slot23.slot3) { - fprintf(fp, "slot 3 (%s %s): %u\n", - bi_reg_op_name(regs->slot23.slot3), - regs->slot23.slot3_fma ? "FMA" : "ADD", - regs->slot[3]); - } + if (regs->slot23.slot3) { + fprintf(fp, "slot 3 (%s %s): %u\n", bi_reg_op_name(regs->slot23.slot3), + regs->slot23.slot3_fma ? "FMA" : "ADD", regs->slot[3]); + } } void bi_print_tuple(bi_tuple *tuple, FILE *fp) { - bi_instr *ins[2] = { tuple->fma, tuple->add }; + bi_instr *ins[2] = {tuple->fma, tuple->add}; - for (unsigned i = 0; i < 2; ++i) { - fprintf(fp, (i == 0) ? "\t* " : "\t+ "); + for (unsigned i = 0; i < 2; ++i) { + fprintf(fp, (i == 0) ? 
"\t* " : "\t+ "); - if (ins[i]) - bi_print_instr(ins[i], fp); - else - fprintf(fp, "NOP\n"); - } + if (ins[i]) + bi_print_instr(ins[i], fp); + else + fprintf(fp, "NOP\n"); + } } void bi_print_clause(bi_clause *clause, FILE *fp) { - fprintf(fp, "id(%u)", clause->scoreboard_id); + fprintf(fp, "id(%u)", clause->scoreboard_id); - if (clause->dependencies) { - fprintf(fp, " wait("); + if (clause->dependencies) { + fprintf(fp, " wait("); - for (unsigned i = 0; i < 8; ++i) { - if (clause->dependencies & (1 << i)) - fprintf(fp, "%u ", i); - } + for (unsigned i = 0; i < 8; ++i) { + if (clause->dependencies & (1 << i)) + fprintf(fp, "%u ", i); + } - fprintf(fp, ")"); - } + fprintf(fp, ")"); + } - fprintf(fp, " %s", bi_flow_control_name(clause->flow_control)); + fprintf(fp, " %s", bi_flow_control_name(clause->flow_control)); - if (!clause->next_clause_prefetch) - fprintf(fp, " no_prefetch"); + if (!clause->next_clause_prefetch) + fprintf(fp, " no_prefetch"); - if (clause->staging_barrier) - fprintf(fp, " osrb"); + if (clause->staging_barrier) + fprintf(fp, " osrb"); - if (clause->td) - fprintf(fp, " td"); + if (clause->td) + fprintf(fp, " td"); - if (clause->pcrel_idx != ~0) - fprintf(fp, " pcrel(%u)", clause->pcrel_idx); + if (clause->pcrel_idx != ~0) + fprintf(fp, " pcrel(%u)", clause->pcrel_idx); - fprintf(fp, "\n"); + fprintf(fp, "\n"); - for (unsigned i = 0; i < clause->tuple_count; ++i) - bi_print_tuple(&clause->tuples[i], fp); + for (unsigned i = 0; i < clause->tuple_count; ++i) + bi_print_tuple(&clause->tuples[i], fp); - if (clause->constant_count) { - for (unsigned i = 0; i < clause->constant_count; ++i) - fprintf(fp, "%" PRIx64 " ", clause->constants[i]); + if (clause->constant_count) { + for (unsigned i = 0; i < clause->constant_count; ++i) + fprintf(fp, "%" PRIx64 " ", clause->constants[i]); - if (clause->branch_constant) - fprintf(fp, "*"); + if (clause->branch_constant) + fprintf(fp, "*"); - fprintf(fp, "\n"); - } + fprintf(fp, "\n"); + } - fprintf(fp, "\n"); + fprintf(fp, "\n"); } static void -bi_print_scoreboard_line(unsigned slot, const char *name, uint64_t mask, FILE *fp) +bi_print_scoreboard_line(unsigned slot, const char *name, uint64_t mask, + FILE *fp) { - if (!mask) - return; + if (!mask) + return; - fprintf(fp, "slot %u %s:", slot, name); + fprintf(fp, "slot %u %s:", slot, name); - u_foreach_bit64(reg, mask) - fprintf(fp, " r%" PRId64, reg); + u_foreach_bit64(reg, mask) fprintf(fp, " r%" PRId64, reg); - fprintf(fp, "\n"); + fprintf(fp, "\n"); } static void bi_print_scoreboard(struct bi_scoreboard_state *state, FILE *fp) { - for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) { - bi_print_scoreboard_line(i, "reads", state->read[i], fp); - bi_print_scoreboard_line(i, "writes", state->write[i], fp); - } + for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) { + bi_print_scoreboard_line(i, "reads", state->read[i], fp); + bi_print_scoreboard_line(i, "writes", state->write[i], fp); + } } void bi_print_block(bi_block *block, FILE *fp) { - if (block->scheduled) { - bi_print_scoreboard(&block->scoreboard_in, fp); - fprintf(fp, "\n"); - } + if (block->scheduled) { + bi_print_scoreboard(&block->scoreboard_in, fp); + fprintf(fp, "\n"); + } - fprintf(fp, "block%u {\n", block->index); + fprintf(fp, "block%u {\n", block->index); - if (block->scheduled) { - bi_foreach_clause_in_block(block, clause) - bi_print_clause(clause, fp); - } else { - bi_foreach_instr_in_block(block, ins) - bi_print_instr((bi_instr *) ins, fp); - } + if (block->scheduled) { + bi_foreach_clause_in_block(block, clause) + 
bi_print_clause(clause, fp); + } else { + bi_foreach_instr_in_block(block, ins) + bi_print_instr((bi_instr *)ins, fp); + } - fprintf(fp, "}"); + fprintf(fp, "}"); - if (block->successors[0]) { - fprintf(fp, " -> "); + if (block->successors[0]) { + fprintf(fp, " -> "); - bi_foreach_successor((block), succ) - fprintf(fp, "block%u ", succ->index); - } + bi_foreach_successor((block), succ) + fprintf(fp, "block%u ", succ->index); + } - if (bi_num_predecessors(block)) { - fprintf(fp, " from"); + if (bi_num_predecessors(block)) { + fprintf(fp, " from"); - bi_foreach_predecessor(block, pred) - fprintf(fp, " block%u", (*pred)->index); - } + bi_foreach_predecessor(block, pred) + fprintf(fp, " block%u", (*pred)->index); + } - if (block->scheduled) { - fprintf(fp, "\n"); - bi_print_scoreboard(&block->scoreboard_out, fp); - } + if (block->scheduled) { + fprintf(fp, "\n"); + bi_print_scoreboard(&block->scoreboard_out, fp); + } - fprintf(fp, "\n\n"); + fprintf(fp, "\n\n"); } void bi_print_shader(bi_context *ctx, FILE *fp) { - bi_foreach_block(ctx, block) - bi_print_block(block, fp); + bi_foreach_block(ctx, block) + bi_print_block(block, fp); } diff --git a/src/panfrost/bifrost/bi_print_common.c b/src/panfrost/bifrost/bi_print_common.c index 91589afa749..b3f5e0b52c5 100644 --- a/src/panfrost/bifrost/bi_print_common.c +++ b/src/panfrost/bifrost/bi_print_common.c @@ -31,38 +31,63 @@ const char * bi_message_type_name(enum bifrost_message_type T) { - switch (T) { - case BIFROST_MESSAGE_NONE: return ""; - case BIFROST_MESSAGE_VARYING: return "vary"; - case BIFROST_MESSAGE_ATTRIBUTE: return "attr"; - case BIFROST_MESSAGE_TEX: return "tex"; - case BIFROST_MESSAGE_VARTEX: return "vartex"; - case BIFROST_MESSAGE_LOAD: return "load"; - case BIFROST_MESSAGE_STORE: return "store"; - case BIFROST_MESSAGE_ATOMIC: return "atomic"; - case BIFROST_MESSAGE_BARRIER: return "barrier"; - case BIFROST_MESSAGE_BLEND: return "blend"; - case BIFROST_MESSAGE_TILE: return "tile"; - case BIFROST_MESSAGE_Z_STENCIL: return "z_stencil"; - case BIFROST_MESSAGE_ATEST: return "atest"; - case BIFROST_MESSAGE_JOB: return "job"; - case BIFROST_MESSAGE_64BIT: return "64"; - default: return "XXX reserved"; - } + switch (T) { + case BIFROST_MESSAGE_NONE: + return ""; + case BIFROST_MESSAGE_VARYING: + return "vary"; + case BIFROST_MESSAGE_ATTRIBUTE: + return "attr"; + case BIFROST_MESSAGE_TEX: + return "tex"; + case BIFROST_MESSAGE_VARTEX: + return "vartex"; + case BIFROST_MESSAGE_LOAD: + return "load"; + case BIFROST_MESSAGE_STORE: + return "store"; + case BIFROST_MESSAGE_ATOMIC: + return "atomic"; + case BIFROST_MESSAGE_BARRIER: + return "barrier"; + case BIFROST_MESSAGE_BLEND: + return "blend"; + case BIFROST_MESSAGE_TILE: + return "tile"; + case BIFROST_MESSAGE_Z_STENCIL: + return "z_stencil"; + case BIFROST_MESSAGE_ATEST: + return "atest"; + case BIFROST_MESSAGE_JOB: + return "job"; + case BIFROST_MESSAGE_64BIT: + return "64"; + default: + return "XXX reserved"; + } } const char * bi_flow_control_name(enum bifrost_flow mode) { - switch (mode) { - case BIFROST_FLOW_END: return "eos"; - case BIFROST_FLOW_NBTB_PC: return "nbb br_pc"; - case BIFROST_FLOW_NBTB_UNCONDITIONAL: return "nbb r_uncond"; - case BIFROST_FLOW_NBTB: return "nbb"; - case BIFROST_FLOW_BTB_UNCONDITIONAL: return "bb r_uncond"; - case BIFROST_FLOW_BTB_NONE: return "bb"; - case BIFROST_FLOW_WE_UNCONDITIONAL: return "we r_uncond"; - case BIFROST_FLOW_WE: return "we"; - default: return "XXX"; - } + switch (mode) { + case BIFROST_FLOW_END: + return "eos"; + case 
BIFROST_FLOW_NBTB_PC: + return "nbb br_pc"; + case BIFROST_FLOW_NBTB_UNCONDITIONAL: + return "nbb r_uncond"; + case BIFROST_FLOW_NBTB: + return "nbb"; + case BIFROST_FLOW_BTB_UNCONDITIONAL: + return "bb r_uncond"; + case BIFROST_FLOW_BTB_NONE: + return "bb"; + case BIFROST_FLOW_WE_UNCONDITIONAL: + return "we r_uncond"; + case BIFROST_FLOW_WE: + return "we"; + default: + return "XXX"; + } } diff --git a/src/panfrost/bifrost/bi_print_common.h b/src/panfrost/bifrost/bi_print_common.h index 675738b389c..ed8931154ac 100644 --- a/src/panfrost/bifrost/bi_print_common.h +++ b/src/panfrost/bifrost/bi_print_common.h @@ -30,7 +30,7 @@ #include #include "bifrost.h" -const char * bi_message_type_name(enum bifrost_message_type T); -const char * bi_flow_control_name(enum bifrost_flow mode); +const char *bi_message_type_name(enum bifrost_message_type T); +const char *bi_flow_control_name(enum bifrost_flow mode); #endif diff --git a/src/panfrost/bifrost/bi_quirks.h b/src/panfrost/bifrost/bi_quirks.h index 5dd75dd1db6..be05ed51a9c 100644 --- a/src/panfrost/bifrost/bi_quirks.h +++ b/src/panfrost/bifrost/bi_quirks.h @@ -44,15 +44,15 @@ static inline unsigned bifrost_get_quirks(unsigned product_id) { - switch (product_id >> 8) { - case 0x60: /* G71 */ - return BIFROST_NO_FP32_TRANSCENDENTALS | BIFROST_LIMITED_CLPER; - case 0x62: /* G72 */ - case 0x70: /* G31 */ - return BIFROST_LIMITED_CLPER; - default: - return 0; - } + switch (product_id >> 8) { + case 0x60: /* G71 */ + return BIFROST_NO_FP32_TRANSCENDENTALS | BIFROST_LIMITED_CLPER; + case 0x62: /* G72 */ + case 0x70: /* G31 */ + return BIFROST_LIMITED_CLPER; + default: + return 0; + } } #endif diff --git a/src/panfrost/bifrost/bi_ra.c b/src/panfrost/bifrost/bi_ra.c index c103fab10d4..34047cdcf3e 100644 --- a/src/panfrost/bifrost/bi_ra.c +++ b/src/panfrost/bifrost/bi_ra.c @@ -24,32 +24,32 @@ * Alyssa Rosenzweig */ +#include "util/u_memory.h" +#include "bi_builder.h" #include "compiler.h" #include "nodearray.h" -#include "bi_builder.h" -#include "util/u_memory.h" struct lcra_state { - unsigned node_count; - uint64_t *affinity; + unsigned node_count; + uint64_t *affinity; - /* Linear constraints imposed. For each node there there is a - * 'nodearray' structure, which changes between a sparse and dense - * array depending on the number of elements. - * - * Each element is itself a bit field denoting whether (c_j - c_i) bias - * is present or not, including negative biases. - * - * We support up to 8 components so the bias is in range - * [-7, 7] encoded by a 16-bit field - */ - nodearray *linear; + /* Linear constraints imposed. For each node there there is a + * 'nodearray' structure, which changes between a sparse and dense + * array depending on the number of elements. + * + * Each element is itself a bit field denoting whether (c_j - c_i) bias + * is present or not, including negative biases. + * + * We support up to 8 components so the bias is in range + * [-7, 7] encoded by a 16-bit field + */ + nodearray *linear; - /* Before solving, forced registers; after solving, solutions. */ - unsigned *solutions; + /* Before solving, forced registers; after solving, solutions. 
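    * (Entries still waiting for a register hold ~0, set by the memset in
    * lcra_alloc_equations; lcra_solve skips nodes that already carry a value.)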
*/ + unsigned *solutions; - /** Node which caused register allocation to fail */ - unsigned spill_node; + /** Node which caused register allocation to fail */ + unsigned spill_node; }; /* This module is an implementation of "Linearly Constrained @@ -61,128 +61,134 @@ struct lcra_state { static struct lcra_state * lcra_alloc_equations(unsigned node_count) { - struct lcra_state *l = calloc(1, sizeof(*l)); + struct lcra_state *l = calloc(1, sizeof(*l)); - l->node_count = node_count; + l->node_count = node_count; - l->linear = calloc(sizeof(l->linear[0]), node_count); - l->solutions = calloc(sizeof(l->solutions[0]), node_count); - l->affinity = calloc(sizeof(l->affinity[0]), node_count); + l->linear = calloc(sizeof(l->linear[0]), node_count); + l->solutions = calloc(sizeof(l->solutions[0]), node_count); + l->affinity = calloc(sizeof(l->affinity[0]), node_count); - memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count); + memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count); - return l; + return l; } static void lcra_free(struct lcra_state *l) { - for (unsigned i = 0; i < l->node_count; ++i) - nodearray_reset(&l->linear[i]); + for (unsigned i = 0; i < l->node_count; ++i) + nodearray_reset(&l->linear[i]); - free(l->linear); - free(l->affinity); - free(l->solutions); - free(l); + free(l->linear); + free(l->affinity); + free(l->solutions); + free(l); } static void -lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, unsigned j, unsigned cmask_j) +lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, + unsigned j, unsigned cmask_j) { - if (i == j) - return; + if (i == j) + return; - nodearray_value constraint_fw = 0; - nodearray_value constraint_bw = 0; + nodearray_value constraint_fw = 0; + nodearray_value constraint_bw = 0; - /* The constraint bits are reversed from lcra.c so that register - * allocation can be done in parallel for every possible solution, - * with lower-order bits representing smaller registers. */ + /* The constraint bits are reversed from lcra.c so that register + * allocation can be done in parallel for every possible solution, + * with lower-order bits representing smaller registers. 
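/* Illustrative aside, not part of the patch: a standalone sketch of the
 * [-7, 7] bias encoding described in the lcra_state comment above. Each
 * forbidden base-register difference between two interfering nodes maps to
 * one bit of a 16-bit mask, which is the information
 * lcra_add_node_interference accumulates per node pair. The function name
 * and the main() harness are invented for this example. */
#include <stdint.h>
#include <stdio.h>

/* Bit (d + 7) is set when the two allocations would overlap if node j's
 * base register were placed d registers above node i's, for d in [-7, 7]. */
static uint16_t
forbidden_biases(unsigned cmask_i, unsigned cmask_j)
{
   uint16_t constraint = 0;

   for (int d = -7; d <= 7; ++d) {
      unsigned shifted = (d >= 0) ? (cmask_j << d) : (cmask_j >> -d);

      if (cmask_i & shifted)
         constraint |= 1 << (d + 7);
   }

   return constraint;
}

int
main(void)
{
   /* Node i covers components 0-1, node j covers component 0: only the
    * differences 0 and +1 clash, every other relative placement is fine. */
   uint16_t c = forbidden_biases(0x3, 0x1);

   for (int d = -7; d <= 7; ++d)
      printf("bias %+d: %s\n", d, (c & (1 << (d + 7))) ? "conflict" : "ok");

   return 0;
}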
*/ - for (unsigned D = 0; D < 8; ++D) { - if (cmask_i & (cmask_j << D)) { - constraint_fw |= (1 << (7 + D)); - constraint_bw |= (1 << (7 - D)); - } + for (unsigned D = 0; D < 8; ++D) { + if (cmask_i & (cmask_j << D)) { + constraint_fw |= (1 << (7 + D)); + constraint_bw |= (1 << (7 - D)); + } - if (cmask_i & (cmask_j >> D)) { - constraint_bw |= (1 << (7 + D)); - constraint_fw |= (1 << (7 - D)); - } - } + if (cmask_i & (cmask_j >> D)) { + constraint_bw |= (1 << (7 + D)); + constraint_fw |= (1 << (7 - D)); + } + } - /* Use dense arrays after adding 256 elements */ - nodearray_orr(&l->linear[j], i, constraint_fw, 256, l->node_count); - nodearray_orr(&l->linear[i], j, constraint_bw, 256, l->node_count); + /* Use dense arrays after adding 256 elements */ + nodearray_orr(&l->linear[j], i, constraint_fw, 256, l->node_count); + nodearray_orr(&l->linear[i], j, constraint_bw, 256, l->node_count); } static bool lcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i) { - signed constant = solutions[i]; + signed constant = solutions[i]; - if (nodearray_is_sparse(&l->linear[i])) { - nodearray_sparse_foreach(&l->linear[i], elem) { - unsigned j = nodearray_sparse_key(elem); - nodearray_value constraint = nodearray_sparse_value(elem); + if (nodearray_is_sparse(&l->linear[i])) { + nodearray_sparse_foreach(&l->linear[i], elem) { + unsigned j = nodearray_sparse_key(elem); + nodearray_value constraint = nodearray_sparse_value(elem); - if (solutions[j] == ~0) continue; + if (solutions[j] == ~0) + continue; - signed lhs = constant - solutions[j]; + signed lhs = constant - solutions[j]; - if (lhs < -7 || lhs > 7) - continue; + if (lhs < -7 || lhs > 7) + continue; - if (constraint & (1 << (lhs + 7))) - return false; - } + if (constraint & (1 << (lhs + 7))) + return false; + } - return true; - } + return true; + } - nodearray_value *row = l->linear[i].dense; + nodearray_value *row = l->linear[i].dense; - for (unsigned j = 0; j < l->node_count; ++j) { - if (solutions[j] == ~0) continue; + for (unsigned j = 0; j < l->node_count; ++j) { + if (solutions[j] == ~0) + continue; - signed lhs = constant - solutions[j]; + signed lhs = constant - solutions[j]; - if (lhs < -7 || lhs > 7) - continue; + if (lhs < -7 || lhs > 7) + continue; - if (row[j] & (1 << (lhs + 7))) - return false; - } + if (row[j] & (1 << (lhs + 7))) + return false; + } - return true; + return true; } static bool lcra_solve(struct lcra_state *l) { - for (unsigned step = 0; step < l->node_count; ++step) { - if (l->solutions[step] != ~0) continue; - if (l->affinity[step] == 0) continue; + for (unsigned step = 0; step < l->node_count; ++step) { + if (l->solutions[step] != ~0) + continue; + if (l->affinity[step] == 0) + continue; - bool succ = false; + bool succ = false; - u_foreach_bit64(r, l->affinity[step]) { - l->solutions[step] = r; + u_foreach_bit64(r, l->affinity[step]) + { + l->solutions[step] = r; - if (lcra_test_linear(l, l->solutions, step)) { - succ = true; - break; - } - } + if (lcra_test_linear(l, l->solutions, step)) { + succ = true; + break; + } + } - /* Out of registers - prepare to spill */ - if (!succ) { - l->spill_node = step; - return false; - } - } + /* Out of registers - prepare to spill */ + if (!succ) { + l->spill_node = step; + return false; + } + } - return true; + return true; } /* Register spilling is implemented with a cost-benefit system. 
Costs are set @@ -191,18 +197,18 @@ lcra_solve(struct lcra_state *l) static unsigned lcra_count_constraints(struct lcra_state *l, unsigned i) { - unsigned count = 0; - nodearray *constraints = &l->linear[i]; + unsigned count = 0; + nodearray *constraints = &l->linear[i]; - if (nodearray_is_sparse(constraints)) { - nodearray_sparse_foreach(constraints, elem) - count += util_bitcount(nodearray_sparse_value(elem)); - } else { - nodearray_dense_foreach_64(constraints, elem) - count += util_bitcount64(*elem); - } + if (nodearray_is_sparse(constraints)) { + nodearray_sparse_foreach(constraints, elem) + count += util_bitcount(nodearray_sparse_value(elem)); + } else { + nodearray_dense_foreach_64(constraints, elem) + count += util_bitcount64(*elem); + } - return count; + return count; } /* Liveness analysis is a backwards-may dataflow analysis pass. Within a block, @@ -212,46 +218,46 @@ lcra_count_constraints(struct lcra_state *l, unsigned i) static void bi_liveness_ins_update_ra(uint8_t *live, bi_instr *ins) { - /* live_in[s] = GEN[s] + (live_out[s] - KILL[s]) */ + /* live_in[s] = GEN[s] + (live_out[s] - KILL[s]) */ - bi_foreach_dest(ins, d) { - live[ins->dest[d].value] &= ~bi_writemask(ins, d); - } + bi_foreach_dest(ins, d) { + live[ins->dest[d].value] &= ~bi_writemask(ins, d); + } - bi_foreach_ssa_src(ins, src) { - unsigned count = bi_count_read_registers(ins, src); - unsigned rmask = BITFIELD_MASK(count); + bi_foreach_ssa_src(ins, src) { + unsigned count = bi_count_read_registers(ins, src); + unsigned rmask = BITFIELD_MASK(count); - live[ins->src[src].value] |= (rmask << ins->src[src].offset); - } + live[ins->src[src].value] |= (rmask << ins->src[src].offset); + } } static bool liveness_block_update(bi_block *blk, unsigned temp_count) { - bool progress = false; + bool progress = false; - /* live_out[s] = sum { p in succ[s] } ( live_in[p] ) */ - bi_foreach_successor(blk, succ) { - for (unsigned i = 0; i < temp_count; ++i) - blk->live_out[i] |= succ->live_in[i]; - } + /* live_out[s] = sum { p in succ[s] } ( live_in[p] ) */ + bi_foreach_successor(blk, succ) { + for (unsigned i = 0; i < temp_count; ++i) + blk->live_out[i] |= succ->live_in[i]; + } - uint8_t *live = ralloc_array(blk, uint8_t, temp_count); - memcpy(live, blk->live_out, temp_count); + uint8_t *live = ralloc_array(blk, uint8_t, temp_count); + memcpy(live, blk->live_out, temp_count); - bi_foreach_instr_in_block_rev(blk, ins) - bi_liveness_ins_update_ra(live, ins); + bi_foreach_instr_in_block_rev(blk, ins) + bi_liveness_ins_update_ra(live, ins); - /* To figure out progress, diff live_in */ + /* To figure out progress, diff live_in */ - for (unsigned i = 0; (i < temp_count) && !progress; ++i) - progress |= (blk->live_in[i] != live[i]); + for (unsigned i = 0; (i < temp_count) && !progress; ++i) + progress |= (blk->live_in[i] != live[i]); - ralloc_free(blk->live_in); - blk->live_in = live; + ralloc_free(blk->live_in); + blk->live_in = live; - return progress; + return progress; } /* Globally, liveness analysis uses a fixed-point algorithm based on a @@ -263,36 +269,36 @@ liveness_block_update(bi_block *blk, unsigned temp_count) static void bi_compute_liveness_ra(bi_context *ctx) { - u_worklist worklist; - bi_worklist_init(ctx, &worklist); + u_worklist worklist; + bi_worklist_init(ctx, &worklist); - bi_foreach_block(ctx, block) { - if (block->live_in) - ralloc_free(block->live_in); + bi_foreach_block(ctx, block) { + if (block->live_in) + ralloc_free(block->live_in); - if (block->live_out) - ralloc_free(block->live_out); + if 
(block->live_out) + ralloc_free(block->live_out); - block->live_in = rzalloc_array(block, uint8_t, ctx->ssa_alloc); - block->live_out = rzalloc_array(block, uint8_t, ctx->ssa_alloc); + block->live_in = rzalloc_array(block, uint8_t, ctx->ssa_alloc); + block->live_out = rzalloc_array(block, uint8_t, ctx->ssa_alloc); - bi_worklist_push_tail(&worklist, block); - } + bi_worklist_push_tail(&worklist, block); + } - while (!u_worklist_is_empty(&worklist)) { - /* Pop off in reverse order since liveness is backwards */ - bi_block *blk = bi_worklist_pop_tail(&worklist); + while (!u_worklist_is_empty(&worklist)) { + /* Pop off in reverse order since liveness is backwards */ + bi_block *blk = bi_worklist_pop_tail(&worklist); - /* Update liveness information. If we made progress, we need to - * reprocess the predecessors - */ - if (liveness_block_update(blk, ctx->ssa_alloc)) { - bi_foreach_predecessor(blk, pred) - bi_worklist_push_head(&worklist, *pred); - } - } + /* Update liveness information. If we made progress, we need to + * reprocess the predecessors + */ + if (liveness_block_update(blk, ctx->ssa_alloc)) { + bi_foreach_predecessor(blk, pred) + bi_worklist_push_head(&worklist, *pred); + } + } - u_worklist_fini(&worklist); + u_worklist_fini(&worklist); } /* Construct an affinity mask such that the vector with `count` elements does @@ -313,246 +319,246 @@ bi_compute_liveness_ra(bi_context *ctx) static uint64_t bi_make_affinity(uint64_t clobber, unsigned count, bool split_file) { - uint64_t clobbered = 0; + uint64_t clobbered = 0; - for (unsigned i = 0; i < count; ++i) - clobbered |= (clobber >> i); + for (unsigned i = 0; i < count; ++i) + clobbered |= (clobber >> i); - /* Don't allocate past the end of the register file */ - if (count > 1) { - unsigned excess = count - 1; - uint64_t mask = BITFIELD_MASK(excess); - clobbered |= mask << (64 - excess); + /* Don't allocate past the end of the register file */ + if (count > 1) { + unsigned excess = count - 1; + uint64_t mask = BITFIELD_MASK(excess); + clobbered |= mask << (64 - excess); - if (split_file) - clobbered |= mask << (16 - excess); - } + if (split_file) + clobbered |= mask << (16 - excess); + } - /* Don't allocate the middle if we split out the middle */ - if (split_file) - clobbered |= BITFIELD64_MASK(32) << 16; + /* Don't allocate the middle if we split out the middle */ + if (split_file) + clobbered |= BITFIELD64_MASK(32) << 16; - /* We can use a register iff it's not clobberred */ - return ~clobbered; + /* We can use a register iff it's not clobberred */ + return ~clobbered; } static void -bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live, uint64_t preload_live, unsigned node_count, bool is_blend, bool split_file, bool aligned_sr) +bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live, + uint64_t preload_live, unsigned node_count, bool is_blend, + bool split_file, bool aligned_sr) { - bi_foreach_instr_in_block_rev(block, ins) { - /* Mark all registers live after the instruction as - * interfering with the destination */ + bi_foreach_instr_in_block_rev(block, ins) { + /* Mark all registers live after the instruction as + * interfering with the destination */ - bi_foreach_dest(ins, d) { - unsigned node = ins->dest[d].value; + bi_foreach_dest(ins, d) { + unsigned node = ins->dest[d].value; - /* Don't allocate to anything that's read later as a - * preloaded register. The affinity is the intersection - * of affinity masks for each write. 
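/* Illustrative aside, not part of the patch: a reduced model of the
 * "don't allocate past the end of the register file" clause of
 * bi_make_affinity above. vector_affinity() is an invented name; it keeps
 * only the file-end handling and drops the clobber and split_file logic. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t
vector_affinity(unsigned count)
{
   uint64_t allowed = ~0ull;

   if (count > 1) {
      unsigned excess = count - 1;
      uint64_t mask = (1ull << excess) - 1;

      /* Forbid the top `excess` base registers so the whole vector stays
       * inside r0-r63. */
      allowed &= ~(mask << (64 - excess));
   }

   return allowed;
}

int
main(void)
{
   for (unsigned count = 1; count <= 4; ++count) {
      uint64_t allowed = vector_affinity(count);

      /* Every permitted base register leaves room for the whole vector. */
      for (unsigned base = 0; base < 64; ++base) {
         if (allowed & (1ull << base))
            assert(base + count <= 64);
      }

      printf("count=%u: highest allowed base is r%u\n", count, 64 - count);
   }

   return 0;
}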
Since writes have - * offsets, but the affinity is for the whole node, we - * need to offset the affinity opposite the write - * offset, so we shift right. */ - unsigned count = bi_count_write_registers(ins, d); - unsigned offset = ins->dest[d].offset; - uint64_t affinity = bi_make_affinity(preload_live, count, split_file) >> offset; - /* Valhall needs >= 64-bit staging writes to be pair-aligned */ - if (aligned_sr && (count >= 2 || offset)) - affinity &= EVEN_BITS_MASK; + /* Don't allocate to anything that's read later as a + * preloaded register. The affinity is the intersection + * of affinity masks for each write. Since writes have + * offsets, but the affinity is for the whole node, we + * need to offset the affinity opposite the write + * offset, so we shift right. */ + unsigned count = bi_count_write_registers(ins, d); + unsigned offset = ins->dest[d].offset; + uint64_t affinity = + bi_make_affinity(preload_live, count, split_file) >> offset; + /* Valhall needs >= 64-bit staging writes to be pair-aligned */ + if (aligned_sr && (count >= 2 || offset)) + affinity &= EVEN_BITS_MASK; - l->affinity[node] &= affinity; + l->affinity[node] &= affinity; - for (unsigned i = 0; i < node_count; ++i) { - uint8_t r = live[i]; + for (unsigned i = 0; i < node_count; ++i) { + uint8_t r = live[i]; - /* Nodes only interfere if they occupy - * /different values/ at the same time - * (Boissinot). In particular, sources of - * moves do not interfere with their - * destinations. This enables a limited form of - * coalescing. - */ - if (ins->op == BI_OPCODE_MOV_I32 && - bi_is_ssa(ins->src[0]) && - i == ins->src[0].value) { + /* Nodes only interfere if they occupy + * /different values/ at the same time + * (Boissinot). In particular, sources of + * moves do not interfere with their + * destinations. This enables a limited form of + * coalescing. + */ + if (ins->op == BI_OPCODE_MOV_I32 && bi_is_ssa(ins->src[0]) && + i == ins->src[0].value) { - r &= ~BITFIELD_BIT(ins->src[0].offset); - } + r &= ~BITFIELD_BIT(ins->src[0].offset); + } - if (r) { - lcra_add_node_interference(l, node, - bi_writemask(ins, d), i, r); - } - } + if (r) { + lcra_add_node_interference(l, node, bi_writemask(ins, d), i, r); + } + } - unsigned node_first = ins->dest[0].value; - if (d == 1) { - lcra_add_node_interference(l, node, bi_writemask(ins, 1), - node_first, bi_writemask(ins, 0)); - } - } + unsigned node_first = ins->dest[0].value; + if (d == 1) { + lcra_add_node_interference(l, node, bi_writemask(ins, 1), + node_first, bi_writemask(ins, 0)); + } + } - /* Valhall needs >= 64-bit reads to be pair-aligned */ - if (aligned_sr) { - bi_foreach_ssa_src(ins, s) { - if (bi_count_read_registers(ins, s) >= 2) - l->affinity[ins->src[s].value] &= EVEN_BITS_MASK; - } - } + /* Valhall needs >= 64-bit reads to be pair-aligned */ + if (aligned_sr) { + bi_foreach_ssa_src(ins, s) { + if (bi_count_read_registers(ins, s) >= 2) + l->affinity[ins->src[s].value] &= EVEN_BITS_MASK; + } + } - if (!is_blend && ins->op == BI_OPCODE_BLEND) { - /* Blend shaders might clobber r0-r15, r48. */ - uint64_t clobber = BITFIELD64_MASK(16) | BITFIELD64_BIT(48); + if (!is_blend && ins->op == BI_OPCODE_BLEND) { + /* Blend shaders might clobber r0-r15, r48. 
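/* Illustrative aside, not part of the patch: a toy version of the
 * value-based interference rule referenced above (Boissinot). The
 * destination of a copy does not interfere with the copy's source, because
 * both hold the same value where the copy executes, which is what enables
 * the limited coalescing the comment mentions. The struct and function
 * names are invented for the sketch. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_copy {
   unsigned dest;
   unsigned src;
};

static bool
dest_interferes_with(const struct toy_copy *copy, unsigned node,
                     uint64_t live_after_copy)
{
   uint64_t live = live_after_copy;

   /* Drop the copy's own source: it carries the same value as the
    * destination, so overlapping with it is harmless. */
   live &= ~(1ull << copy->src);

   return (live >> node) & 1;
}

int
main(void)
{
   struct toy_copy copy = {.dest = 5, .src = 2};
   uint64_t live_after = (1ull << 2) | (1ull << 7);

   printf("vs. source r2:    %s\n",
          dest_interferes_with(&copy, 2, live_after) ? "interferes"
                                                     : "coalescable");
   printf("vs. unrelated r7: %s\n",
          dest_interferes_with(&copy, 7, live_after) ? "interferes"
                                                     : "coalescable");
   return 0;
}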
*/ + uint64_t clobber = BITFIELD64_MASK(16) | BITFIELD64_BIT(48); - for (unsigned i = 0; i < node_count; ++i) { - if (live[i]) - l->affinity[i] &= ~clobber; - } - } + for (unsigned i = 0; i < node_count; ++i) { + if (live[i]) + l->affinity[i] &= ~clobber; + } + } - /* Update live_in */ - preload_live = bi_postra_liveness_ins(preload_live, ins); - bi_liveness_ins_update_ra(live, ins); - } + /* Update live_in */ + preload_live = bi_postra_liveness_ins(preload_live, ins); + bi_liveness_ins_update_ra(live, ins); + } - block->reg_live_in = preload_live; + block->reg_live_in = preload_live; } static void bi_compute_interference(bi_context *ctx, struct lcra_state *l, bool full_regs) { - bi_compute_liveness_ra(ctx); - bi_postra_liveness(ctx); + bi_compute_liveness_ra(ctx); + bi_postra_liveness(ctx); - bi_foreach_block_rev(ctx, blk) { - uint8_t *live = mem_dup(blk->live_out, ctx->ssa_alloc); + bi_foreach_block_rev(ctx, blk) { + uint8_t *live = mem_dup(blk->live_out, ctx->ssa_alloc); - bi_mark_interference(blk, l, live, blk->reg_live_out, - ctx->ssa_alloc, ctx->inputs->is_blend, - !full_regs, ctx->arch >= 9); + bi_mark_interference(blk, l, live, blk->reg_live_out, ctx->ssa_alloc, + ctx->inputs->is_blend, !full_regs, ctx->arch >= 9); - free(live); - } + free(live); + } } static struct lcra_state * bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs) { - struct lcra_state *l = lcra_alloc_equations(ctx->ssa_alloc); + struct lcra_state *l = lcra_alloc_equations(ctx->ssa_alloc); - /* Blend shaders are restricted to R0-R15. Other shaders at full - * occupancy also can access R48-R63. At half occupancy they can access - * the whole file. */ + /* Blend shaders are restricted to R0-R15. Other shaders at full + * occupancy also can access R48-R63. At half occupancy they can access + * the whole file. */ - uint64_t default_affinity = - ctx->inputs->is_blend ? BITFIELD64_MASK(16) : - full_regs ? BITFIELD64_MASK(64) : - (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48)); + uint64_t default_affinity = + ctx->inputs->is_blend ? BITFIELD64_MASK(16) + : full_regs ? 
BITFIELD64_MASK(64) + : (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48)); - /* To test spilling, mimic a small register file */ - if (bifrost_debug & BIFROST_DBG_SPILL && !ctx->inputs->is_blend) - default_affinity &= BITFIELD64_MASK(48) << 8; + /* To test spilling, mimic a small register file */ + if (bifrost_debug & BIFROST_DBG_SPILL && !ctx->inputs->is_blend) + default_affinity &= BITFIELD64_MASK(48) << 8; - bi_foreach_instr_global(ctx, ins) { - bi_foreach_dest(ins, d) - l->affinity[ins->dest[d].value] = default_affinity; + bi_foreach_instr_global(ctx, ins) { + bi_foreach_dest(ins, d) + l->affinity[ins->dest[d].value] = default_affinity; - /* Blend shaders expect the src colour to be in r0-r3 */ - if (ins->op == BI_OPCODE_BLEND && - !ctx->inputs->is_blend) { - assert(bi_is_ssa(ins->src[0])); - l->solutions[ins->src[0].value] = 0; + /* Blend shaders expect the src colour to be in r0-r3 */ + if (ins->op == BI_OPCODE_BLEND && !ctx->inputs->is_blend) { + assert(bi_is_ssa(ins->src[0])); + l->solutions[ins->src[0].value] = 0; - /* Dual source blend input in r4-r7 */ - if (bi_is_ssa(ins->src[4])) - l->solutions[ins->src[4].value] = 4; + /* Dual source blend input in r4-r7 */ + if (bi_is_ssa(ins->src[4])) + l->solutions[ins->src[4].value] = 4; - /* Writes to R48 */ - if (!bi_is_null(ins->dest[0])) - l->solutions[ins->dest[0].value] = 48; - } + /* Writes to R48 */ + if (!bi_is_null(ins->dest[0])) + l->solutions[ins->dest[0].value] = 48; + } - /* Coverage mask writes stay in R60 */ - if ((ins->op == BI_OPCODE_ATEST || - ins->op == BI_OPCODE_ZS_EMIT) && - !bi_is_null(ins->dest[0])) { - l->solutions[ins->dest[0].value] = 60; - } + /* Coverage mask writes stay in R60 */ + if ((ins->op == BI_OPCODE_ATEST || ins->op == BI_OPCODE_ZS_EMIT) && + !bi_is_null(ins->dest[0])) { + l->solutions[ins->dest[0].value] = 60; + } - /* Experimentally, it seems coverage masks inputs to ATEST must - * be in R60. Otherwise coverage mask writes do not work with - * early-ZS with pixel-frequency-shading (this combination of - * settings is legal if depth/stencil writes are disabled). - */ - if (ins->op == BI_OPCODE_ATEST) { - assert(bi_is_ssa(ins->src[0])); - l->solutions[ins->src[0].value] = 60; - } - } + /* Experimentally, it seems coverage masks inputs to ATEST must + * be in R60. Otherwise coverage mask writes do not work with + * early-ZS with pixel-frequency-shading (this combination of + * settings is legal if depth/stencil writes are disabled). + */ + if (ins->op == BI_OPCODE_ATEST) { + assert(bi_is_ssa(ins->src[0])); + l->solutions[ins->src[0].value] = 60; + } + } - bi_compute_interference(ctx, l, full_regs); + bi_compute_interference(ctx, l, full_regs); - /* Coalesce register moves if we're allowed. We need to be careful due - * to the restricted affinity induced by the blend shader ABI. - */ - bi_foreach_instr_global(ctx, I) { - if (I->op != BI_OPCODE_MOV_I32) continue; - if (I->src[0].type != BI_INDEX_REGISTER) continue; + /* Coalesce register moves if we're allowed. We need to be careful due + * to the restricted affinity induced by the blend shader ABI. 
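/* Illustrative aside, not part of the patch: the three register windows
 * described by the occupancy comment above, printed as 64-bit affinity
 * masks. MASK16 and the window names are local to this example. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define MASK16 0xffffull

int
main(void)
{
   uint64_t blend = MASK16;                           /* r0-r15 only */
   uint64_t full_occupancy = MASK16 | MASK16 << 48;   /* r0-r15 and r48-r63 */
   uint64_t half_occupancy = ~0ull;                   /* the whole file */

   printf("blend:          %016" PRIx64 "\n", blend);
   printf("full occupancy: %016" PRIx64 "\n", full_occupancy);
   printf("half occupancy: %016" PRIx64 "\n", half_occupancy);
   return 0;
}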
+ */ + bi_foreach_instr_global(ctx, I) { + if (I->op != BI_OPCODE_MOV_I32) + continue; + if (I->src[0].type != BI_INDEX_REGISTER) + continue; - unsigned reg = I->src[0].value; - unsigned node = I->dest[0].value; + unsigned reg = I->src[0].value; + unsigned node = I->dest[0].value; - if (l->solutions[node] != ~0) continue; + if (l->solutions[node] != ~0) + continue; - uint64_t affinity = l->affinity[node]; + uint64_t affinity = l->affinity[node]; - if (ctx->inputs->is_blend) { - /* We're allowed to coalesce the moves to these */ - affinity |= BITFIELD64_BIT(48); - affinity |= BITFIELD64_BIT(60); - } + if (ctx->inputs->is_blend) { + /* We're allowed to coalesce the moves to these */ + affinity |= BITFIELD64_BIT(48); + affinity |= BITFIELD64_BIT(60); + } - /* Try to coalesce */ - if (affinity & BITFIELD64_BIT(reg)) { - l->solutions[node] = reg; + /* Try to coalesce */ + if (affinity & BITFIELD64_BIT(reg)) { + l->solutions[node] = reg; - if (!lcra_test_linear(l, l->solutions, node)) - l->solutions[node] = ~0; - } - } + if (!lcra_test_linear(l, l->solutions, node)) + l->solutions[node] = ~0; + } + } - *success = lcra_solve(l); + *success = lcra_solve(l); - return l; + return l; } static bi_index bi_reg_from_index(bi_context *ctx, struct lcra_state *l, bi_index index) { - /* Offsets can only be applied when we register allocated an index, or - * alternatively for FAU's encoding */ + /* Offsets can only be applied when we register allocated an index, or + * alternatively for FAU's encoding */ - ASSERTED bool is_offset = (index.offset > 0) && - (index.type != BI_INDEX_FAU); + ASSERTED bool is_offset = (index.offset > 0) && (index.type != BI_INDEX_FAU); - /* Did we run RA for this index at all */ - if (!bi_is_ssa(index)) { - assert(!is_offset); - return index; - } + /* Did we run RA for this index at all */ + if (!bi_is_ssa(index)) { + assert(!is_offset); + return index; + } - /* LCRA didn't bother solving this index (how lazy!) */ - signed solution = l->solutions[index.value]; - if (solution < 0) { - assert(!is_offset); - return index; - } + /* LCRA didn't bother solving this index (how lazy!) */ + signed solution = l->solutions[index.value]; + if (solution < 0) { + assert(!is_offset); + return index; + } - /* todo: do we want to compose with the subword swizzle? */ - bi_index new_index = bi_register(solution + index.offset); - new_index.swizzle = index.swizzle; - new_index.abs = index.abs; - new_index.neg = index.neg; - return new_index; + /* todo: do we want to compose with the subword swizzle? 
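/* Illustrative aside, not part of the patch: the shape of the
 * try/validate/revert step used by the coalescing loop above, with an
 * opaque predicate standing in for lcra_test_linear. All names here are
 * invented for the sketch. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define UNSOLVED (~0u)

static bool
try_coalesce(unsigned *solutions, unsigned node, unsigned reg,
             uint64_t affinity, bool (*valid)(const unsigned *, unsigned))
{
   /* Only coalesce into a register the node may use, and never override a
    * solution that is already pinned. */
   if (solutions[node] != UNSOLVED || !(affinity & (1ull << reg)))
      return false;

   solutions[node] = reg;

   /* Roll back if the tentative choice breaks a linear constraint. */
   if (!valid(solutions, node)) {
      solutions[node] = UNSOLVED;
      return false;
   }

   return true;
}

static bool
always_valid(const unsigned *solutions, unsigned node)
{
   (void)solutions;
   (void)node;
   return true;
}

int
main(void)
{
   unsigned solutions[2] = {UNSOLVED, UNSOLVED};

   /* Node 0 may only live in r4-r7, so coalescing into r2 must fail. */
   printf("into r2: %d\n", try_coalesce(solutions, 0, 2, 0xf0, always_valid));
   printf("into r4: %d\n", try_coalesce(solutions, 0, 4, 0xf0, always_valid));
   printf("solution: r%u\n", solutions[0]);
   return 0;
}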
*/ + bi_index new_index = bi_register(solution + index.offset); + new_index.swizzle = index.swizzle; + new_index.abs = index.abs; + new_index.neg = index.neg; + return new_index; } /* Dual texture instructions write to two sets of staging registers, modeled as @@ -564,40 +570,40 @@ bi_reg_from_index(bi_context *ctx, struct lcra_state *l, bi_index index) static void bi_fixup_dual_tex_register(bi_instr *I) { - assert(I->dest[1].type == BI_INDEX_REGISTER); - assert(I->src[3].type == BI_INDEX_CONSTANT); + assert(I->dest[1].type == BI_INDEX_REGISTER); + assert(I->src[3].type == BI_INDEX_CONSTANT); - struct bifrost_dual_texture_operation desc = { - .secondary_register = I->dest[1].value, - }; + struct bifrost_dual_texture_operation desc = { + .secondary_register = I->dest[1].value, + }; - I->src[3].value |= bi_dual_tex_as_u32(desc); + I->src[3].value |= bi_dual_tex_as_u32(desc); } static void bi_install_registers(bi_context *ctx, struct lcra_state *l) { - bi_foreach_instr_global(ctx, ins) { - bi_foreach_dest(ins, d) - ins->dest[d] = bi_reg_from_index(ctx, l, ins->dest[d]); + bi_foreach_instr_global(ctx, ins) { + bi_foreach_dest(ins, d) + ins->dest[d] = bi_reg_from_index(ctx, l, ins->dest[d]); - bi_foreach_src(ins, s) - ins->src[s] = bi_reg_from_index(ctx, l, ins->src[s]); + bi_foreach_src(ins, s) + ins->src[s] = bi_reg_from_index(ctx, l, ins->src[s]); - if (ins->op == BI_OPCODE_TEXC_DUAL) - bi_fixup_dual_tex_register(ins); - } + if (ins->op == BI_OPCODE_TEXC_DUAL) + bi_fixup_dual_tex_register(ins); + } } static void bi_rewrite_index_src_single(bi_instr *ins, bi_index old, bi_index new) { - bi_foreach_src(ins, i) { - if (bi_is_equiv(ins->src[i], old)) { - ins->src[i].type = new.type; - ins->src[i].value = new.value; - } - } + bi_foreach_src(ins, i) { + if (bi_is_equiv(ins->src[i], old)) { + ins->src[i].type = new.type; + ins->src[i].value = new.value; + } + } } /* If register allocation fails, find the best spill node */ @@ -605,83 +611,87 @@ bi_rewrite_index_src_single(bi_instr *ins, bi_index old, bi_index new) static signed bi_choose_spill_node(bi_context *ctx, struct lcra_state *l) { - /* Pick a node satisfying bi_spill_register's preconditions */ - BITSET_WORD *no_spill = calloc(sizeof(BITSET_WORD), BITSET_WORDS(l->node_count)); + /* Pick a node satisfying bi_spill_register's preconditions */ + BITSET_WORD *no_spill = + calloc(sizeof(BITSET_WORD), BITSET_WORDS(l->node_count)); - bi_foreach_instr_global(ctx, ins) { - bi_foreach_dest(ins, d) { - /* Don't allow spilling coverage mask writes because the - * register preload logic assumes it will stay in R60. - * This could be optimized. - */ - if (ins->no_spill || - ins->op == BI_OPCODE_ATEST || - ins->op == BI_OPCODE_ZS_EMIT || - (ins->op == BI_OPCODE_MOV_I32 && - ins->src[0].type == BI_INDEX_REGISTER && - ins->src[0].value == 60)) { - BITSET_SET(no_spill, ins->dest[d].value); - } - } - } + bi_foreach_instr_global(ctx, ins) { + bi_foreach_dest(ins, d) { + /* Don't allow spilling coverage mask writes because the + * register preload logic assumes it will stay in R60. + * This could be optimized. 
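/* Illustrative aside, not part of the patch: the index-to-register rewrite
 * performed by bi_reg_from_index above, reduced to the essentials. Once
 * the solver picks a base register for an SSA value, a use at a word
 * offset becomes base + offset; the real function also carries over
 * modifiers (swizzle, abs, neg), which this sketch omits. toy_index is an
 * invented stand-in for bi_index. */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_index {
   unsigned value;  /* SSA value number, or register number after rewrite */
   unsigned offset; /* word offset into the (vector) value */
   bool is_reg;
};

static struct toy_index
apply_solution(struct toy_index idx, const unsigned *solutions)
{
   assert(!idx.is_reg);

   struct toy_index out = idx;
   out.is_reg = true;
   out.value = solutions[idx.value] + idx.offset;
   out.offset = 0;
   return out;
}

int
main(void)
{
   const unsigned solutions[] = {4, 60};
   struct toy_index read = {.value = 0, .offset = 2, .is_reg = false};
   struct toy_index reg = apply_solution(read, solutions);

   printf("ssa%u[%u] -> r%u\n", read.value, read.offset, reg.value);
   return 0;
}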
+ */ + if (ins->no_spill || ins->op == BI_OPCODE_ATEST || + ins->op == BI_OPCODE_ZS_EMIT || + (ins->op == BI_OPCODE_MOV_I32 && + ins->src[0].type == BI_INDEX_REGISTER && + ins->src[0].value == 60)) { + BITSET_SET(no_spill, ins->dest[d].value); + } + } + } - unsigned best_benefit = 0.0; - signed best_node = -1; + unsigned best_benefit = 0.0; + signed best_node = -1; - if (nodearray_is_sparse(&l->linear[l->spill_node])) { - nodearray_sparse_foreach(&l->linear[l->spill_node], elem) { - unsigned i = nodearray_sparse_key(elem); - unsigned constraint = nodearray_sparse_value(elem); + if (nodearray_is_sparse(&l->linear[l->spill_node])) { + nodearray_sparse_foreach(&l->linear[l->spill_node], elem) { + unsigned i = nodearray_sparse_key(elem); + unsigned constraint = nodearray_sparse_value(elem); - /* Only spill nodes that interfere with the node failing - * register allocation. It's pointless to spill anything else */ - if (!constraint) continue; + /* Only spill nodes that interfere with the node failing + * register allocation. It's pointless to spill anything else */ + if (!constraint) + continue; - if (BITSET_TEST(no_spill, i)) continue; + if (BITSET_TEST(no_spill, i)) + continue; - unsigned benefit = lcra_count_constraints(l, i); + unsigned benefit = lcra_count_constraints(l, i); - if (benefit > best_benefit) { - best_benefit = benefit; - best_node = i; - } - } - } else { - nodearray_value *row = l->linear[l->spill_node].dense; + if (benefit > best_benefit) { + best_benefit = benefit; + best_node = i; + } + } + } else { + nodearray_value *row = l->linear[l->spill_node].dense; - for (unsigned i = 0; i < l->node_count; ++i) { - /* Only spill nodes that interfere with the node failing - * register allocation. It's pointless to spill anything else */ - if (!row[i]) continue; + for (unsigned i = 0; i < l->node_count; ++i) { + /* Only spill nodes that interfere with the node failing + * register allocation. 
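/* Illustrative aside, not part of the patch: the cost/benefit choice made
 * by bi_choose_spill_node above, on plain arrays instead of the sparse or
 * dense nodearray rows. Only nodes that interfere with the failing node
 * are candidates, and the one with the most constraints wins. The names
 * and test data are invented for this sketch. */
#include <stdint.h>
#include <stdio.h>

static int
choose_spill(const uint16_t *row, const unsigned *constraint_count,
             const uint8_t *no_spill, unsigned node_count)
{
   int best_node = -1;
   unsigned best_benefit = 0;

   for (unsigned i = 0; i < node_count; ++i) {
      /* Spilling a node that does not interfere cannot help, and some
       * nodes (e.g. coverage mask writes) must never be spilled. */
      if (!row[i] || no_spill[i])
         continue;

      if (constraint_count[i] > best_benefit) {
         best_benefit = constraint_count[i];
         best_node = i;
      }
   }

   return best_node;
}

int
main(void)
{
   const uint16_t row[4] = {0, 0x0010, 0x0004, 0}; /* interference row */
   const unsigned count[4] = {9, 3, 7, 2};         /* constraints per node */
   const uint8_t no_spill[4] = {0, 0, 0, 0};

   /* Nodes 1 and 2 interfere; node 2 has more constraints, so it wins. */
   printf("spill node %d\n", choose_spill(row, count, no_spill, 4));
   return 0;
}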
It's pointless to spill anything else */ + if (!row[i]) + continue; - if (BITSET_TEST(no_spill, i)) continue; + if (BITSET_TEST(no_spill, i)) + continue; - unsigned benefit = lcra_count_constraints(l, i); + unsigned benefit = lcra_count_constraints(l, i); - if (benefit > best_benefit) { - best_benefit = benefit; - best_node = i; - } - } - } + if (benefit > best_benefit) { + best_benefit = benefit; + best_node = i; + } + } + } - free(no_spill); - return best_node; + free(no_spill); + return best_node; } static unsigned bi_count_read_index(bi_instr *I, bi_index index) { - unsigned max = 0; + unsigned max = 0; - bi_foreach_src(I, s) { - if (bi_is_equiv(I->src[s], index)) { - unsigned count = bi_count_read_registers(I, s); - max = MAX2(max, count + I->src[s].offset); - } - } + bi_foreach_src(I, s) { + if (bi_is_equiv(I->src[s], index)) { + unsigned count = bi_count_read_registers(I, s); + max = MAX2(max, count + I->src[s].offset); + } + } - return max; + return max; } /* @@ -692,29 +702,30 @@ bi_count_read_index(bi_instr *I, bi_index index) static bi_index bi_tls_ptr(bool hi) { - return bi_fau(BIR_FAU_TLS_PTR, hi); + return bi_fau(BIR_FAU_TLS_PTR, hi); } static bi_instr * bi_load_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset) { - if (b->shader->arch >= 9) { - return bi_load_to(b, bits, src, bi_tls_ptr(false), - bi_tls_ptr(true), BI_SEG_TL, offset); - } else { - return bi_load_to(b, bits, src, bi_imm_u32(offset), bi_zero(), - BI_SEG_TL, 0); - } + if (b->shader->arch >= 9) { + return bi_load_to(b, bits, src, bi_tls_ptr(false), bi_tls_ptr(true), + BI_SEG_TL, offset); + } else { + return bi_load_to(b, bits, src, bi_imm_u32(offset), bi_zero(), BI_SEG_TL, + 0); + } } static void bi_store_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset) { - if (b->shader->arch >= 9) { - bi_store(b, bits, src, bi_tls_ptr(false), bi_tls_ptr(true), BI_SEG_TL, offset); - } else { - bi_store(b, bits, src, bi_imm_u32(offset), bi_zero(), BI_SEG_TL, 0); - } + if (b->shader->arch >= 9) { + bi_store(b, bits, src, bi_tls_ptr(false), bi_tls_ptr(true), BI_SEG_TL, + offset); + } else { + bi_store(b, bits, src, bi_imm_u32(offset), bi_zero(), BI_SEG_TL, 0); + } } /* Once we've chosen a spill node, spill it and returns bytes spilled */ @@ -722,44 +733,45 @@ bi_store_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset) static unsigned bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset) { - bi_builder b = { .shader = ctx }; - unsigned channels = 0; + bi_builder b = {.shader = ctx}; + unsigned channels = 0; - /* Spill after every store, fill before every load */ - bi_foreach_instr_global_safe(ctx, I) { - bi_foreach_dest(I, d) { - if (!bi_is_equiv(I->dest[d], index)) continue; + /* Spill after every store, fill before every load */ + bi_foreach_instr_global_safe(ctx, I) { + bi_foreach_dest(I, d) { + if (!bi_is_equiv(I->dest[d], index)) + continue; - unsigned extra = I->dest[d].offset; - bi_index tmp = bi_temp(ctx); + unsigned extra = I->dest[d].offset; + bi_index tmp = bi_temp(ctx); - I->dest[d] = bi_replace_index(I->dest[d], tmp); - I->no_spill = true; + I->dest[d] = bi_replace_index(I->dest[d], tmp); + I->no_spill = true; - unsigned count = bi_count_write_registers(I, d); - unsigned bits = count * 32; + unsigned count = bi_count_write_registers(I, d); + unsigned bits = count * 32; - b.cursor = bi_after_instr(I); - bi_store_tl(&b, bits, tmp, offset + 4 * extra); + b.cursor = bi_after_instr(I); + bi_store_tl(&b, bits, tmp, offset + 4 * extra); - ctx->spills++; - channels = 
MAX2(channels, extra + count); - } + ctx->spills++; + channels = MAX2(channels, extra + count); + } - if (bi_has_arg(I, index)) { - b.cursor = bi_before_instr(I); - bi_index tmp = bi_temp(ctx); + if (bi_has_arg(I, index)) { + b.cursor = bi_before_instr(I); + bi_index tmp = bi_temp(ctx); - unsigned bits = bi_count_read_index(I, index) * 32; - bi_rewrite_index_src_single(I, index, tmp); + unsigned bits = bi_count_read_index(I, index) * 32; + bi_rewrite_index_src_single(I, index, tmp); - bi_instr *ld = bi_load_tl(&b, bits, tmp, offset); - ld->no_spill = true; - ctx->fills++; - } - } + bi_instr *ld = bi_load_tl(&b, bits, tmp, offset); + ld->no_spill = true; + ctx->fills++; + } + } - return (channels * 4); + return (channels * 4); } /* @@ -770,78 +782,79 @@ bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset) static void bi_lower_vector(bi_context *ctx, unsigned first_reg) { - bi_index *remap = calloc(ctx->ssa_alloc, sizeof(bi_index)); + bi_index *remap = calloc(ctx->ssa_alloc, sizeof(bi_index)); - bi_foreach_instr_global_safe(ctx, I) { - bi_builder b = bi_init_builder(ctx, bi_after_instr(I)); + bi_foreach_instr_global_safe(ctx, I) { + bi_builder b = bi_init_builder(ctx, bi_after_instr(I)); - if (I->op == BI_OPCODE_SPLIT_I32) { - bi_index src = I->src[0]; - assert(src.offset == 0); + if (I->op == BI_OPCODE_SPLIT_I32) { + bi_index src = I->src[0]; + assert(src.offset == 0); - bi_foreach_dest(I, i) { - src.offset = i; - bi_mov_i32_to(&b, I->dest[i], src); + bi_foreach_dest(I, i) { + src.offset = i; + bi_mov_i32_to(&b, I->dest[i], src); - if (I->dest[i].value < first_reg) - remap[I->dest[i].value] = src; - } + if (I->dest[i].value < first_reg) + remap[I->dest[i].value] = src; + } - bi_remove_instruction(I); - } else if (I->op == BI_OPCODE_COLLECT_I32) { - bi_index dest = I->dest[0]; - assert(dest.offset == 0); - assert(((dest.value < first_reg) || I->nr_srcs == 1) && "nir_lower_phis_to_scalar"); + bi_remove_instruction(I); + } else if (I->op == BI_OPCODE_COLLECT_I32) { + bi_index dest = I->dest[0]; + assert(dest.offset == 0); + assert(((dest.value < first_reg) || I->nr_srcs == 1) && + "nir_lower_phis_to_scalar"); - bi_foreach_src(I, i) { - if (bi_is_null(I->src[i])) - continue; + bi_foreach_src(I, i) { + if (bi_is_null(I->src[i])) + continue; - dest.offset = i; - bi_mov_i32_to(&b, dest, I->src[i]); - } + dest.offset = i; + bi_mov_i32_to(&b, dest, I->src[i]); + } - bi_remove_instruction(I); - } - } + bi_remove_instruction(I); + } + } - bi_foreach_instr_global(ctx, I) { - bi_foreach_ssa_src(I, s) { - if (I->src[s].value < first_reg && !bi_is_null(remap[I->src[s].value])) - bi_replace_src(I, s, remap[I->src[s].value]); - } - } + bi_foreach_instr_global(ctx, I) { + bi_foreach_ssa_src(I, s) { + if (I->src[s].value < first_reg && !bi_is_null(remap[I->src[s].value])) + bi_replace_src(I, s, remap[I->src[s].value]); + } + } - free(remap); + free(remap); - /* After generating a pile of moves, clean up */ - bi_compute_liveness_ra(ctx); + /* After generating a pile of moves, clean up */ + bi_compute_liveness_ra(ctx); - bi_foreach_block_rev(ctx, block) { - uint8_t *live = rzalloc_array(block, uint8_t, ctx->ssa_alloc); + bi_foreach_block_rev(ctx, block) { + uint8_t *live = rzalloc_array(block, uint8_t, ctx->ssa_alloc); - bi_foreach_successor(block, succ) { - for (unsigned i = 0; i < ctx->ssa_alloc; ++i) - live[i] |= succ->live_in[i]; - } + bi_foreach_successor(block, succ) { + for (unsigned i = 0; i < ctx->ssa_alloc; ++i) + live[i] |= succ->live_in[i]; + } - 
bi_foreach_instr_in_block_safe_rev(block, ins) { - bool all_null = true; + bi_foreach_instr_in_block_safe_rev(block, ins) { + bool all_null = true; - bi_foreach_dest(ins, d) { - if (live[ins->dest[d].value] & bi_writemask(ins, d)) - all_null = false; - } + bi_foreach_dest(ins, d) { + if (live[ins->dest[d].value] & bi_writemask(ins, d)) + all_null = false; + } - if (all_null && !bi_side_effects(ins)) - bi_remove_instruction(ins); - else - bi_liveness_ins_update_ra(live, ins); - } + if (all_null && !bi_side_effects(ins)) + bi_remove_instruction(ins); + else + bi_liveness_ins_update_ra(live, ins); + } - ralloc_free(block->live_in); - block->live_in = live; - } + ralloc_free(block->live_in); + block->live_in = live; + } } /* @@ -855,12 +868,10 @@ bi_lower_vector(bi_context *ctx, unsigned first_reg) static bool bi_is_tied(const bi_instr *I) { - return (I->op == BI_OPCODE_TEXC || - I->op == BI_OPCODE_TEXC_DUAL || - I->op == BI_OPCODE_ATOM_RETURN_I32 || - I->op == BI_OPCODE_AXCHG_I32 || - I->op == BI_OPCODE_ACMPXCHG_I32) && - !bi_is_null(I->src[0]); + return (I->op == BI_OPCODE_TEXC || I->op == BI_OPCODE_TEXC_DUAL || + I->op == BI_OPCODE_ATOM_RETURN_I32 || I->op == BI_OPCODE_AXCHG_I32 || + I->op == BI_OPCODE_ACMPXCHG_I32) && + !bi_is_null(I->src[0]); } /* @@ -872,33 +883,34 @@ bi_is_tied(const bi_instr *I) static void bi_coalesce_tied(bi_context *ctx) { - bi_foreach_instr_global(ctx, I) { - if (!bi_is_tied(I)) continue; + bi_foreach_instr_global(ctx, I) { + if (!bi_is_tied(I)) + continue; - bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); - unsigned n = bi_count_read_registers(I, 0); + bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); + unsigned n = bi_count_read_registers(I, 0); - for (unsigned i = 0; i < n; ++i) { - bi_index dst = I->dest[0], src = I->src[0]; + for (unsigned i = 0; i < n; ++i) { + bi_index dst = I->dest[0], src = I->src[0]; - assert(dst.offset == 0 && src.offset == 0); - dst.offset = src.offset = i; + assert(dst.offset == 0 && src.offset == 0); + dst.offset = src.offset = i; - bi_mov_i32_to(&b, dst, src); - } + bi_mov_i32_to(&b, dst, src); + } - bi_replace_src(I, 0, I->dest[0]); - } + bi_replace_src(I, 0, I->dest[0]); + } } static unsigned find_or_allocate_temp(unsigned *map, unsigned value, unsigned *alloc) { - if (!map[value]) - map[value] = ++(*alloc); + if (!map[value]) + map[value] = ++(*alloc); - assert(map[value]); - return map[value] - 1; + assert(map[value]); + return map[value] - 1; } /* Reassigns numbering to get rid of gaps in the indices and to prioritize @@ -907,18 +919,20 @@ find_or_allocate_temp(unsigned *map, unsigned value, unsigned *alloc) static void squeeze_index(bi_context *ctx) { - unsigned *map = rzalloc_array(ctx, unsigned, ctx->ssa_alloc); - ctx->ssa_alloc = 0; + unsigned *map = rzalloc_array(ctx, unsigned, ctx->ssa_alloc); + ctx->ssa_alloc = 0; - bi_foreach_instr_global(ctx, I) { - bi_foreach_dest(I, d) - I->dest[d].value = find_or_allocate_temp(map, I->dest[d].value, &ctx->ssa_alloc); + bi_foreach_instr_global(ctx, I) { + bi_foreach_dest(I, d) + I->dest[d].value = + find_or_allocate_temp(map, I->dest[d].value, &ctx->ssa_alloc); - bi_foreach_ssa_src(I, s) - I->src[s].value = find_or_allocate_temp(map, I->src[s].value, &ctx->ssa_alloc); - } + bi_foreach_ssa_src(I, s) + I->src[s].value = + find_or_allocate_temp(map, I->src[s].value, &ctx->ssa_alloc); + } - ralloc_free(map); + ralloc_free(map); } /* @@ -929,203 +943,211 @@ squeeze_index(bi_context *ctx) static unsigned bi_out_of_ssa(bi_context *ctx) { - bi_index zero = 
bi_fau(BIR_FAU_IMMEDIATE | 0, false); - unsigned first_reg = ctx->ssa_alloc; + bi_index zero = bi_fau(BIR_FAU_IMMEDIATE | 0, false); + unsigned first_reg = ctx->ssa_alloc; - /* Trivially lower phis */ - bi_foreach_block(ctx, block) { - bi_foreach_instr_in_block_safe(block, I) { - if (I->op != BI_OPCODE_PHI) - break; + /* Trivially lower phis */ + bi_foreach_block(ctx, block) { + bi_foreach_instr_in_block_safe(block, I) { + if (I->op != BI_OPCODE_PHI) + break; - /* Assign a register for the phi */ - bi_index reg = bi_temp(ctx); - assert(reg.value >= first_reg); + /* Assign a register for the phi */ + bi_index reg = bi_temp(ctx); + assert(reg.value >= first_reg); - /* Lower to a move in each predecessor. The destinations - * cannot interfere so these can be sequentialized - * in arbitrary order. - */ - bi_foreach_predecessor(block, pred) { - bi_builder b = bi_init_builder(ctx, bi_after_block_logical(*pred)); - unsigned i = bi_predecessor_index(block, *pred); + /* Lower to a move in each predecessor. The destinations + * cannot interfere so these can be sequentialized + * in arbitrary order. + */ + bi_foreach_predecessor(block, pred) { + bi_builder b = bi_init_builder(ctx, bi_after_block_logical(*pred)); + unsigned i = bi_predecessor_index(block, *pred); - assert(!I->src[i].abs); - assert(!I->src[i].neg); - assert(I->src[i].swizzle == BI_SWIZZLE_H01); + assert(!I->src[i].abs); + assert(!I->src[i].neg); + assert(I->src[i].swizzle == BI_SWIZZLE_H01); - /* MOV of immediate needs lowering on Valhall */ - if (ctx->arch >= 9 && I->src[i].type == BI_INDEX_CONSTANT) - bi_iadd_imm_i32_to(&b, reg, zero, I->src[i].value); - else - bi_mov_i32_to(&b, reg, I->src[i]); - } + /* MOV of immediate needs lowering on Valhall */ + if (ctx->arch >= 9 && I->src[i].type == BI_INDEX_CONSTANT) + bi_iadd_imm_i32_to(&b, reg, zero, I->src[i].value); + else + bi_mov_i32_to(&b, reg, I->src[i]); + } - /* Replace the phi with a move */ - bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); - bi_mov_i32_to(&b, I->dest[0], reg); - bi_remove_instruction(I); + /* Replace the phi with a move */ + bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); + bi_mov_i32_to(&b, I->dest[0], reg); + bi_remove_instruction(I); - /* Propagate that move within the block. The destination - * is SSA and the source is not written in this block, - * so this is legal. The move itself will be DCE'd if - * possible in the next pass. - */ - bi_foreach_instr_in_block_rev(block, prop) { - if (prop->op == BI_OPCODE_PHI) - break; + /* Propagate that move within the block. The destination + * is SSA and the source is not written in this block, + * so this is legal. The move itself will be DCE'd if + * possible in the next pass. + */ + bi_foreach_instr_in_block_rev(block, prop) { + if (prop->op == BI_OPCODE_PHI) + break; - bi_foreach_src(prop, s) { - if (bi_is_equiv(prop->src[s], I->dest[0])) { - bi_replace_src(prop, s, reg); - } - } - } - } - } + bi_foreach_src(prop, s) { + if (bi_is_equiv(prop->src[s], I->dest[0])) { + bi_replace_src(prop, s, reg); + } + } + } + } + } - /* Try to locally propagate the moves we created. We need to be extra - * careful because we're not in SSA at this point, as such this - * algorithm is quadratic. This will go away when we go out of SSA after - * RA. - */ - BITSET_WORD *used = calloc(sizeof(BITSET_WORD), BITSET_WORDS(ctx->ssa_alloc)); - BITSET_WORD *multiple_uses = calloc(sizeof(BITSET_WORD), BITSET_WORDS(ctx->ssa_alloc)); + /* Try to locally propagate the moves we created. 
We need to be extra + * careful because we're not in SSA at this point, as such this + * algorithm is quadratic. This will go away when we go out of SSA after + * RA. + */ + BITSET_WORD *used = + calloc(sizeof(BITSET_WORD), BITSET_WORDS(ctx->ssa_alloc)); + BITSET_WORD *multiple_uses = + calloc(sizeof(BITSET_WORD), BITSET_WORDS(ctx->ssa_alloc)); - bi_foreach_instr_global(ctx, I) { - bi_foreach_ssa_src(I, s) { - if (BITSET_TEST(used, I->src[s].value)) - BITSET_SET(multiple_uses, I->src[s].value); - else - BITSET_SET(used, I->src[s].value); - } - } + bi_foreach_instr_global(ctx, I) { + bi_foreach_ssa_src(I, s) { + if (BITSET_TEST(used, I->src[s].value)) + BITSET_SET(multiple_uses, I->src[s].value); + else + BITSET_SET(used, I->src[s].value); + } + } - bi_foreach_block(ctx, block) { - bi_foreach_instr_in_block_safe_rev(block, mov) { - /* Match "reg = ssa" */ - if (mov->op != BI_OPCODE_MOV_I32) continue; - if (mov->dest[0].type != BI_INDEX_NORMAL) continue; - if (mov->dest[0].value < first_reg) continue; - if (!bi_is_ssa(mov->src[0])) continue; - if (mov->src[0].value >= first_reg) continue; - if (BITSET_TEST(multiple_uses, mov->src[0].value)) continue; + bi_foreach_block(ctx, block) { + bi_foreach_instr_in_block_safe_rev(block, mov) { + /* Match "reg = ssa" */ + if (mov->op != BI_OPCODE_MOV_I32) + continue; + if (mov->dest[0].type != BI_INDEX_NORMAL) + continue; + if (mov->dest[0].value < first_reg) + continue; + if (!bi_is_ssa(mov->src[0])) + continue; + if (mov->src[0].value >= first_reg) + continue; + if (BITSET_TEST(multiple_uses, mov->src[0].value)) + continue; - bool found = false; + bool found = false; - /* Look locally for the write of the SSA */ - bi_foreach_instr_in_block_rev(block, I) { - bool bail = false; + /* Look locally for the write of the SSA */ + bi_foreach_instr_in_block_rev(block, I) { + bool bail = false; - bi_foreach_src(I, s) { - /* Bail: write-after-read */ - if (bi_is_equiv(I->src[s], mov->dest[0])) - bail = true; - } + bi_foreach_src(I, s) { + /* Bail: write-after-read */ + if (bi_is_equiv(I->src[s], mov->dest[0])) + bail = true; + } - if (bail) - break; + if (bail) + break; - bi_foreach_dest(I, d) { - /* Bail: write-after-write */ - if (bi_is_equiv(I->dest[d], mov->dest[0])) - break; + bi_foreach_dest(I, d) { + /* Bail: write-after-write */ + if (bi_is_equiv(I->dest[d], mov->dest[0])) + break; - if (!bi_is_equiv(I->dest[d], mov->src[0])) - continue; + if (!bi_is_equiv(I->dest[d], mov->src[0])) + continue; - /* We found it, replace */ - I->dest[d] = bi_replace_index(I->dest[d], mov->dest[0]); - found = true; - break; - } + /* We found it, replace */ + I->dest[d] = bi_replace_index(I->dest[d], mov->dest[0]); + found = true; + break; + } - if (found) - break; - } + if (found) + break; + } - if (found) - bi_remove_instruction(mov); - } - } + if (found) + bi_remove_instruction(mov); + } + } - free(used); - free(multiple_uses); - return first_reg; + free(used); + free(multiple_uses); + return first_reg; } void bi_register_allocate(bi_context *ctx) { - struct lcra_state *l = NULL; - bool success = false; + struct lcra_state *l = NULL; + bool success = false; - unsigned iter_count = 1000; /* max iterations */ + unsigned iter_count = 1000; /* max iterations */ - /* Number of bytes of memory we've spilled into */ - unsigned spill_count = ctx->info.tls_size; + /* Number of bytes of memory we've spilled into */ + unsigned spill_count = ctx->info.tls_size; - if (ctx->arch >= 9) - va_lower_split_64bit(ctx); + if (ctx->arch >= 9) + va_lower_split_64bit(ctx); - /* Lower tied 
operands. SSA is broken from here on. */ - unsigned first_reg = bi_out_of_ssa(ctx); - bi_lower_vector(ctx, first_reg); - bi_coalesce_tied(ctx); - squeeze_index(ctx); + /* Lower tied operands. SSA is broken from here on. */ + unsigned first_reg = bi_out_of_ssa(ctx); + bi_lower_vector(ctx, first_reg); + bi_coalesce_tied(ctx); + squeeze_index(ctx); - /* Try with reduced register pressure to improve thread count */ - if (ctx->arch >= 7) { - l = bi_allocate_registers(ctx, &success, false); + /* Try with reduced register pressure to improve thread count */ + if (ctx->arch >= 7) { + l = bi_allocate_registers(ctx, &success, false); - if (success) { - ctx->info.work_reg_count = 32; - } else { - lcra_free(l); - l = NULL; - } - } + if (success) { + ctx->info.work_reg_count = 32; + } else { + lcra_free(l); + l = NULL; + } + } - /* Otherwise, use the register file and spill until we succeed */ - while (!success && ((iter_count--) > 0)) { - l = bi_allocate_registers(ctx, &success, true); + /* Otherwise, use the register file and spill until we succeed */ + while (!success && ((iter_count--) > 0)) { + l = bi_allocate_registers(ctx, &success, true); - if (success) { - ctx->info.work_reg_count = 64; - } else { - signed spill_node = bi_choose_spill_node(ctx, l); - lcra_free(l); - l = NULL; + if (success) { + ctx->info.work_reg_count = 64; + } else { + signed spill_node = bi_choose_spill_node(ctx, l); + lcra_free(l); + l = NULL; - if (spill_node == -1) - unreachable("Failed to choose spill node\n"); + if (spill_node == -1) + unreachable("Failed to choose spill node\n"); - if (ctx->inputs->is_blend) - unreachable("Blend shaders may not spill"); + if (ctx->inputs->is_blend) + unreachable("Blend shaders may not spill"); - /* By default, we use packed TLS addressing on Valhall. - * We cannot cross 16 byte boundaries with packed TLS - * addressing. Align to ensure this doesn't happen. This - * could be optimized a bit. - */ - if (ctx->arch >= 9) - spill_count = ALIGN_POT(spill_count, 16); + /* By default, we use packed TLS addressing on Valhall. + * We cannot cross 16 byte boundaries with packed TLS + * addressing. Align to ensure this doesn't happen. This + * could be optimized a bit. + */ + if (ctx->arch >= 9) + spill_count = ALIGN_POT(spill_count, 16); - spill_count += bi_spill_register(ctx, - bi_get_index(spill_node), spill_count); + spill_count += + bi_spill_register(ctx, bi_get_index(spill_node), spill_count); - /* In case the spill affected an instruction with tied - * operands, we need to fix up. - */ - bi_coalesce_tied(ctx); - } - } + /* In case the spill affected an instruction with tied + * operands, we need to fix up. 
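/* Illustrative aside, not part of the patch: the allocate/spill retry
 * strategy of bi_register_allocate above, with the allocator replaced by a
 * stub. try_allocate() is an invented stand-in, and the arch gate and
 * spill-offset bookkeeping are dropped; only the control flow is shown. */
#include <stdbool.h>
#include <stdio.h>

static bool
try_allocate(bool reduced_pressure, unsigned spilled)
{
   /* Stub: pretend allocation only fits on the full file and only after
    * two values have been spilled. */
   return !reduced_pressure && spilled >= 2;
}

int
main(void)
{
   unsigned spilled = 0;
   unsigned work_reg_count = 0;

   /* First try half the register file: fewer work registers per thread
    * means higher occupancy if it succeeds. */
   if (try_allocate(true, spilled)) {
      work_reg_count = 32;
   } else {
      /* Otherwise take the whole file and spill until allocation fits. */
      for (unsigned iter = 0; iter < 1000 && !work_reg_count; ++iter) {
         if (try_allocate(false, spilled))
            work_reg_count = 64;
         else
            spilled++;
      }
   }

   printf("work registers: %u, values spilled: %u\n", work_reg_count, spilled);
   return 0;
}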
+ */ + bi_coalesce_tied(ctx); + } + } - assert(success); - assert(l != NULL); + assert(success); + assert(l != NULL); - ctx->info.tls_size = spill_count; - bi_install_registers(ctx, l); + ctx->info.tls_size = spill_count; + bi_install_registers(ctx, l); - lcra_free(l); + lcra_free(l); } diff --git a/src/panfrost/bifrost/bi_schedule.c b/src/panfrost/bifrost/bi_schedule.c index 877d57aaadf..f55e364f928 100644 --- a/src/panfrost/bifrost/bi_schedule.c +++ b/src/panfrost/bifrost/bi_schedule.c @@ -24,120 +24,120 @@ * Alyssa Rosenzweig */ -#include "compiler.h" #include "bi_builder.h" +#include "compiler.h" /* Arguments common to worklist, passed by value for convenience */ struct bi_worklist { - /* # of instructions in the block */ - unsigned count; + /* # of instructions in the block */ + unsigned count; - /* Instructions in the block */ - bi_instr **instructions; + /* Instructions in the block */ + bi_instr **instructions; - /* Bitset of instructions in the block ready for scheduling */ - BITSET_WORD *worklist; + /* Bitset of instructions in the block ready for scheduling */ + BITSET_WORD *worklist; - /* The backwards dependency graph. nr_dependencies is the number of - * unscheduled instructions that must still be scheduled after (before) - * this instruction. dependents are which instructions need to be - * scheduled before (after) this instruction. */ - unsigned *dep_counts; - BITSET_WORD **dependents; + /* The backwards dependency graph. nr_dependencies is the number of + * unscheduled instructions that must still be scheduled after (before) + * this instruction. dependents are which instructions need to be + * scheduled before (after) this instruction. */ + unsigned *dep_counts; + BITSET_WORD **dependents; }; /* State of a single tuple and clause under construction */ struct bi_reg_state { - /* Number of register writes */ - unsigned nr_writes; + /* Number of register writes */ + unsigned nr_writes; - /* Register reads, expressed as (equivalence classes of) - * sources. Only 3 reads are allowed, but up to 2 may spill as - * "forced" for the next scheduled tuple, provided such a tuple - * can be constructed */ - bi_index reads[5]; - unsigned nr_reads; + /* Register reads, expressed as (equivalence classes of) + * sources. Only 3 reads are allowed, but up to 2 may spill as + * "forced" for the next scheduled tuple, provided such a tuple + * can be constructed */ + bi_index reads[5]; + unsigned nr_reads; - /* The previous tuple scheduled (= the next tuple executed in the - * program) may require certain writes, in order to bypass the register - * file and use a temporary passthrough for the value. Up to 2 such - * constraints are architecturally satisfiable */ - unsigned forced_count; - bi_index forceds[2]; + /* The previous tuple scheduled (= the next tuple executed in the + * program) may require certain writes, in order to bypass the register + * file and use a temporary passthrough for the value. 
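/* Illustrative aside, not part of the patch: how the dep_counts/dependents
 * fields of bi_worklist above drive scheduling. An instruction becomes
 * available once every instruction it depends on has been picked, i.e. its
 * outstanding-dependency count reaches zero. Fixed-size arrays replace the
 * driver's bitsets, and the data is made up for the sketch. */
#include <stdio.h>

#define N 4

int
main(void)
{
   /* dependents[p][c] != 0 means c must wait for p. */
   int dependents[N][N] = {
      [0] = {0, 1, 0, 0}, /* i1 waits for i0 */
      [1] = {0, 0, 1, 1}, /* i2 and i3 wait for i1 */
   };
   int dep_counts[N] = {0, 1, 1, 1};
   int scheduled[N] = {0};

   for (int picked = 0; picked < N; ++picked) {
      /* Pick any instruction with no outstanding dependencies. */
      int next = -1;
      for (int i = 0; i < N && next < 0; ++i)
         if (!scheduled[i] && dep_counts[i] == 0)
            next = i;

      if (next < 0)
         break;

      printf("schedule i%d\n", next);
      scheduled[next] = 1;

      /* Release everything that was only waiting on it. */
      for (int c = 0; c < N; ++c)
         if (dependents[next][c])
            dep_counts[c]--;
   }

   return 0;
}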
Up to 2 such + * constraints are architecturally satisfiable */ + unsigned forced_count; + bi_index forceds[2]; }; struct bi_tuple_state { - /* Is this the last tuple in the clause */ - bool last; + /* Is this the last tuple in the clause */ + bool last; - /* Scheduled ADD instruction, or null if none */ - bi_instr *add; + /* Scheduled ADD instruction, or null if none */ + bi_instr *add; - /* Reads for previous (succeeding) tuple */ - bi_index prev_reads[5]; - unsigned nr_prev_reads; - bi_tuple *prev; + /* Reads for previous (succeeding) tuple */ + bi_index prev_reads[5]; + unsigned nr_prev_reads; + bi_tuple *prev; - /* Register slot state for current tuple */ - struct bi_reg_state reg; + /* Register slot state for current tuple */ + struct bi_reg_state reg; - /* Constants are shared in the tuple. If constant_count is nonzero, it - * is a size for constant count. Otherwise, fau is the slot read from - * FAU, or zero if none is assigned. Ordinarily FAU slot 0 reads zero, - * but within a tuple, that should be encoded as constant_count != 0 - * and constants[0] = constants[1] = 0 */ - unsigned constant_count; + /* Constants are shared in the tuple. If constant_count is nonzero, it + * is a size for constant count. Otherwise, fau is the slot read from + * FAU, or zero if none is assigned. Ordinarily FAU slot 0 reads zero, + * but within a tuple, that should be encoded as constant_count != 0 + * and constants[0] = constants[1] = 0 */ + unsigned constant_count; - union { - uint32_t constants[2]; - enum bir_fau fau; - }; + union { + uint32_t constants[2]; + enum bir_fau fau; + }; - unsigned pcrel_idx; + unsigned pcrel_idx; }; struct bi_const_state { - unsigned constant_count; - bool pcrel; /* applies to first const */ - uint32_t constants[2]; + unsigned constant_count; + bool pcrel; /* applies to first const */ + uint32_t constants[2]; - /* Index of the constant into the clause */ - unsigned word_idx; + /* Index of the constant into the clause */ + unsigned word_idx; }; enum bi_ftz_state { - /* No flush-to-zero state assigned yet */ - BI_FTZ_STATE_NONE, + /* No flush-to-zero state assigned yet */ + BI_FTZ_STATE_NONE, - /* Never flush-to-zero */ - BI_FTZ_STATE_DISABLE, + /* Never flush-to-zero */ + BI_FTZ_STATE_DISABLE, - /* Always flush-to-zero */ - BI_FTZ_STATE_ENABLE, + /* Always flush-to-zero */ + BI_FTZ_STATE_ENABLE, }; /* At this point, pseudoinstructions have been lowered so sources/destinations * are limited to what's physically supported. */ -#define BI_MAX_PHYS_SRCS 4 +#define BI_MAX_PHYS_SRCS 4 #define BI_MAX_PHYS_DESTS 2 struct bi_clause_state { - /* Has a message-passing instruction already been assigned? */ - bool message; + /* Has a message-passing instruction already been assigned? */ + bool message; - /* Indices already accessed, this needs to be tracked to avoid hazards - * around message-passing instructions */ - unsigned access_count; - bi_index accesses[(BI_MAX_PHYS_SRCS + BI_MAX_PHYS_DESTS) * 16]; + /* Indices already accessed, this needs to be tracked to avoid hazards + * around message-passing instructions */ + unsigned access_count; + bi_index accesses[(BI_MAX_PHYS_SRCS + BI_MAX_PHYS_DESTS) * 16]; - unsigned tuple_count; - struct bi_const_state consts[8]; + unsigned tuple_count; + struct bi_const_state consts[8]; - /* Numerical state of the clause */ - enum bi_ftz_state ftz; + /* Numerical state of the clause */ + enum bi_ftz_state ftz; }; /* Determines messsage type by checking the table and a few special cases. 
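/* Illustrative aside, not part of the patch: a toy model of the "constants
 * are shared in the tuple" idea sketched in bi_tuple_state above. A tuple
 * exposes at most two 32-bit constants, and reusing an identical value
 * costs nothing. This is invented for the example and far simpler than the
 * scheduler's real constant handling (pcrel, FAU slots, 64-bit pairs). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_tuple_consts {
   unsigned count;
   uint32_t values[2];
};

static bool
add_constant(struct toy_tuple_consts *t, uint32_t value)
{
   for (unsigned i = 0; i < t->count; ++i) {
      if (t->values[i] == value)
         return true; /* already available to the whole tuple */
   }

   if (t->count == 2)
      return false; /* no slot left; the instruction must wait */

   t->values[t->count++] = value;
   return true;
}

int
main(void)
{
   struct toy_tuple_consts t = {0};

   /* Prints "1 1 1 0": the duplicate is free, the third distinct value
    * does not fit. */
   printf("%d %d %d %d\n", add_constant(&t, 0xdeadbeef), add_constant(&t, 0),
          add_constant(&t, 0xdeadbeef), add_constant(&t, 42));
   return 0;
}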
Only @@ -148,16 +148,16 @@ struct bi_clause_state { static enum bifrost_message_type bi_message_type_for_instr(bi_instr *ins) { - enum bifrost_message_type msg = bi_opcode_props[ins->op].message; - bool ld_var_special = (ins->op == BI_OPCODE_LD_VAR_SPECIAL); + enum bifrost_message_type msg = bi_opcode_props[ins->op].message; + bool ld_var_special = (ins->op == BI_OPCODE_LD_VAR_SPECIAL); - if (ld_var_special && ins->varying_name == BI_VARYING_NAME_FRAG_Z) - return BIFROST_MESSAGE_Z_STENCIL; + if (ld_var_special && ins->varying_name == BI_VARYING_NAME_FRAG_Z) + return BIFROST_MESSAGE_Z_STENCIL; - if (msg == BIFROST_MESSAGE_LOAD && ins->seg == BI_SEG_UBO) - return BIFROST_MESSAGE_ATTRIBUTE; + if (msg == BIFROST_MESSAGE_LOAD && ins->seg == BI_SEG_UBO) + return BIFROST_MESSAGE_ATTRIBUTE; - return msg; + return msg; } /* Attribute, texture, and UBO load (attribute message) instructions support @@ -166,157 +166,162 @@ bi_message_type_for_instr(bi_instr *ins) ASSERTED static bool bi_supports_dtsel(bi_instr *ins) { - switch (bi_message_type_for_instr(ins)) { - case BIFROST_MESSAGE_ATTRIBUTE: - return ins->op != BI_OPCODE_LD_GCLK_U64; - case BIFROST_MESSAGE_TEX: - return true; - default: - return false; - } + switch (bi_message_type_for_instr(ins)) { + case BIFROST_MESSAGE_ATTRIBUTE: + return ins->op != BI_OPCODE_LD_GCLK_U64; + case BIFROST_MESSAGE_TEX: + return true; + default: + return false; + } } /* Adds an edge to the dependency graph */ static void -bi_push_dependency(unsigned parent, unsigned child, - BITSET_WORD **dependents, unsigned *dep_counts) +bi_push_dependency(unsigned parent, unsigned child, BITSET_WORD **dependents, + unsigned *dep_counts) { - if (!BITSET_TEST(dependents[parent], child)) { - BITSET_SET(dependents[parent], child); - dep_counts[child]++; - } + if (!BITSET_TEST(dependents[parent], child)) { + BITSET_SET(dependents[parent], child); + dep_counts[child]++; + } } static void add_dependency(struct util_dynarray *table, unsigned index, unsigned child, - BITSET_WORD **dependents, unsigned *dep_counts) + BITSET_WORD **dependents, unsigned *dep_counts) { - assert(index < 64); - util_dynarray_foreach(table + index, unsigned, parent) - bi_push_dependency(*parent, child, dependents, dep_counts); + assert(index < 64); + util_dynarray_foreach(table + index, unsigned, parent) + bi_push_dependency(*parent, child, dependents, dep_counts); } static void mark_access(struct util_dynarray *table, unsigned index, unsigned parent) { - assert(index < 64); - util_dynarray_append(&table[index], unsigned, parent); + assert(index < 64); + util_dynarray_append(&table[index], unsigned, parent); } static bool bi_is_sched_barrier(bi_instr *I) { - switch (I->op) { - case BI_OPCODE_BARRIER: - case BI_OPCODE_DISCARD_F32: - return true; - default: - return false; - } + switch (I->op) { + case BI_OPCODE_BARRIER: + case BI_OPCODE_DISCARD_F32: + return true; + default: + return false; + } } static void bi_create_dependency_graph(struct bi_worklist st, bool inorder, bool is_blend) { - struct util_dynarray last_read[64], last_write[64]; + struct util_dynarray last_read[64], last_write[64]; - for (unsigned i = 0; i < 64; ++i) { - util_dynarray_init(&last_read[i], NULL); - util_dynarray_init(&last_write[i], NULL); - } + for (unsigned i = 0; i < 64; ++i) { + util_dynarray_init(&last_read[i], NULL); + util_dynarray_init(&last_write[i], NULL); + } - /* Initialize dependency graph */ - for (unsigned i = 0; i < st.count; ++i) { - st.dependents[i] = - calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD)); + /* 
Initialize dependency graph */ + for (unsigned i = 0; i < st.count; ++i) { + st.dependents[i] = calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD)); - st.dep_counts[i] = 0; - } + st.dep_counts[i] = 0; + } - unsigned prev_msg = ~0; + unsigned prev_msg = ~0; - /* Populate dependency graph */ - for (signed i = st.count - 1; i >= 0; --i) { - bi_instr *ins = st.instructions[i]; + /* Populate dependency graph */ + for (signed i = st.count - 1; i >= 0; --i) { + bi_instr *ins = st.instructions[i]; - bi_foreach_src(ins, s) { - if (ins->src[s].type != BI_INDEX_REGISTER) continue; - unsigned count = bi_count_read_registers(ins, s); + bi_foreach_src(ins, s) { + if (ins->src[s].type != BI_INDEX_REGISTER) + continue; + unsigned count = bi_count_read_registers(ins, s); - for (unsigned c = 0; c < count; ++c) - add_dependency(last_write, ins->src[s].value + c, i, st.dependents, st.dep_counts); - } + for (unsigned c = 0; c < count; ++c) + add_dependency(last_write, ins->src[s].value + c, i, st.dependents, + st.dep_counts); + } - /* Keep message-passing ops in order. (This pass only cares - * about bundling; reordering of message-passing instructions - * happens during earlier scheduling.) */ + /* Keep message-passing ops in order. (This pass only cares + * about bundling; reordering of message-passing instructions + * happens during earlier scheduling.) */ - if (bi_message_type_for_instr(ins)) { - if (prev_msg != ~0) - bi_push_dependency(prev_msg, i, st.dependents, st.dep_counts); + if (bi_message_type_for_instr(ins)) { + if (prev_msg != ~0) + bi_push_dependency(prev_msg, i, st.dependents, st.dep_counts); - prev_msg = i; - } + prev_msg = i; + } - /* Handle schedule barriers, adding All the deps */ - if (inorder || bi_is_sched_barrier(ins)) { - for (unsigned j = 0; j < st.count; ++j) { - if (i == j) continue; + /* Handle schedule barriers, adding All the deps */ + if (inorder || bi_is_sched_barrier(ins)) { + for (unsigned j = 0; j < st.count; ++j) { + if (i == j) + continue; - bi_push_dependency(MAX2(i, j), MIN2(i, j), - st.dependents, st.dep_counts); - } - } + bi_push_dependency(MAX2(i, j), MIN2(i, j), st.dependents, + st.dep_counts); + } + } - bi_foreach_dest(ins, d) { - assert(ins->dest[d].type == BI_INDEX_REGISTER); - unsigned dest = ins->dest[d].value; + bi_foreach_dest(ins, d) { + assert(ins->dest[d].type == BI_INDEX_REGISTER); + unsigned dest = ins->dest[d].value; - unsigned count = bi_count_write_registers(ins, d); + unsigned count = bi_count_write_registers(ins, d); - for (unsigned c = 0; c < count; ++c) { - add_dependency(last_read, dest + c, i, st.dependents, st.dep_counts); - add_dependency(last_write, dest + c, i, st.dependents, st.dep_counts); - mark_access(last_write, dest + c, i); - } - } + for (unsigned c = 0; c < count; ++c) { + add_dependency(last_read, dest + c, i, st.dependents, + st.dep_counts); + add_dependency(last_write, dest + c, i, st.dependents, + st.dep_counts); + mark_access(last_write, dest + c, i); + } + } - /* Blend shaders are allowed to clobber R0-R15. Treat these - * registers like extra destinations for scheduling purposes. - */ - if (ins->op == BI_OPCODE_BLEND && !is_blend) { - for (unsigned c = 0; c < 16; ++c) { - add_dependency(last_read, c, i, st.dependents, st.dep_counts); - add_dependency(last_write, c, i, st.dependents, st.dep_counts); - mark_access(last_write, c, i); - } - } + /* Blend shaders are allowed to clobber R0-R15. Treat these + * registers like extra destinations for scheduling purposes. 
+ */ + if (ins->op == BI_OPCODE_BLEND && !is_blend) { + for (unsigned c = 0; c < 16; ++c) { + add_dependency(last_read, c, i, st.dependents, st.dep_counts); + add_dependency(last_write, c, i, st.dependents, st.dep_counts); + mark_access(last_write, c, i); + } + } - bi_foreach_src(ins, s) { - if (ins->src[s].type != BI_INDEX_REGISTER) continue; + bi_foreach_src(ins, s) { + if (ins->src[s].type != BI_INDEX_REGISTER) + continue; - unsigned count = bi_count_read_registers(ins, s); + unsigned count = bi_count_read_registers(ins, s); - for (unsigned c = 0; c < count; ++c) - mark_access(last_read, ins->src[s].value + c, i); - } - } + for (unsigned c = 0; c < count; ++c) + mark_access(last_read, ins->src[s].value + c, i); + } + } - /* If there is a branch, all instructions depend on it, as interblock - * execution must be purely in-order */ + /* If there is a branch, all instructions depend on it, as interblock + * execution must be purely in-order */ - bi_instr *last = st.instructions[st.count - 1]; - if (last->branch_target || last->op == BI_OPCODE_JUMP) { - for (signed i = st.count - 2; i >= 0; --i) - bi_push_dependency(st.count - 1, i, st.dependents, st.dep_counts); - } + bi_instr *last = st.instructions[st.count - 1]; + if (last->branch_target || last->op == BI_OPCODE_JUMP) { + for (signed i = st.count - 2; i >= 0; --i) + bi_push_dependency(st.count - 1, i, st.dependents, st.dep_counts); + } - /* Free the intermediate structures */ - for (unsigned i = 0; i < 64; ++i) { - util_dynarray_fini(&last_read[i]); - util_dynarray_fini(&last_write[i]); - } + /* Free the intermediate structures */ + for (unsigned i = 0; i < 64; ++i) { + util_dynarray_fini(&last_read[i]); + util_dynarray_fini(&last_write[i]); + } } /* Scheduler pseudoinstruction lowerings to enable instruction pairings. @@ -324,22 +329,22 @@ bi_create_dependency_graph(struct bi_worklist st, bool inorder, bool is_blend) */ static bi_instr * -bi_lower_cubeface(bi_context *ctx, - struct bi_clause_state *clause, struct bi_tuple_state *tuple) +bi_lower_cubeface(bi_context *ctx, struct bi_clause_state *clause, + struct bi_tuple_state *tuple) { - bi_instr *pinstr = tuple->add; - bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr)); - bi_instr *cubeface1 = bi_cubeface1_to(&b, pinstr->dest[0], - pinstr->src[0], pinstr->src[1], pinstr->src[2]); + bi_instr *pinstr = tuple->add; + bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr)); + bi_instr *cubeface1 = bi_cubeface1_to(&b, pinstr->dest[0], pinstr->src[0], + pinstr->src[1], pinstr->src[2]); - pinstr->op = BI_OPCODE_CUBEFACE2; - pinstr->dest[0] = pinstr->dest[1]; - bi_drop_dests(pinstr, 1); + pinstr->op = BI_OPCODE_CUBEFACE2; + pinstr->dest[0] = pinstr->dest[1]; + bi_drop_dests(pinstr, 1); - pinstr->src[0] = cubeface1->dest[0]; - bi_drop_srcs(pinstr, 1); + pinstr->src[0] = cubeface1->dest[0]; + bi_drop_srcs(pinstr, 1); - return cubeface1; + return cubeface1; } /* Psuedo arguments are (rbase, address lo, address hi). 
We need *ATOM_C.i32 to @@ -347,83 +352,81 @@ bi_lower_cubeface(bi_context *ctx, * arguments (rbase, address lo, address hi, rbase) */ static bi_instr * -bi_lower_atom_c(bi_context *ctx, struct bi_clause_state *clause, struct - bi_tuple_state *tuple) +bi_lower_atom_c(bi_context *ctx, struct bi_clause_state *clause, + struct bi_tuple_state *tuple) { - bi_instr *pinstr = tuple->add; - bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr)); - bi_instr *atom_c = bi_atom_c_return_i32(&b, - pinstr->src[1], pinstr->src[2], pinstr->src[0], - pinstr->atom_opc); + bi_instr *pinstr = tuple->add; + bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr)); + bi_instr *atom_c = bi_atom_c_return_i32(&b, pinstr->src[1], pinstr->src[2], + pinstr->src[0], pinstr->atom_opc); - if (bi_is_null(pinstr->dest[0])) - atom_c->op = BI_OPCODE_ATOM_C_I32; + if (bi_is_null(pinstr->dest[0])) + atom_c->op = BI_OPCODE_ATOM_C_I32; - bi_instr *atom_cx = bi_atom_cx_to(&b, pinstr->dest[0], pinstr->src[0], - pinstr->src[1], pinstr->src[2], pinstr->src[0], - pinstr->sr_count); - tuple->add = atom_cx; - bi_remove_instruction(pinstr); + bi_instr *atom_cx = + bi_atom_cx_to(&b, pinstr->dest[0], pinstr->src[0], pinstr->src[1], + pinstr->src[2], pinstr->src[0], pinstr->sr_count); + tuple->add = atom_cx; + bi_remove_instruction(pinstr); - return atom_c; + return atom_c; } static bi_instr * -bi_lower_atom_c1(bi_context *ctx, struct bi_clause_state *clause, struct - bi_tuple_state *tuple) +bi_lower_atom_c1(bi_context *ctx, struct bi_clause_state *clause, + struct bi_tuple_state *tuple) { - bi_instr *pinstr = tuple->add; - bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr)); - bi_instr *atom_c = bi_atom_c1_return_i32(&b, - pinstr->src[0], pinstr->src[1], pinstr->atom_opc); + bi_instr *pinstr = tuple->add; + bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr)); + bi_instr *atom_c = bi_atom_c1_return_i32(&b, pinstr->src[0], pinstr->src[1], + pinstr->atom_opc); - if (bi_is_null(pinstr->dest[0])) - atom_c->op = BI_OPCODE_ATOM_C1_I32; + if (bi_is_null(pinstr->dest[0])) + atom_c->op = BI_OPCODE_ATOM_C1_I32; + bi_instr *atom_cx = + bi_atom_cx_to(&b, pinstr->dest[0], bi_null(), pinstr->src[0], + pinstr->src[1], bi_dontcare(&b), pinstr->sr_count); + tuple->add = atom_cx; + bi_remove_instruction(pinstr); - bi_instr *atom_cx = bi_atom_cx_to(&b, pinstr->dest[0], bi_null(), - pinstr->src[0], pinstr->src[1], bi_dontcare(&b), - pinstr->sr_count); - tuple->add = atom_cx; - bi_remove_instruction(pinstr); - - return atom_c; + return atom_c; } static bi_instr * -bi_lower_seg_add(bi_context *ctx, - struct bi_clause_state *clause, struct bi_tuple_state *tuple) +bi_lower_seg_add(bi_context *ctx, struct bi_clause_state *clause, + struct bi_tuple_state *tuple) { - bi_instr *pinstr = tuple->add; - bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr)); + bi_instr *pinstr = tuple->add; + bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr)); - bi_instr *fma = bi_seg_add_to(&b, pinstr->dest[0], pinstr->src[0], - pinstr->preserve_null, pinstr->seg); + bi_instr *fma = bi_seg_add_to(&b, pinstr->dest[0], pinstr->src[0], + pinstr->preserve_null, pinstr->seg); - pinstr->op = BI_OPCODE_SEG_ADD; - pinstr->src[0] = pinstr->src[1]; - bi_drop_srcs(pinstr, 1); + pinstr->op = BI_OPCODE_SEG_ADD; + pinstr->src[0] = pinstr->src[1]; + bi_drop_srcs(pinstr, 1); - assert(pinstr->dest[0].type == BI_INDEX_REGISTER); - pinstr->dest[0].value += 1; + assert(pinstr->dest[0].type == BI_INDEX_REGISTER); + pinstr->dest[0].value += 1; - return fma; + 
return fma; } static bi_instr * -bi_lower_dtsel(bi_context *ctx, - struct bi_clause_state *clause, struct bi_tuple_state *tuple) +bi_lower_dtsel(bi_context *ctx, struct bi_clause_state *clause, + struct bi_tuple_state *tuple) { - bi_instr *add = tuple->add; - bi_builder b = bi_init_builder(ctx, bi_before_instr(add)); + bi_instr *add = tuple->add; + bi_builder b = bi_init_builder(ctx, bi_before_instr(add)); - bi_instr *dtsel = bi_dtsel_imm_to(&b, bi_temp(b.shader), - add->src[0], add->table); - assert(add->nr_srcs >= 1); - add->src[0] = dtsel->dest[0]; + bi_instr *dtsel = + bi_dtsel_imm_to(&b, bi_temp(b.shader), add->src[0], add->table); + assert(add->nr_srcs >= 1); + add->src[0] = dtsel->dest[0]; - assert(bi_supports_dtsel(add)); - return dtsel; + assert(bi_supports_dtsel(add)); + return dtsel; } /* Flatten linked list to array for O(1) indexing */ @@ -431,18 +434,18 @@ bi_lower_dtsel(bi_context *ctx, static bi_instr ** bi_flatten_block(bi_block *block, unsigned *len) { - if (list_is_empty(&block->instructions)) - return NULL; + if (list_is_empty(&block->instructions)) + return NULL; - *len = list_length(&block->instructions); - bi_instr **instructions = malloc(sizeof(bi_instr *) * (*len)); + *len = list_length(&block->instructions); + bi_instr **instructions = malloc(sizeof(bi_instr *) * (*len)); - unsigned i = 0; + unsigned i = 0; - bi_foreach_instr_in_block(block, ins) - instructions[i++] = ins; + bi_foreach_instr_in_block(block, ins) + instructions[i++] = ins; - return instructions; + return instructions; } /* The worklist would track instructions without outstanding dependencies. For @@ -452,56 +455,56 @@ bi_flatten_block(bi_block *block, unsigned *len) static struct bi_worklist bi_initialize_worklist(bi_block *block, bool inorder, bool is_blend) { - struct bi_worklist st = { }; - st.instructions = bi_flatten_block(block, &st.count); + struct bi_worklist st = {}; + st.instructions = bi_flatten_block(block, &st.count); - if (!st.count) - return st; + if (!st.count) + return st; - st.dependents = calloc(st.count, sizeof(st.dependents[0])); - st.dep_counts = calloc(st.count, sizeof(st.dep_counts[0])); + st.dependents = calloc(st.count, sizeof(st.dependents[0])); + st.dep_counts = calloc(st.count, sizeof(st.dep_counts[0])); - bi_create_dependency_graph(st, inorder, is_blend); - st.worklist = calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD)); + bi_create_dependency_graph(st, inorder, is_blend); + st.worklist = calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD)); - for (unsigned i = 0; i < st.count; ++i) { - if (st.dep_counts[i] == 0) - BITSET_SET(st.worklist, i); - } + for (unsigned i = 0; i < st.count; ++i) { + if (st.dep_counts[i] == 0) + BITSET_SET(st.worklist, i); + } - return st; + return st; } static void bi_free_worklist(struct bi_worklist st) { - free(st.dep_counts); - free(st.dependents); - free(st.instructions); - free(st.worklist); + free(st.dep_counts); + free(st.dependents); + free(st.instructions); + free(st.worklist); } static void bi_update_worklist(struct bi_worklist st, unsigned idx) { - assert(st.dep_counts[idx] == 0); + assert(st.dep_counts[idx] == 0); - if (!st.dependents[idx]) - return; + if (!st.dependents[idx]) + return; - /* Iterate each dependent to remove one dependency (`done`), - * adding dependents to the worklist where possible. */ + /* Iterate each dependent to remove one dependency (`done`), + * adding dependents to the worklist where possible. 
*/ - unsigned i; - BITSET_FOREACH_SET(i, st.dependents[idx], st.count) { - assert(st.dep_counts[i] != 0); - unsigned new_deps = --st.dep_counts[i]; + unsigned i; + BITSET_FOREACH_SET(i, st.dependents[idx], st.count) { + assert(st.dep_counts[i] != 0); + unsigned new_deps = --st.dep_counts[i]; - if (new_deps == 0) - BITSET_SET(st.worklist, i); - } + if (new_deps == 0) + BITSET_SET(st.worklist, i); + } - free(st.dependents[idx]); + free(st.dependents[idx]); } /* Scheduler predicates */ @@ -510,9 +513,9 @@ bi_update_worklist(struct bi_worklist st, unsigned idx) static bool bi_can_iaddc(bi_instr *ins) { - return (ins->op == BI_OPCODE_IADD_U32 && !ins->saturate && - ins->src[0].swizzle == BI_SWIZZLE_H01 && - ins->src[1].swizzle == BI_SWIZZLE_H01); + return (ins->op == BI_OPCODE_IADD_U32 && !ins->saturate && + ins->src[0].swizzle == BI_SWIZZLE_H01 && + ins->src[1].swizzle == BI_SWIZZLE_H01); } /* @@ -523,57 +526,57 @@ bi_can_iaddc(bi_instr *ins) static bool bi_impacted_abs(bi_instr *I) { - return I->src[0].abs && I->src[1].abs && - bi_is_word_equiv(I->src[0], I->src[1]); + return I->src[0].abs && I->src[1].abs && + bi_is_word_equiv(I->src[0], I->src[1]); } bool bi_can_fma(bi_instr *ins) { - /* +IADD.i32 -> *IADDC.i32 */ - if (bi_can_iaddc(ins)) - return true; + /* +IADD.i32 -> *IADDC.i32 */ + if (bi_can_iaddc(ins)) + return true; - /* +MUX -> *CSEL */ - if (bi_can_replace_with_csel(ins)) - return true; + /* +MUX -> *CSEL */ + if (bi_can_replace_with_csel(ins)) + return true; - /* *FADD.v2f16 has restricted abs modifiers, use +FADD.v2f16 instead */ - if (ins->op == BI_OPCODE_FADD_V2F16 && bi_impacted_abs(ins)) - return false; + /* *FADD.v2f16 has restricted abs modifiers, use +FADD.v2f16 instead */ + if (ins->op == BI_OPCODE_FADD_V2F16 && bi_impacted_abs(ins)) + return false; - /* TODO: some additional fp16 constraints */ - return bi_opcode_props[ins->op].fma; + /* TODO: some additional fp16 constraints */ + return bi_opcode_props[ins->op].fma; } static bool bi_impacted_fadd_widens(bi_instr *I) { - enum bi_swizzle swz0 = I->src[0].swizzle; - enum bi_swizzle swz1 = I->src[1].swizzle; + enum bi_swizzle swz0 = I->src[0].swizzle; + enum bi_swizzle swz1 = I->src[1].swizzle; - return (swz0 == BI_SWIZZLE_H00 && swz1 == BI_SWIZZLE_H11) || - (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H11) || - (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H00); + return (swz0 == BI_SWIZZLE_H00 && swz1 == BI_SWIZZLE_H11) || + (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H11) || + (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H00); } bool bi_can_add(bi_instr *ins) { - /* +FADD.v2f16 lacks clamp modifier, use *FADD.v2f16 instead */ - if (ins->op == BI_OPCODE_FADD_V2F16 && ins->clamp) - return false; + /* +FADD.v2f16 lacks clamp modifier, use *FADD.v2f16 instead */ + if (ins->op == BI_OPCODE_FADD_V2F16 && ins->clamp) + return false; - /* +FCMP.v2f16 lacks abs modifier, use *FCMP.v2f16 instead */ - if (ins->op == BI_OPCODE_FCMP_V2F16 && (ins->src[0].abs || ins->src[1].abs)) - return false; + /* +FCMP.v2f16 lacks abs modifier, use *FCMP.v2f16 instead */ + if (ins->op == BI_OPCODE_FCMP_V2F16 && (ins->src[0].abs || ins->src[1].abs)) + return false; - /* +FADD.f32 has restricted widens, use +FADD.f32 for the full set */ - if (ins->op == BI_OPCODE_FADD_F32 && bi_impacted_fadd_widens(ins)) - return false; + /* +FADD.f32 has restricted widens, use +FADD.f32 for the full set */ + if (ins->op == BI_OPCODE_FADD_F32 && bi_impacted_fadd_widens(ins)) + return false; - /* TODO: some additional fp16 constraints */ - return 
bi_opcode_props[ins->op].add; + /* TODO: some additional fp16 constraints */ + return bi_opcode_props[ins->op].add; } /* Architecturally, no single instruction has a "not last" constraint. However, @@ -589,7 +592,7 @@ bi_can_add(bi_instr *ins) static bool bi_must_not_last(bi_instr *ins) { - return (ins->nr_dests >= 2) && (ins->op != BI_OPCODE_TEXC_DUAL); + return (ins->nr_dests >= 2) && (ins->op != BI_OPCODE_TEXC_DUAL); } /* Check for a message-passing instruction. +DISCARD.f32 is special-cased; we @@ -601,115 +604,115 @@ bi_must_not_last(bi_instr *ins) bool bi_must_message(bi_instr *ins) { - return (bi_opcode_props[ins->op].message != BIFROST_MESSAGE_NONE) || - (ins->op == BI_OPCODE_DISCARD_F32); + return (bi_opcode_props[ins->op].message != BIFROST_MESSAGE_NONE) || + (ins->op == BI_OPCODE_DISCARD_F32); } static bool bi_fma_atomic(enum bi_opcode op) { - switch (op) { - case BI_OPCODE_ATOM_C_I32: - case BI_OPCODE_ATOM_C_I64: - case BI_OPCODE_ATOM_C1_I32: - case BI_OPCODE_ATOM_C1_I64: - case BI_OPCODE_ATOM_C1_RETURN_I32: - case BI_OPCODE_ATOM_C1_RETURN_I64: - case BI_OPCODE_ATOM_C_RETURN_I32: - case BI_OPCODE_ATOM_C_RETURN_I64: - case BI_OPCODE_ATOM_POST_I32: - case BI_OPCODE_ATOM_POST_I64: - case BI_OPCODE_ATOM_PRE_I64: - return true; - default: - return false; - } + switch (op) { + case BI_OPCODE_ATOM_C_I32: + case BI_OPCODE_ATOM_C_I64: + case BI_OPCODE_ATOM_C1_I32: + case BI_OPCODE_ATOM_C1_I64: + case BI_OPCODE_ATOM_C1_RETURN_I32: + case BI_OPCODE_ATOM_C1_RETURN_I64: + case BI_OPCODE_ATOM_C_RETURN_I32: + case BI_OPCODE_ATOM_C_RETURN_I64: + case BI_OPCODE_ATOM_POST_I32: + case BI_OPCODE_ATOM_POST_I64: + case BI_OPCODE_ATOM_PRE_I64: + return true; + default: + return false; + } } bool bi_reads_zero(bi_instr *ins) { - return !(bi_fma_atomic(ins->op) || ins->op == BI_OPCODE_IMULD); + return !(bi_fma_atomic(ins->op) || ins->op == BI_OPCODE_IMULD); } bool bi_reads_temps(bi_instr *ins, unsigned src) { - switch (ins->op) { - /* Cannot permute a temporary */ - case BI_OPCODE_CLPER_I32: - case BI_OPCODE_CLPER_OLD_I32: - return src != 0; + switch (ins->op) { + /* Cannot permute a temporary */ + case BI_OPCODE_CLPER_I32: + case BI_OPCODE_CLPER_OLD_I32: + return src != 0; - /* ATEST isn't supposed to be restricted, but in practice it always - * wants to source its coverage mask input (source 0) from register 60, - * which won't work properly if we put the input in a temp. This - * requires workarounds in both RA and clause scheduling. - */ - case BI_OPCODE_ATEST: - return src != 0; + /* ATEST isn't supposed to be restricted, but in practice it always + * wants to source its coverage mask input (source 0) from register 60, + * which won't work properly if we put the input in a temp. This + * requires workarounds in both RA and clause scheduling. 
+ */ + case BI_OPCODE_ATEST: + return src != 0; - case BI_OPCODE_IMULD: - return false; - default: - return true; - } + case BI_OPCODE_IMULD: + return false; + default: + return true; + } } static bool bi_impacted_t_modifiers(bi_instr *I, unsigned src) { - assert(src < I->nr_srcs); - enum bi_swizzle swizzle = I->src[src].swizzle; + assert(src < I->nr_srcs); + enum bi_swizzle swizzle = I->src[src].swizzle; - switch (I->op) { - case BI_OPCODE_F16_TO_F32: - case BI_OPCODE_F16_TO_S32: - case BI_OPCODE_F16_TO_U32: - case BI_OPCODE_MKVEC_V2I16: - case BI_OPCODE_S16_TO_F32: - case BI_OPCODE_S16_TO_S32: - case BI_OPCODE_U16_TO_F32: - case BI_OPCODE_U16_TO_U32: - return (swizzle != BI_SWIZZLE_H00); + switch (I->op) { + case BI_OPCODE_F16_TO_F32: + case BI_OPCODE_F16_TO_S32: + case BI_OPCODE_F16_TO_U32: + case BI_OPCODE_MKVEC_V2I16: + case BI_OPCODE_S16_TO_F32: + case BI_OPCODE_S16_TO_S32: + case BI_OPCODE_U16_TO_F32: + case BI_OPCODE_U16_TO_U32: + return (swizzle != BI_SWIZZLE_H00); - case BI_OPCODE_BRANCH_F32: - case BI_OPCODE_LOGB_F32: - case BI_OPCODE_ILOGB_F32: - case BI_OPCODE_FADD_F32: - case BI_OPCODE_FCMP_F32: - case BI_OPCODE_FREXPE_F32: - case BI_OPCODE_FREXPM_F32: - case BI_OPCODE_FROUND_F32: - return (swizzle != BI_SWIZZLE_H01); + case BI_OPCODE_BRANCH_F32: + case BI_OPCODE_LOGB_F32: + case BI_OPCODE_ILOGB_F32: + case BI_OPCODE_FADD_F32: + case BI_OPCODE_FCMP_F32: + case BI_OPCODE_FREXPE_F32: + case BI_OPCODE_FREXPM_F32: + case BI_OPCODE_FROUND_F32: + return (swizzle != BI_SWIZZLE_H01); - case BI_OPCODE_IADD_S32: - case BI_OPCODE_IADD_U32: - case BI_OPCODE_ISUB_S32: - case BI_OPCODE_ISUB_U32: - case BI_OPCODE_IADD_V4S8: - case BI_OPCODE_IADD_V4U8: - case BI_OPCODE_ISUB_V4S8: - case BI_OPCODE_ISUB_V4U8: - return (src == 1) && (swizzle != BI_SWIZZLE_H01); + case BI_OPCODE_IADD_S32: + case BI_OPCODE_IADD_U32: + case BI_OPCODE_ISUB_S32: + case BI_OPCODE_ISUB_U32: + case BI_OPCODE_IADD_V4S8: + case BI_OPCODE_IADD_V4U8: + case BI_OPCODE_ISUB_V4S8: + case BI_OPCODE_ISUB_V4U8: + return (src == 1) && (swizzle != BI_SWIZZLE_H01); - case BI_OPCODE_S8_TO_F32: - case BI_OPCODE_S8_TO_S32: - case BI_OPCODE_U8_TO_F32: - case BI_OPCODE_U8_TO_U32: - return (swizzle != BI_SWIZZLE_B0000); + case BI_OPCODE_S8_TO_F32: + case BI_OPCODE_S8_TO_S32: + case BI_OPCODE_U8_TO_F32: + case BI_OPCODE_U8_TO_U32: + return (swizzle != BI_SWIZZLE_B0000); - case BI_OPCODE_V2S8_TO_V2F16: - case BI_OPCODE_V2S8_TO_V2S16: - case BI_OPCODE_V2U8_TO_V2F16: - case BI_OPCODE_V2U8_TO_V2U16: - return (swizzle != BI_SWIZZLE_B0022); + case BI_OPCODE_V2S8_TO_V2F16: + case BI_OPCODE_V2S8_TO_V2S16: + case BI_OPCODE_V2U8_TO_V2F16: + case BI_OPCODE_V2U8_TO_V2U16: + return (swizzle != BI_SWIZZLE_B0022); - case BI_OPCODE_IADD_V2S16: - case BI_OPCODE_IADD_V2U16: - case BI_OPCODE_ISUB_V2S16: - case BI_OPCODE_ISUB_V2U16: - return (src == 1) && (swizzle >= BI_SWIZZLE_H11); + case BI_OPCODE_IADD_V2S16: + case BI_OPCODE_IADD_V2U16: + case BI_OPCODE_ISUB_V2S16: + case BI_OPCODE_ISUB_V2U16: + return (src == 1) && (swizzle >= BI_SWIZZLE_H11); #if 0 /* Restriction on IADD in 64-bit clauses on G72 */ @@ -718,52 +721,52 @@ bi_impacted_t_modifiers(bi_instr *I, unsigned src) return (src == 1) && (swizzle != BI_SWIZZLE_D0); #endif - default: - return false; - } + default: + return false; + } } bool bi_reads_t(bi_instr *ins, unsigned src) { - /* Branch offset cannot come from passthrough */ - if (bi_opcode_props[ins->op].branch) - return src != 2; + /* Branch offset cannot come from passthrough */ + if (bi_opcode_props[ins->op].branch) + return src != 2; - /* 
Table can never read passthrough */ - if (bi_opcode_props[ins->op].table) - return false; + /* Table can never read passthrough */ + if (bi_opcode_props[ins->op].table) + return false; - /* Staging register reads may happen before the succeeding register - * block encodes a write, so effectively there is no passthrough */ - if (bi_is_staging_src(ins, src)) - return false; + /* Staging register reads may happen before the succeeding register + * block encodes a write, so effectively there is no passthrough */ + if (bi_is_staging_src(ins, src)) + return false; - /* Bifrost cores newer than Mali G71 have restrictions on swizzles on - * same-cycle temporaries. Check the list for these hazards. */ - if (bi_impacted_t_modifiers(ins, src)) - return false; + /* Bifrost cores newer than Mali G71 have restrictions on swizzles on + * same-cycle temporaries. Check the list for these hazards. */ + if (bi_impacted_t_modifiers(ins, src)) + return false; - /* Descriptor must not come from a passthrough */ - switch (ins->op) { - case BI_OPCODE_LD_CVT: - case BI_OPCODE_LD_TILE: - case BI_OPCODE_ST_CVT: - case BI_OPCODE_ST_TILE: - case BI_OPCODE_TEXC: - case BI_OPCODE_TEXC_DUAL: - return src != 2; - case BI_OPCODE_BLEND: - return src != 2 && src != 3; + /* Descriptor must not come from a passthrough */ + switch (ins->op) { + case BI_OPCODE_LD_CVT: + case BI_OPCODE_LD_TILE: + case BI_OPCODE_ST_CVT: + case BI_OPCODE_ST_TILE: + case BI_OPCODE_TEXC: + case BI_OPCODE_TEXC_DUAL: + return src != 2; + case BI_OPCODE_BLEND: + return src != 2 && src != 3; - /* +JUMP can't read the offset from T */ - case BI_OPCODE_JUMP: - return false; + /* +JUMP can't read the offset from T */ + case BI_OPCODE_JUMP: + return false; - /* Else, just check if we can read any temps */ - default: - return bi_reads_temps(ins, src); - } + /* Else, just check if we can read any temps */ + default: + return bi_reads_temps(ins, src); + } } /* Counts the number of 64-bit constants required by a clause. TODO: We @@ -773,12 +776,12 @@ bi_reads_t(bi_instr *ins, unsigned src) static unsigned bi_nconstants(struct bi_clause_state *clause) { - unsigned count_32 = 0; + unsigned count_32 = 0; - for (unsigned i = 0; i < ARRAY_SIZE(clause->consts); ++i) - count_32 += clause->consts[i].constant_count; + for (unsigned i = 0; i < ARRAY_SIZE(clause->consts); ++i) + count_32 += clause->consts[i].constant_count; - return DIV_ROUND_UP(count_32, 2); + return DIV_ROUND_UP(count_32, 2); } /* Would there be space for constants if we added one tuple? */ @@ -786,7 +789,7 @@ bi_nconstants(struct bi_clause_state *clause) static bool bi_space_for_more_constants(struct bi_clause_state *clause) { - return (bi_nconstants(clause) < 13 - (clause->tuple_count + 1)); + return (bi_nconstants(clause) < 13 - (clause->tuple_count + 1)); } /* Updates the FAU assignment for a tuple. 
A valid FAU assignment must be @@ -795,85 +798,83 @@ bi_space_for_more_constants(struct bi_clause_state *clause) * bi_instr_schedulable */ static bool -bi_update_fau(struct bi_clause_state *clause, - struct bi_tuple_state *tuple, - bi_instr *instr, bool fma, bool destructive) +bi_update_fau(struct bi_clause_state *clause, struct bi_tuple_state *tuple, + bi_instr *instr, bool fma, bool destructive) { - /* Maintain our own constants, for nondestructive mode */ - uint32_t copied_constants[2], copied_count; - unsigned *constant_count = &tuple->constant_count; - uint32_t *constants = tuple->constants; - enum bir_fau fau = tuple->fau; + /* Maintain our own constants, for nondestructive mode */ + uint32_t copied_constants[2], copied_count; + unsigned *constant_count = &tuple->constant_count; + uint32_t *constants = tuple->constants; + enum bir_fau fau = tuple->fau; - if (!destructive) { - memcpy(copied_constants, tuple->constants, - (*constant_count) * sizeof(constants[0])); - copied_count = tuple->constant_count; + if (!destructive) { + memcpy(copied_constants, tuple->constants, + (*constant_count) * sizeof(constants[0])); + copied_count = tuple->constant_count; - constant_count = &copied_count; - constants = copied_constants; - } + constant_count = &copied_count; + constants = copied_constants; + } - bi_foreach_src(instr, s) { - bi_index src = instr->src[s]; + bi_foreach_src(instr, s) { + bi_index src = instr->src[s]; - if (src.type == BI_INDEX_FAU) { - bool no_constants = *constant_count == 0; - bool no_other_fau = (fau == src.value) || !fau; - bool mergable = no_constants && no_other_fau; + if (src.type == BI_INDEX_FAU) { + bool no_constants = *constant_count == 0; + bool no_other_fau = (fau == src.value) || !fau; + bool mergable = no_constants && no_other_fau; - if (destructive) { - assert(mergable); - tuple->fau = src.value; - } else if (!mergable) { - return false; - } + if (destructive) { + assert(mergable); + tuple->fau = src.value; + } else if (!mergable) { + return false; + } - fau = src.value; - } else if (src.type == BI_INDEX_CONSTANT) { - /* No need to reserve space if we have a fast 0 */ - if (src.value == 0 && fma && bi_reads_zero(instr)) - continue; + fau = src.value; + } else if (src.type == BI_INDEX_CONSTANT) { + /* No need to reserve space if we have a fast 0 */ + if (src.value == 0 && fma && bi_reads_zero(instr)) + continue; - /* If there is a branch target, #0 by convention is the - * PC-relative offset to the target */ - bool pcrel = instr->branch_target && src.value == 0; - bool found = false; + /* If there is a branch target, #0 by convention is the + * PC-relative offset to the target */ + bool pcrel = instr->branch_target && src.value == 0; + bool found = false; - for (unsigned i = 0; i < *constant_count; ++i) { - found |= (constants[i] == src.value) && - (i != tuple->pcrel_idx); - } + for (unsigned i = 0; i < *constant_count; ++i) { + found |= (constants[i] == src.value) && (i != tuple->pcrel_idx); + } - /* pcrel constants are unique, so don't match */ - if (found && !pcrel) - continue; + /* pcrel constants are unique, so don't match */ + if (found && !pcrel) + continue; - bool no_fau = (*constant_count > 0) || !fau; - bool mergable = no_fau && ((*constant_count) < 2); + bool no_fau = (*constant_count > 0) || !fau; + bool mergable = no_fau && ((*constant_count) < 2); - if (destructive) { - assert(mergable); + if (destructive) { + assert(mergable); - if (pcrel) - tuple->pcrel_idx = *constant_count; - } else if (!mergable) - return false; + if (pcrel) + tuple->pcrel_idx 
= *constant_count; + } else if (!mergable) + return false; - constants[(*constant_count)++] = src.value; - } - } + constants[(*constant_count)++] = src.value; + } + } - /* Constants per clause may be limited by tuple count */ - bool room_for_constants = (*constant_count == 0) || - bi_space_for_more_constants(clause); + /* Constants per clause may be limited by tuple count */ + bool room_for_constants = + (*constant_count == 0) || bi_space_for_more_constants(clause); - if (destructive) - assert(room_for_constants); - else if (!room_for_constants) - return false; + if (destructive) + assert(room_for_constants); + else if (!room_for_constants) + return false; - return true; + return true; } /* Given an in-progress tuple, a candidate new instruction to add to the tuple, @@ -886,28 +887,28 @@ bi_update_fau(struct bi_clause_state *clause, static bool bi_tuple_is_new_src(bi_instr *instr, struct bi_reg_state *reg, unsigned src_idx) { - assert(src_idx < instr->nr_srcs); - bi_index src = instr->src[src_idx]; + assert(src_idx < instr->nr_srcs); + bi_index src = instr->src[src_idx]; - /* Only consider sources which come from the register file */ - if (!(src.type == BI_INDEX_NORMAL || src.type == BI_INDEX_REGISTER)) - return false; + /* Only consider sources which come from the register file */ + if (!(src.type == BI_INDEX_NORMAL || src.type == BI_INDEX_REGISTER)) + return false; - /* Staging register reads bypass the usual register file mechanism */ - if (bi_is_staging_src(instr, src_idx)) - return false; + /* Staging register reads bypass the usual register file mechanism */ + if (bi_is_staging_src(instr, src_idx)) + return false; - /* If a source is already read in the tuple, it is already counted */ - for (unsigned t = 0; t < reg->nr_reads; ++t) - if (bi_is_word_equiv(src, reg->reads[t])) - return false; + /* If a source is already read in the tuple, it is already counted */ + for (unsigned t = 0; t < reg->nr_reads; ++t) + if (bi_is_word_equiv(src, reg->reads[t])) + return false; - /* If a source is read in _this instruction_, it is already counted */ - for (unsigned t = 0; t < src_idx; ++t) - if (bi_is_word_equiv(src, instr->src[t])) - return false; + /* If a source is read in _this instruction_, it is already counted */ + for (unsigned t = 0; t < src_idx; ++t) + if (bi_is_word_equiv(src, instr->src[t])) + return false; - return true; + return true; } /* Given two tuples in source order, count the number of register reads of the @@ -916,31 +917,31 @@ bi_tuple_is_new_src(bi_instr *instr, struct bi_reg_state *reg, unsigned src_idx) */ static unsigned -bi_count_succ_reads(bi_index t0, bi_index t1, - bi_index *succ_reads, unsigned nr_succ_reads) +bi_count_succ_reads(bi_index t0, bi_index t1, bi_index *succ_reads, + unsigned nr_succ_reads) { - unsigned reads = 0; + unsigned reads = 0; - for (unsigned i = 0; i < nr_succ_reads; ++i) { - bool unique = true; + for (unsigned i = 0; i < nr_succ_reads; ++i) { + bool unique = true; - for (unsigned j = 0; j < i; ++j) - if (bi_is_word_equiv(succ_reads[i], succ_reads[j])) - unique = false; + for (unsigned j = 0; j < i; ++j) + if (bi_is_word_equiv(succ_reads[i], succ_reads[j])) + unique = false; - if (!unique) - continue; + if (!unique) + continue; - if (bi_is_word_equiv(succ_reads[i], t0)) - continue; + if (bi_is_word_equiv(succ_reads[i], t0)) + continue; - if (bi_is_word_equiv(succ_reads[i], t1)) - continue; + if (bi_is_word_equiv(succ_reads[i], t1)) + continue; - reads++; - } + reads++; + } - return reads; + return reads; } /* Not all instructions can read from 
the staging passthrough (as determined by @@ -951,23 +952,23 @@ bi_count_succ_reads(bi_index t0, bi_index t1, static bool bi_has_staging_passthrough_hazard(bi_index fma, bi_instr *add) { - bi_foreach_src(add, s) { - bi_index src = add->src[s]; + bi_foreach_src(add, s) { + bi_index src = add->src[s]; - if (src.type != BI_INDEX_REGISTER) - continue; + if (src.type != BI_INDEX_REGISTER) + continue; - unsigned count = bi_count_read_registers(add, s); - bool read = false; + unsigned count = bi_count_read_registers(add, s); + bool read = false; - for (unsigned d = 0; d < count; ++d) - read |= bi_is_equiv(fma, bi_register(src.value + d)); + for (unsigned d = 0; d < count; ++d) + read |= bi_is_equiv(fma, bi_register(src.value + d)); - if (read && !bi_reads_t(add, s)) - return true; - } + if (read && !bi_reads_t(add, s)) + return true; + } - return false; + return false; } /* Likewise for cross-tuple passthrough (reads_temps) */ @@ -975,18 +976,18 @@ bi_has_staging_passthrough_hazard(bi_index fma, bi_instr *add) static bool bi_has_cross_passthrough_hazard(bi_tuple *succ, bi_instr *ins) { - if (ins->nr_dests == 0) - return false; + if (ins->nr_dests == 0) + return false; - bi_foreach_instr_in_tuple(succ, pins) { - bi_foreach_src(pins, s) { - if (bi_is_word_equiv(ins->dest[0], pins->src[s]) && - !bi_reads_temps(pins, s)) - return true; - } - } + bi_foreach_instr_in_tuple(succ, pins) { + bi_foreach_src(pins, s) { + if (bi_is_word_equiv(ins->dest[0], pins->src[s]) && + !bi_reads_temps(pins, s)) + return true; + } + } - return false; + return false; } /* Is a register written other than the staging mechanism? ATEST is special, @@ -998,21 +999,21 @@ bi_has_cross_passthrough_hazard(bi_tuple *succ, bi_instr *ins) static unsigned bi_write_count(bi_instr *instr, uint64_t live_after_temp) { - if (instr->op == BI_OPCODE_ATEST || instr->op == BI_OPCODE_BLEND) - return 1; + if (instr->op == BI_OPCODE_ATEST || instr->op == BI_OPCODE_BLEND) + return 1; - unsigned count = 0; + unsigned count = 0; - bi_foreach_dest(instr, d) { - if (d == 0 && bi_opcode_props[instr->op].sr_write) - continue; + bi_foreach_dest(instr, d) { + if (d == 0 && bi_opcode_props[instr->op].sr_write) + continue; - assert(instr->dest[0].type == BI_INDEX_REGISTER); - if (live_after_temp & BITFIELD64_BIT(instr->dest[0].value)) - count++; - } + assert(instr->dest[0].type == BI_INDEX_REGISTER); + if (live_after_temp & BITFIELD64_BIT(instr->dest[0].value)) + count++; + } - return count; + return count; } /* @@ -1022,8 +1023,9 @@ bi_write_count(bi_instr *instr, uint64_t live_after_temp) static bool bi_needs_ftz(bi_instr *I) { - return (I->op == BI_OPCODE_F16_TO_F32 || - I->op == BI_OPCODE_V2F32_TO_V2F16) && I->ftz; + return (I->op == BI_OPCODE_F16_TO_F32 || + I->op == BI_OPCODE_V2F32_TO_V2F16) && + I->ftz; } /* @@ -1033,8 +1035,8 @@ bi_needs_ftz(bi_instr *I) static bool bi_numerically_incompatible(struct bi_clause_state *clause, bi_instr *instr) { - return (clause->ftz != BI_FTZ_STATE_NONE) && - ((clause->ftz == BI_FTZ_STATE_ENABLE) != bi_needs_ftz(instr)); + return (clause->ftz != BI_FTZ_STATE_NONE) && + ((clause->ftz == BI_FTZ_STATE_ENABLE) != bi_needs_ftz(instr)); } /* Instruction placement entails two questions: what subset of instructions in @@ -1045,209 +1047,208 @@ bi_numerically_incompatible(struct bi_clause_state *clause, bi_instr *instr) * whitepaper. The cost function is a heuristic. 
*/ static bool -bi_instr_schedulable(bi_instr *instr, - struct bi_clause_state *clause, - struct bi_tuple_state *tuple, - uint64_t live_after_temp, - bool fma) +bi_instr_schedulable(bi_instr *instr, struct bi_clause_state *clause, + struct bi_tuple_state *tuple, uint64_t live_after_temp, + bool fma) { - /* The units must match */ - if ((fma && !bi_can_fma(instr)) || (!fma && !bi_can_add(instr))) - return false; + /* The units must match */ + if ((fma && !bi_can_fma(instr)) || (!fma && !bi_can_add(instr))) + return false; - /* There can only be one message-passing instruction per clause */ - if (bi_must_message(instr) && clause->message) - return false; + /* There can only be one message-passing instruction per clause */ + if (bi_must_message(instr) && clause->message) + return false; - /* Some instructions have placement requirements */ - if (bi_opcode_props[instr->op].last && !tuple->last) - return false; + /* Some instructions have placement requirements */ + if (bi_opcode_props[instr->op].last && !tuple->last) + return false; - if (bi_must_not_last(instr) && tuple->last) - return false; + if (bi_must_not_last(instr) && tuple->last) + return false; - /* Numerical properties must be compatible with the clause */ - if (bi_numerically_incompatible(clause, instr)) - return false; + /* Numerical properties must be compatible with the clause */ + if (bi_numerically_incompatible(clause, instr)) + return false; - /* Message-passing instructions are not guaranteed write within the - * same clause (most likely they will not), so if a later instruction - * in the clause accesses the destination, the message-passing - * instruction can't be scheduled */ - if (bi_opcode_props[instr->op].sr_write) { - bi_foreach_dest(instr, d) { - unsigned nr = bi_count_write_registers(instr, d); - assert(instr->dest[d].type == BI_INDEX_REGISTER); - unsigned reg = instr->dest[d].value; + /* Message-passing instructions are not guaranteed write within the + * same clause (most likely they will not), so if a later instruction + * in the clause accesses the destination, the message-passing + * instruction can't be scheduled */ + if (bi_opcode_props[instr->op].sr_write) { + bi_foreach_dest(instr, d) { + unsigned nr = bi_count_write_registers(instr, d); + assert(instr->dest[d].type == BI_INDEX_REGISTER); + unsigned reg = instr->dest[d].value; - for (unsigned i = 0; i < clause->access_count; ++i) { - bi_index idx = clause->accesses[i]; - for (unsigned d = 0; d < nr; ++d) { - if (bi_is_equiv(bi_register(reg + d), idx)) - return false; - } - } - } - } + for (unsigned i = 0; i < clause->access_count; ++i) { + bi_index idx = clause->accesses[i]; + for (unsigned d = 0; d < nr; ++d) { + if (bi_is_equiv(bi_register(reg + d), idx)) + return false; + } + } + } + } - if (bi_opcode_props[instr->op].sr_read && !bi_is_null(instr->src[0])) { - unsigned nr = bi_count_read_registers(instr, 0); - assert(instr->src[0].type == BI_INDEX_REGISTER); - unsigned reg = instr->src[0].value; + if (bi_opcode_props[instr->op].sr_read && !bi_is_null(instr->src[0])) { + unsigned nr = bi_count_read_registers(instr, 0); + assert(instr->src[0].type == BI_INDEX_REGISTER); + unsigned reg = instr->src[0].value; - for (unsigned i = 0; i < clause->access_count; ++i) { - bi_index idx = clause->accesses[i]; - for (unsigned d = 0; d < nr; ++d) { - if (bi_is_equiv(bi_register(reg + d), idx)) - return false; - } - } - } + for (unsigned i = 0; i < clause->access_count; ++i) { + bi_index idx = clause->accesses[i]; + for (unsigned d = 0; d < nr; ++d) { + if 
(bi_is_equiv(bi_register(reg + d), idx)) + return false; + } + } + } - /* If FAU is already assigned, we may not disrupt that. Do a - * non-disruptive test update */ - if (!bi_update_fau(clause, tuple, instr, fma, false)) - return false; + /* If FAU is already assigned, we may not disrupt that. Do a + * non-disruptive test update */ + if (!bi_update_fau(clause, tuple, instr, fma, false)) + return false; - /* If this choice of FMA would force a staging passthrough, the ADD - * instruction must support such a passthrough */ - if (tuple->add && instr->nr_dests && bi_has_staging_passthrough_hazard(instr->dest[0], tuple->add)) - return false; + /* If this choice of FMA would force a staging passthrough, the ADD + * instruction must support such a passthrough */ + if (tuple->add && instr->nr_dests && + bi_has_staging_passthrough_hazard(instr->dest[0], tuple->add)) + return false; - /* If this choice of destination would force a cross-tuple passthrough, the next tuple must support that */ - if (tuple->prev && bi_has_cross_passthrough_hazard(tuple->prev, instr)) - return false; + /* If this choice of destination would force a cross-tuple passthrough, the + * next tuple must support that */ + if (tuple->prev && bi_has_cross_passthrough_hazard(tuple->prev, instr)) + return false; - /* Register file writes are limited */ - unsigned total_writes = tuple->reg.nr_writes; - total_writes += bi_write_count(instr, live_after_temp); + /* Register file writes are limited */ + unsigned total_writes = tuple->reg.nr_writes; + total_writes += bi_write_count(instr, live_after_temp); - /* Last tuple in a clause can only write a single value */ - if (tuple->last && total_writes > 1) - return false; + /* Last tuple in a clause can only write a single value */ + if (tuple->last && total_writes > 1) + return false; - /* Register file reads are limited, so count unique */ + /* Register file reads are limited, so count unique */ - unsigned unique_new_srcs = 0; + unsigned unique_new_srcs = 0; - bi_foreach_src(instr, s) { - if (bi_tuple_is_new_src(instr, &tuple->reg, s)) - unique_new_srcs++; - } + bi_foreach_src(instr, s) { + if (bi_tuple_is_new_src(instr, &tuple->reg, s)) + unique_new_srcs++; + } - unsigned total_srcs = tuple->reg.nr_reads + unique_new_srcs; + unsigned total_srcs = tuple->reg.nr_reads + unique_new_srcs; - bool can_spill_to_moves = (!tuple->add); - can_spill_to_moves &= (bi_nconstants(clause) < 13 - (clause->tuple_count + 2)); - can_spill_to_moves &= (clause->tuple_count < 7); + bool can_spill_to_moves = (!tuple->add); + can_spill_to_moves &= + (bi_nconstants(clause) < 13 - (clause->tuple_count + 2)); + can_spill_to_moves &= (clause->tuple_count < 7); - /* However, we can get an extra 1 or 2 sources by inserting moves */ - if (total_srcs > (can_spill_to_moves ? 4 : 3)) - return false; + /* However, we can get an extra 1 or 2 sources by inserting moves */ + if (total_srcs > (can_spill_to_moves ? 4 : 3)) + return false; - /* Count effective reads for the successor */ - unsigned succ_reads = 0; + /* Count effective reads for the successor */ + unsigned succ_reads = 0; - if (instr->nr_dests) { - bool has_t1 = tuple->add && tuple->add->nr_dests; - succ_reads = bi_count_succ_reads(instr->dest[0], - has_t1 ? tuple->add->dest[0] : bi_null(), - tuple->prev_reads, - tuple->nr_prev_reads); - } + if (instr->nr_dests) { + bool has_t1 = tuple->add && tuple->add->nr_dests; + succ_reads = bi_count_succ_reads(instr->dest[0], + has_t1 ? 
tuple->add->dest[0] : bi_null(), + tuple->prev_reads, tuple->nr_prev_reads); + } - /* Successor must satisfy R+W <= 4, so we require W <= 4-R */ - if ((signed) total_writes > (4 - (signed) succ_reads)) - return false; + /* Successor must satisfy R+W <= 4, so we require W <= 4-R */ + if ((signed)total_writes > (4 - (signed)succ_reads)) + return false; - return true; + return true; } static signed bi_instr_cost(bi_instr *instr, struct bi_tuple_state *tuple) { - signed cost = 0; + signed cost = 0; - /* Instructions that can schedule to either FMA or to ADD should be - * deprioritized since they're easier to reschedule elsewhere */ - if (bi_can_fma(instr) && bi_can_add(instr)) - cost++; + /* Instructions that can schedule to either FMA or to ADD should be + * deprioritized since they're easier to reschedule elsewhere */ + if (bi_can_fma(instr) && bi_can_add(instr)) + cost++; - /* Message-passing instructions impose constraints on the registers - * later in the clause, so schedule them as late within a clause as - * possible (<==> prioritize them since we're backwards <==> decrease - * cost) */ - if (bi_must_message(instr)) - cost--; + /* Message-passing instructions impose constraints on the registers + * later in the clause, so schedule them as late within a clause as + * possible (<==> prioritize them since we're backwards <==> decrease + * cost) */ + if (bi_must_message(instr)) + cost--; - /* Last instructions are big constraints (XXX: no effect on shader-db) */ - if (bi_opcode_props[instr->op].last) - cost -= 2; + /* Last instructions are big constraints (XXX: no effect on shader-db) */ + if (bi_opcode_props[instr->op].last) + cost -= 2; - return cost; + return cost; } static unsigned -bi_choose_index(struct bi_worklist st, - struct bi_clause_state *clause, - struct bi_tuple_state *tuple, - uint64_t live_after_temp, +bi_choose_index(struct bi_worklist st, struct bi_clause_state *clause, + struct bi_tuple_state *tuple, uint64_t live_after_temp, bool fma) { - unsigned i, best_idx = ~0; - signed best_cost = INT_MAX; + unsigned i, best_idx = ~0; + signed best_cost = INT_MAX; - BITSET_FOREACH_SET(i, st.worklist, st.count) { - bi_instr *instr = st.instructions[i]; + BITSET_FOREACH_SET(i, st.worklist, st.count) { + bi_instr *instr = st.instructions[i]; - if (!bi_instr_schedulable(instr, clause, tuple, live_after_temp, fma)) - continue; + if (!bi_instr_schedulable(instr, clause, tuple, live_after_temp, fma)) + continue; - signed cost = bi_instr_cost(instr, tuple); + signed cost = bi_instr_cost(instr, tuple); - /* Tie break in favour of later instructions, under the - * assumption this promotes temporary usage (reducing pressure - * on the register file). This is a side effect of a prepass - * scheduling for pressure. */ + /* Tie break in favour of later instructions, under the + * assumption this promotes temporary usage (reducing pressure + * on the register file). This is a side effect of a prepass + * scheduling for pressure. 
*/ - if (cost <= best_cost) { - best_idx = i; - best_cost = cost; - } - } + if (cost <= best_cost) { + best_idx = i; + best_cost = cost; + } + } - return best_idx; + return best_idx; } static void bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple, - bi_instr *instr, uint64_t live_after_temp, bool fma) + bi_instr *instr, uint64_t live_after_temp, bool fma) { - bi_update_fau(clause, tuple, instr, fma, true); + bi_update_fau(clause, tuple, instr, fma, true); - assert(clause->access_count + instr->nr_srcs + instr->nr_dests <= ARRAY_SIZE(clause->accesses)); + assert(clause->access_count + instr->nr_srcs + instr->nr_dests <= + ARRAY_SIZE(clause->accesses)); - memcpy(clause->accesses + clause->access_count, - instr->src, sizeof(instr->src[0]) * instr->nr_srcs); - clause->access_count += instr->nr_srcs; + memcpy(clause->accesses + clause->access_count, instr->src, + sizeof(instr->src[0]) * instr->nr_srcs); + clause->access_count += instr->nr_srcs; - memcpy(clause->accesses + clause->access_count, - instr->dest, sizeof(instr->dest[0]) * instr->nr_dests); - clause->access_count += instr->nr_dests; + memcpy(clause->accesses + clause->access_count, instr->dest, + sizeof(instr->dest[0]) * instr->nr_dests); + clause->access_count += instr->nr_dests; - tuple->reg.nr_writes += bi_write_count(instr, live_after_temp); + tuple->reg.nr_writes += bi_write_count(instr, live_after_temp); - bi_foreach_src(instr, s) { - if (bi_tuple_is_new_src(instr, &tuple->reg, s)) - tuple->reg.reads[tuple->reg.nr_reads++] = instr->src[s]; - } + bi_foreach_src(instr, s) { + if (bi_tuple_is_new_src(instr, &tuple->reg, s)) + tuple->reg.reads[tuple->reg.nr_reads++] = instr->src[s]; + } - /* This could be optimized to allow pairing integer instructions with - * special flush-to-zero instructions, but punting on this until we have - * a workload that cares. - */ - clause->ftz = bi_needs_ftz(instr) ? BI_FTZ_STATE_ENABLE : - BI_FTZ_STATE_DISABLE; + /* This could be optimized to allow pairing integer instructions with + * special flush-to-zero instructions, but punting on this until we have + * a workload that cares. + */ + clause->ftz = + bi_needs_ftz(instr) ? BI_FTZ_STATE_ENABLE : BI_FTZ_STATE_DISABLE; } /* Choose the best instruction and pop it off the worklist. 
Returns NULL if no @@ -1255,74 +1256,71 @@ bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple, static bi_instr * bi_take_instr(bi_context *ctx, struct bi_worklist st, - struct bi_clause_state *clause, - struct bi_tuple_state *tuple, - uint64_t live_after_temp, - bool fma) + struct bi_clause_state *clause, struct bi_tuple_state *tuple, + uint64_t live_after_temp, bool fma) { - if (tuple->add && tuple->add->op == BI_OPCODE_CUBEFACE) - return bi_lower_cubeface(ctx, clause, tuple); - else if (tuple->add && tuple->add->op == BI_OPCODE_ATOM_RETURN_I32) - return bi_lower_atom_c(ctx, clause, tuple); - else if (tuple->add && tuple->add->op == BI_OPCODE_ATOM1_RETURN_I32) - return bi_lower_atom_c1(ctx, clause, tuple); - else if (tuple->add && tuple->add->op == BI_OPCODE_SEG_ADD_I64) - return bi_lower_seg_add(ctx, clause, tuple); - else if (tuple->add && tuple->add->table) - return bi_lower_dtsel(ctx, clause, tuple); + if (tuple->add && tuple->add->op == BI_OPCODE_CUBEFACE) + return bi_lower_cubeface(ctx, clause, tuple); + else if (tuple->add && tuple->add->op == BI_OPCODE_ATOM_RETURN_I32) + return bi_lower_atom_c(ctx, clause, tuple); + else if (tuple->add && tuple->add->op == BI_OPCODE_ATOM1_RETURN_I32) + return bi_lower_atom_c1(ctx, clause, tuple); + else if (tuple->add && tuple->add->op == BI_OPCODE_SEG_ADD_I64) + return bi_lower_seg_add(ctx, clause, tuple); + else if (tuple->add && tuple->add->table) + return bi_lower_dtsel(ctx, clause, tuple); - /* TODO: Optimize these moves */ - if (!fma && tuple->nr_prev_reads > 3) { - /* Only spill by one source for now */ - assert(tuple->nr_prev_reads == 4); + /* TODO: Optimize these moves */ + if (!fma && tuple->nr_prev_reads > 3) { + /* Only spill by one source for now */ + assert(tuple->nr_prev_reads == 4); - /* Pick a source to spill */ - bi_index src = tuple->prev_reads[0]; + /* Pick a source to spill */ + bi_index src = tuple->prev_reads[0]; - /* Schedule the spill */ - bi_builder b = bi_init_builder(ctx, bi_before_tuple(tuple->prev)); - bi_instr *mov = bi_mov_i32_to(&b, src, src); - bi_pop_instr(clause, tuple, mov, live_after_temp, fma); - return mov; - } + /* Schedule the spill */ + bi_builder b = bi_init_builder(ctx, bi_before_tuple(tuple->prev)); + bi_instr *mov = bi_mov_i32_to(&b, src, src); + bi_pop_instr(clause, tuple, mov, live_after_temp, fma); + return mov; + } #ifndef NDEBUG - /* Don't pair instructions if debugging */ - if ((bifrost_debug & BIFROST_DBG_NOSCHED) && tuple->add) - return NULL; + /* Don't pair instructions if debugging */ + if ((bifrost_debug & BIFROST_DBG_NOSCHED) && tuple->add) + return NULL; #endif - unsigned idx = bi_choose_index(st, clause, tuple, live_after_temp, fma); + unsigned idx = bi_choose_index(st, clause, tuple, live_after_temp, fma); - if (idx >= st.count) - return NULL; + if (idx >= st.count) + return NULL; - /* Update state to reflect taking the instruction */ - bi_instr *instr = st.instructions[idx]; + /* Update state to reflect taking the instruction */ + bi_instr *instr = st.instructions[idx]; - BITSET_CLEAR(st.worklist, idx); - bi_update_worklist(st, idx); - bi_pop_instr(clause, tuple, instr, live_after_temp, fma); + BITSET_CLEAR(st.worklist, idx); + bi_update_worklist(st, idx); + bi_pop_instr(clause, tuple, instr, live_after_temp, fma); - /* Fixups */ - bi_builder b = bi_init_builder(ctx, bi_before_instr(instr)); + /* Fixups */ + bi_builder b = bi_init_builder(ctx, bi_before_instr(instr)); - if (instr->op == BI_OPCODE_IADD_U32 && fma) { - assert(bi_can_iaddc(instr)); - bi_instr *iaddc 
= - bi_iaddc_i32_to(&b, instr->dest[0], instr->src[0], + if (instr->op == BI_OPCODE_IADD_U32 && fma) { + assert(bi_can_iaddc(instr)); + bi_instr *iaddc = bi_iaddc_i32_to(&b, instr->dest[0], instr->src[0], instr->src[1], bi_zero()); - bi_remove_instruction(instr); - instr = iaddc; - } else if (fma && bi_can_replace_with_csel(instr)) { - bi_instr *csel = bi_csel_from_mux(&b, instr, false); + bi_remove_instruction(instr); + instr = iaddc; + } else if (fma && bi_can_replace_with_csel(instr)) { + bi_instr *csel = bi_csel_from_mux(&b, instr, false); - bi_remove_instruction(instr); - instr = csel; - } + bi_remove_instruction(instr); + instr = csel; + } - return instr; + return instr; } /* Variant of bi_rewrite_index_src_single that uses word-equivalence, rewriting @@ -1331,26 +1329,25 @@ bi_take_instr(bi_context *ctx, struct bi_worklist st, * passthrough (which is impossible) */ static void -bi_use_passthrough(bi_instr *ins, bi_index old, - enum bifrost_packed_src new, - bool except_sr) +bi_use_passthrough(bi_instr *ins, bi_index old, enum bifrost_packed_src new, + bool except_sr) { - /* Optional for convenience */ - if (!ins) - return; + /* Optional for convenience */ + if (!ins) + return; - assert(!bi_is_null(old)); + assert(!bi_is_null(old)); - bi_foreach_src(ins, i) { - if ((i == 0 || i == 4) && except_sr) - continue; + bi_foreach_src(ins, i) { + if ((i == 0 || i == 4) && except_sr) + continue; - if (bi_is_word_equiv(ins->src[i], old)) { - ins->src[i].type = BI_INDEX_PASS; - ins->src[i].value = new; - ins->src[i].offset = 0; - } - } + if (bi_is_word_equiv(ins->src[i], old)) { + ins->src[i].type = BI_INDEX_PASS; + ins->src[i].value = new; + ins->src[i].offset = 0; + } + } } /* Rewrites an adjacent pair of tuples _prec_eding and _succ_eding to use @@ -1364,43 +1361,48 @@ bi_use_passthrough(bi_instr *ins, bi_index old, static void bi_rewrite_passthrough(bi_tuple prec, bi_tuple succ) { - bool sr_read = succ.add ? bi_opcode_props[succ.add->op].sr_read : false; + bool sr_read = succ.add ? bi_opcode_props[succ.add->op].sr_read : false; - if (prec.add && prec.add->nr_dests) { - bi_use_passthrough(succ.fma, prec.add->dest[0], BIFROST_SRC_PASS_ADD, false); - bi_use_passthrough(succ.add, prec.add->dest[0], BIFROST_SRC_PASS_ADD, sr_read); - } + if (prec.add && prec.add->nr_dests) { + bi_use_passthrough(succ.fma, prec.add->dest[0], BIFROST_SRC_PASS_ADD, + false); + bi_use_passthrough(succ.add, prec.add->dest[0], BIFROST_SRC_PASS_ADD, + sr_read); + } - if (prec.fma && prec.fma->nr_dests) { - bi_use_passthrough(succ.fma, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, false); - bi_use_passthrough(succ.add, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, sr_read); - } + if (prec.fma && prec.fma->nr_dests) { + bi_use_passthrough(succ.fma, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, + false); + bi_use_passthrough(succ.add, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, + sr_read); + } } static void bi_rewrite_fau_to_pass(bi_tuple *tuple) { - bi_foreach_instr_and_src_in_tuple(tuple, ins, s) { - if (ins->src[s].type != BI_INDEX_FAU) continue; + bi_foreach_instr_and_src_in_tuple(tuple, ins, s) { + if (ins->src[s].type != BI_INDEX_FAU) + continue; - bi_index pass = bi_passthrough(ins->src[s].offset ? - BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO); + bi_index pass = bi_passthrough(ins->src[s].offset ? BIFROST_SRC_FAU_HI + : BIFROST_SRC_FAU_LO); - bi_replace_src(ins, s, pass); - } + bi_replace_src(ins, s, pass); + } } static void bi_rewrite_zero(bi_instr *ins, bool fma) { - bi_index zero = bi_passthrough(fma ? 
BIFROST_SRC_STAGE : BIFROST_SRC_FAU_LO); + bi_index zero = bi_passthrough(fma ? BIFROST_SRC_STAGE : BIFROST_SRC_FAU_LO); - bi_foreach_src(ins, s) { - bi_index src = ins->src[s]; + bi_foreach_src(ins, s) { + bi_index src = ins->src[s]; - if (src.type == BI_INDEX_CONSTANT && src.value == 0) - bi_replace_src(ins, s, zero); - } + if (src.type == BI_INDEX_CONSTANT && src.value == 0) + bi_replace_src(ins, s, zero); + } } /* Assumes #0 to {T, FAU} rewrite has already occurred */ @@ -1408,31 +1410,32 @@ bi_rewrite_zero(bi_instr *ins, bool fma) static void bi_rewrite_constants_to_pass(bi_tuple *tuple, uint64_t constant, bool pcrel) { - bi_foreach_instr_and_src_in_tuple(tuple, ins, s) { - if (ins->src[s].type != BI_INDEX_CONSTANT) continue; + bi_foreach_instr_and_src_in_tuple(tuple, ins, s) { + if (ins->src[s].type != BI_INDEX_CONSTANT) + continue; - uint32_t cons = ins->src[s].value; + uint32_t cons = ins->src[s].value; - ASSERTED bool lo = (cons == (constant & 0xffffffff)); - bool hi = (cons == (constant >> 32ull)); + ASSERTED bool lo = (cons == (constant & 0xffffffff)); + bool hi = (cons == (constant >> 32ull)); - /* PC offsets always live in the upper half, set to zero by - * convention before pack time. (This is safe, since if you - * wanted to compare against zero, you would use a BRANCHZ - * instruction instead.) */ - if (cons == 0 && ins->branch_target != NULL) { - assert(pcrel); - hi = true; - lo = false; - } else if (pcrel) { - hi = false; - } + /* PC offsets always live in the upper half, set to zero by + * convention before pack time. (This is safe, since if you + * wanted to compare against zero, you would use a BRANCHZ + * instruction instead.) */ + if (cons == 0 && ins->branch_target != NULL) { + assert(pcrel); + hi = true; + lo = false; + } else if (pcrel) { + hi = false; + } - assert(lo || hi); + assert(lo || hi); - bi_replace_src(ins, s, - bi_passthrough(hi ? BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO)); - } + bi_replace_src( + ins, s, bi_passthrough(hi ? BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO)); + } } /* Constructs a constant state given a tuple state. 
This has the @@ -1443,25 +1446,25 @@ bi_rewrite_constants_to_pass(bi_tuple *tuple, uint64_t constant, bool pcrel) static struct bi_const_state bi_get_const_state(struct bi_tuple_state *tuple) { - struct bi_const_state consts = { - .constant_count = tuple->constant_count, - .constants[0] = tuple->constants[0], - .constants[1] = tuple->constants[1], - .pcrel = tuple->add && tuple->add->branch_target, - }; + struct bi_const_state consts = { + .constant_count = tuple->constant_count, + .constants[0] = tuple->constants[0], + .constants[1] = tuple->constants[1], + .pcrel = tuple->add && tuple->add->branch_target, + }; - /* pcrel applies to the first constant by convention, and - * PC-relative constants will be #0 by convention here, so swap - * to match if needed */ - if (consts.pcrel && consts.constants[0]) { - assert(consts.constant_count == 2); - assert(consts.constants[1] == 0); + /* pcrel applies to the first constant by convention, and + * PC-relative constants will be #0 by convention here, so swap + * to match if needed */ + if (consts.pcrel && consts.constants[0]) { + assert(consts.constant_count == 2); + assert(consts.constants[1] == 0); - consts.constants[1] = consts.constants[0]; - consts.constants[0] = 0; - } + consts.constants[1] = consts.constants[0]; + consts.constants[0] = 0; + } - return consts; + return consts; } /* Merges constants in a clause, satisfying the following rules, assuming no @@ -1483,119 +1486,122 @@ bi_get_const_state(struct bi_tuple_state *tuple) static uint64_t bi_merge_u32(uint32_t c0, uint32_t c1, bool pcrel) { - /* At this point in the constant merge algorithm, pcrel constants are - * treated as zero, so pcrel implies at least one constants is zero */ - assert(!pcrel || (c0 == 0 || c1 == 0)); + /* At this point in the constant merge algorithm, pcrel constants are + * treated as zero, so pcrel implies at least one constants is zero */ + assert(!pcrel || (c0 == 0 || c1 == 0)); - /* Order: pcrel, maximum non-pcrel, minimum non-pcrel */ - uint32_t hi = pcrel ? 0 : MAX2(c0, c1); - uint32_t lo = (c0 == hi) ? c1 : c0; + /* Order: pcrel, maximum non-pcrel, minimum non-pcrel */ + uint32_t hi = pcrel ? 0 : MAX2(c0, c1); + uint32_t lo = (c0 == hi) ? 
c1 : c0; - /* Merge in the selected order */ - return lo | (((uint64_t) hi) << 32ull); + /* Merge in the selected order */ + return lo | (((uint64_t)hi) << 32ull); } static unsigned bi_merge_pairs(struct bi_const_state *consts, unsigned tuple_count, - uint64_t *merged, unsigned *pcrel_pair) + uint64_t *merged, unsigned *pcrel_pair) { - unsigned merge_count = 0; + unsigned merge_count = 0; - for (unsigned t = 0; t < tuple_count; ++t) { - if (consts[t].constant_count != 2) continue; + for (unsigned t = 0; t < tuple_count; ++t) { + if (consts[t].constant_count != 2) + continue; - unsigned idx = ~0; - uint64_t val = bi_merge_u32(consts[t].constants[0], - consts[t].constants[1], consts[t].pcrel); + unsigned idx = ~0; + uint64_t val = bi_merge_u32(consts[t].constants[0], + consts[t].constants[1], consts[t].pcrel); - /* Skip the pcrel pair if assigned, because if one is assigned, - * this one is not pcrel by uniqueness so it's a mismatch */ - for (unsigned s = 0; s < merge_count; ++s) { - if (merged[s] == val && (*pcrel_pair) != s) { - idx = s; - break; - } - } + /* Skip the pcrel pair if assigned, because if one is assigned, + * this one is not pcrel by uniqueness so it's a mismatch */ + for (unsigned s = 0; s < merge_count; ++s) { + if (merged[s] == val && (*pcrel_pair) != s) { + idx = s; + break; + } + } - if (idx == ~0) { - idx = merge_count++; - merged[idx] = val; + if (idx == ~0) { + idx = merge_count++; + merged[idx] = val; - if (consts[t].pcrel) - (*pcrel_pair) = idx; - } + if (consts[t].pcrel) + (*pcrel_pair) = idx; + } - consts[t].word_idx = idx; - } + consts[t].word_idx = idx; + } - return merge_count; + return merge_count; } static unsigned bi_merge_singles(struct bi_const_state *consts, unsigned tuple_count, - uint64_t *pairs, unsigned pair_count, unsigned *pcrel_pair) + uint64_t *pairs, unsigned pair_count, unsigned *pcrel_pair) { - bool pending = false, pending_pcrel = false; - uint32_t pending_single = 0; + bool pending = false, pending_pcrel = false; + uint32_t pending_single = 0; - for (unsigned t = 0; t < tuple_count; ++t) { - if (consts[t].constant_count != 1) continue; + for (unsigned t = 0; t < tuple_count; ++t) { + if (consts[t].constant_count != 1) + continue; - uint32_t val = consts[t].constants[0]; - unsigned idx = ~0; + uint32_t val = consts[t].constants[0]; + unsigned idx = ~0; - /* Try to match, but don't match pcrel with non-pcrel, even - * though we can merge a pcrel with a non-pcrel single */ - for (unsigned i = 0; i < pair_count; ++i) { - bool lo = ((pairs[i] & 0xffffffff) == val); - bool hi = ((pairs[i] >> 32) == val); - bool match = (lo || hi); - match &= ((*pcrel_pair) != i); - if (match && !consts[t].pcrel) { - idx = i; - break; - } - } + /* Try to match, but don't match pcrel with non-pcrel, even + * though we can merge a pcrel with a non-pcrel single */ + for (unsigned i = 0; i < pair_count; ++i) { + bool lo = ((pairs[i] & 0xffffffff) == val); + bool hi = ((pairs[i] >> 32) == val); + bool match = (lo || hi); + match &= ((*pcrel_pair) != i); + if (match && !consts[t].pcrel) { + idx = i; + break; + } + } - if (idx == ~0) { - idx = pair_count; + if (idx == ~0) { + idx = pair_count; - if (pending && pending_single != val) { - assert(!(pending_pcrel && consts[t].pcrel)); - bool pcrel = pending_pcrel || consts[t].pcrel; + if (pending && pending_single != val) { + assert(!(pending_pcrel && consts[t].pcrel)); + bool pcrel = pending_pcrel || consts[t].pcrel; - if (pcrel) - *pcrel_pair = idx; + if (pcrel) + *pcrel_pair = idx; - pairs[pair_count++] = 
bi_merge_u32(pending_single, val, pcrel); + pairs[pair_count++] = bi_merge_u32(pending_single, val, pcrel); - pending = pending_pcrel = false; - } else { - pending = true; - pending_pcrel = consts[t].pcrel; - pending_single = val; - } - } + pending = pending_pcrel = false; + } else { + pending = true; + pending_pcrel = consts[t].pcrel; + pending_single = val; + } + } - consts[t].word_idx = idx; - } + consts[t].word_idx = idx; + } - /* Shift so it works whether pending_pcrel is set or not */ - if (pending) { - if (pending_pcrel) - *pcrel_pair = pair_count; + /* Shift so it works whether pending_pcrel is set or not */ + if (pending) { + if (pending_pcrel) + *pcrel_pair = pair_count; - pairs[pair_count++] = ((uint64_t) pending_single) << 32ull; - } + pairs[pair_count++] = ((uint64_t)pending_single) << 32ull; + } - return pair_count; + return pair_count; } static unsigned -bi_merge_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned *pcrel_idx) +bi_merge_constants(struct bi_const_state *consts, uint64_t *pairs, + unsigned *pcrel_idx) { - unsigned pair_count = bi_merge_pairs(consts, 8, pairs, pcrel_idx); - return bi_merge_singles(consts, 8, pairs, pair_count, pcrel_idx); + unsigned pair_count = bi_merge_pairs(consts, 8, pairs, pcrel_idx); + return bi_merge_singles(consts, 8, pairs, pair_count, pcrel_idx); } /* Swap two constants at word i and i+1 by swapping their actual positions and @@ -1604,16 +1610,16 @@ bi_merge_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned *pcr static void bi_swap_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned i) { - uint64_t tmp_pair = pairs[i + 0]; - pairs[i + 0] = pairs[i + 1]; - pairs[i + 1] = tmp_pair; + uint64_t tmp_pair = pairs[i + 0]; + pairs[i + 0] = pairs[i + 1]; + pairs[i + 1] = tmp_pair; - for (unsigned t = 0; t < 8; ++t) { - if (consts[t].word_idx == i) - consts[t].word_idx = (i + 1); - else if (consts[t].word_idx == (i + 1)) - consts[t].word_idx = i; - } + for (unsigned t = 0; t < 8; ++t) { + if (consts[t].word_idx == i) + consts[t].word_idx = (i + 1); + else if (consts[t].word_idx == (i + 1)) + consts[t].word_idx = i; + } } /* Given merged constants, one of which might be PC-relative, fix up the M @@ -1621,449 +1627,456 @@ bi_swap_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned i) * and other constants are used as-is (which might require swapping) */ static unsigned -bi_apply_constant_modifiers(struct bi_const_state *consts, - uint64_t *pairs, unsigned *pcrel_idx, - unsigned tuple_count, unsigned constant_count) +bi_apply_constant_modifiers(struct bi_const_state *consts, uint64_t *pairs, + unsigned *pcrel_idx, unsigned tuple_count, + unsigned constant_count) { - unsigned start = bi_ec0_packed(tuple_count) ? 1 : 0; + unsigned start = bi_ec0_packed(tuple_count) ? 
1 : 0; - /* Clauses with these tuple counts lack an M field for the packed EC0, - * so EC0 cannot be PC-relative, which might require swapping (and - * possibly adding an unused constant) to fit */ + /* Clauses with these tuple counts lack an M field for the packed EC0, + * so EC0 cannot be PC-relative, which might require swapping (and + * possibly adding an unused constant) to fit */ - if (*pcrel_idx == 0 && (tuple_count == 5 || tuple_count == 8)) { - constant_count = MAX2(constant_count, 2); - *pcrel_idx = 1; - bi_swap_constants(consts, pairs, 0); - } + if (*pcrel_idx == 0 && (tuple_count == 5 || tuple_count == 8)) { + constant_count = MAX2(constant_count, 2); + *pcrel_idx = 1; + bi_swap_constants(consts, pairs, 0); + } - /* EC0 might be packed free, after that constants are packed in pairs - * (with clause format 12), with M1 values computed from the pair */ + /* EC0 might be packed free, after that constants are packed in pairs + * (with clause format 12), with M1 values computed from the pair */ - for (unsigned i = start; i < constant_count; i += 2) { - bool swap = false; - bool last = (i + 1) == constant_count; + for (unsigned i = start; i < constant_count; i += 2) { + bool swap = false; + bool last = (i + 1) == constant_count; - unsigned A1 = (pairs[i] >> 60); - unsigned B1 = (pairs[i + 1] >> 60); + unsigned A1 = (pairs[i] >> 60); + unsigned B1 = (pairs[i + 1] >> 60); - if (*pcrel_idx == i || *pcrel_idx == (i + 1)) { - /* PC-relative constant must be E0, not E1 */ - swap = (*pcrel_idx == (i + 1)); + if (*pcrel_idx == i || *pcrel_idx == (i + 1)) { + /* PC-relative constant must be E0, not E1 */ + swap = (*pcrel_idx == (i + 1)); - /* Set M1 = 4 by noting (A - B) mod 16 = 4 is - * equivalent to A = (B + 4) mod 16 and that we can - * control A */ - unsigned B = swap ? A1 : B1; - unsigned A = (B + 4) & 0xF; - pairs[*pcrel_idx] |= ((uint64_t) A) << 60; + /* Set M1 = 4 by noting (A - B) mod 16 = 4 is + * equivalent to A = (B + 4) mod 16 and that we can + * control A */ + unsigned B = swap ? A1 : B1; + unsigned A = (B + 4) & 0xF; + pairs[*pcrel_idx] |= ((uint64_t)A) << 60; - /* Swapped if swap set, identity if swap not set */ - *pcrel_idx = i; - } else { - /* Compute M1 value if we don't swap */ - unsigned M1 = (16 + A1 - B1) & 0xF; + /* Swapped if swap set, identity if swap not set */ + *pcrel_idx = i; + } else { + /* Compute M1 value if we don't swap */ + unsigned M1 = (16 + A1 - B1) & 0xF; - /* For M1 = 0 or M1 >= 8, the constants are unchanged, - * we have 0 < (A1 - B1) % 16 < 8, which implies (B1 - - * A1) % 16 >= 8, so swapping will let them be used - * unchanged */ - swap = (M1 != 0) && (M1 < 8); + /* For M1 = 0 or M1 >= 8, the constants are unchanged, + * we have 0 < (A1 - B1) % 16 < 8, which implies (B1 - + * A1) % 16 >= 8, so swapping will let them be used + * unchanged */ + swap = (M1 != 0) && (M1 < 8); - /* However, we can't swap the last constant, so we - * force M1 = 0 instead for this case */ - if (last && swap) { - pairs[i + 1] |= pairs[i] & (0xfull << 60); - swap = false; - } - } + /* However, we can't swap the last constant, so we + * force M1 = 0 instead for this case */ + if (last && swap) { + pairs[i + 1] |= pairs[i] & (0xfull << 60); + swap = false; + } + } - if (swap) { - assert(!last); - bi_swap_constants(consts, pairs, i); - } - } + if (swap) { + assert(!last); + bi_swap_constants(consts, pairs, i); + } + } - return constant_count; + return constant_count; } /* Schedule a single clause. If no instructions remain, return NULL. 
*/ static bi_clause * -bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint64_t *live) +bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, + uint64_t *live) { - struct bi_clause_state clause_state = { 0 }; - bi_clause *clause = rzalloc(ctx, bi_clause); - bi_tuple *tuple = NULL; + struct bi_clause_state clause_state = {0}; + bi_clause *clause = rzalloc(ctx, bi_clause); + bi_tuple *tuple = NULL; - const unsigned max_tuples = ARRAY_SIZE(clause->tuples); + const unsigned max_tuples = ARRAY_SIZE(clause->tuples); - /* TODO: Decide flow control better */ - clause->flow_control = BIFROST_FLOW_NBTB; + /* TODO: Decide flow control better */ + clause->flow_control = BIFROST_FLOW_NBTB; - /* The last clause can only write one instruction, so initialize that */ - struct bi_reg_state reg_state = {}; - bi_index prev_reads[5] = { bi_null() }; - unsigned nr_prev_reads = 0; + /* The last clause can only write one instruction, so initialize that */ + struct bi_reg_state reg_state = {}; + bi_index prev_reads[5] = {bi_null()}; + unsigned nr_prev_reads = 0; - /* We need to track future liveness. The main *live set tracks what is - * live at the current point int he program we are scheduling, but to - * determine temp eligibility, we instead want what will be live after - * the next tuple in the program. If you scheduled forwards, you'd need - * a crystall ball for this. Luckily we schedule backwards, so we just - * delay updates to the live_after_temp by an extra tuple. */ - uint64_t live_after_temp = *live; - uint64_t live_next_tuple = live_after_temp; + /* We need to track future liveness. The main *live set tracks what is + * live at the current point int he program we are scheduling, but to + * determine temp eligibility, we instead want what will be live after + * the next tuple in the program. If you scheduled forwards, you'd need + * a crystall ball for this. Luckily we schedule backwards, so we just + * delay updates to the live_after_temp by an extra tuple. 
*/ + uint64_t live_after_temp = *live; + uint64_t live_next_tuple = live_after_temp; - do { - struct bi_tuple_state tuple_state = { - .last = (clause->tuple_count == 0), - .reg = reg_state, - .nr_prev_reads = nr_prev_reads, - .prev = tuple, - .pcrel_idx = ~0, - }; + do { + struct bi_tuple_state tuple_state = { + .last = (clause->tuple_count == 0), + .reg = reg_state, + .nr_prev_reads = nr_prev_reads, + .prev = tuple, + .pcrel_idx = ~0, + }; - assert(nr_prev_reads < ARRAY_SIZE(prev_reads)); - memcpy(tuple_state.prev_reads, prev_reads, sizeof(prev_reads)); + assert(nr_prev_reads < ARRAY_SIZE(prev_reads)); + memcpy(tuple_state.prev_reads, prev_reads, sizeof(prev_reads)); - unsigned idx = max_tuples - clause->tuple_count - 1; + unsigned idx = max_tuples - clause->tuple_count - 1; - tuple = &clause->tuples[idx]; + tuple = &clause->tuples[idx]; - if (clause->message && bi_opcode_props[clause->message->op].sr_read && !bi_is_null(clause->message->src[0])) { - unsigned nr = bi_count_read_registers(clause->message, 0); - live_after_temp |= (BITFIELD64_MASK(nr) << clause->message->src[0].value); - } + if (clause->message && bi_opcode_props[clause->message->op].sr_read && + !bi_is_null(clause->message->src[0])) { + unsigned nr = bi_count_read_registers(clause->message, 0); + live_after_temp |= + (BITFIELD64_MASK(nr) << clause->message->src[0].value); + } - /* Since we schedule backwards, we schedule ADD first */ - tuple_state.add = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, false); - tuple->fma = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, true); - tuple->add = tuple_state.add; + /* Since we schedule backwards, we schedule ADD first */ + tuple_state.add = bi_take_instr(ctx, st, &clause_state, &tuple_state, + live_after_temp, false); + tuple->fma = bi_take_instr(ctx, st, &clause_state, &tuple_state, + live_after_temp, true); + tuple->add = tuple_state.add; - /* Update liveness from the new instructions */ - if (tuple->add) - *live = bi_postra_liveness_ins(*live, tuple->add); + /* Update liveness from the new instructions */ + if (tuple->add) + *live = bi_postra_liveness_ins(*live, tuple->add); - if (tuple->fma) - *live = bi_postra_liveness_ins(*live, tuple->fma); + if (tuple->fma) + *live = bi_postra_liveness_ins(*live, tuple->fma); - /* Rotate in the new per-tuple liveness */ - live_after_temp = live_next_tuple; - live_next_tuple = *live; + /* Rotate in the new per-tuple liveness */ + live_after_temp = live_next_tuple; + live_next_tuple = *live; - /* We may have a message, but only one per clause */ - if (tuple->add && bi_must_message(tuple->add)) { - assert(!clause_state.message); - clause_state.message = true; + /* We may have a message, but only one per clause */ + if (tuple->add && bi_must_message(tuple->add)) { + assert(!clause_state.message); + clause_state.message = true; - clause->message_type = - bi_message_type_for_instr(tuple->add); - clause->message = tuple->add; + clause->message_type = bi_message_type_for_instr(tuple->add); + clause->message = tuple->add; - /* We don't need to set dependencies for blend shaders - * because the BLEND instruction in the fragment - * shader should have already done the wait */ - if (!ctx->inputs->is_blend) { - switch (tuple->add->op) { - case BI_OPCODE_ATEST: - clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH); - break; - case BI_OPCODE_LD_TILE: - case BI_OPCODE_ST_TILE: - clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR); - break; - case BI_OPCODE_BLEND: - clause->dependencies |= (1 << 
BIFROST_SLOT_ELDEST_DEPTH); - clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR); - break; - default: - break; - } - } - } + /* We don't need to set dependencies for blend shaders + * because the BLEND instruction in the fragment + * shader should have already done the wait */ + if (!ctx->inputs->is_blend) { + switch (tuple->add->op) { + case BI_OPCODE_ATEST: + clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH); + break; + case BI_OPCODE_LD_TILE: + case BI_OPCODE_ST_TILE: + clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR); + break; + case BI_OPCODE_BLEND: + clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH); + clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR); + break; + default: + break; + } + } + } - clause_state.consts[idx] = bi_get_const_state(&tuple_state); + clause_state.consts[idx] = bi_get_const_state(&tuple_state); - /* Before merging constants, eliminate zeroes, otherwise the - * merging will fight over the #0 that never gets read (and is - * never marked as read by update_fau) */ - if (tuple->fma && bi_reads_zero(tuple->fma)) - bi_rewrite_zero(tuple->fma, true); + /* Before merging constants, eliminate zeroes, otherwise the + * merging will fight over the #0 that never gets read (and is + * never marked as read by update_fau) */ + if (tuple->fma && bi_reads_zero(tuple->fma)) + bi_rewrite_zero(tuple->fma, true); - /* Rewrite away FAU, constant write is deferred */ - if (!tuple_state.constant_count) { - tuple->fau_idx = tuple_state.fau; - bi_rewrite_fau_to_pass(tuple); - } + /* Rewrite away FAU, constant write is deferred */ + if (!tuple_state.constant_count) { + tuple->fau_idx = tuple_state.fau; + bi_rewrite_fau_to_pass(tuple); + } - /* Use passthrough register for cross-stage accesses. Since - * there are just FMA and ADD stages, that means we rewrite to - * passthrough the sources of the ADD that read from the - * destination of the FMA */ + /* Use passthrough register for cross-stage accesses. 
Since + * there are just FMA and ADD stages, that means we rewrite to + * passthrough the sources of the ADD that read from the + * destination of the FMA */ - if (tuple->fma && tuple->fma->nr_dests) { - bi_use_passthrough(tuple->add, tuple->fma->dest[0], - BIFROST_SRC_STAGE, false); - } + if (tuple->fma && tuple->fma->nr_dests) { + bi_use_passthrough(tuple->add, tuple->fma->dest[0], BIFROST_SRC_STAGE, + false); + } - /* Don't add an empty tuple, unless the worklist has nothing - * but a (pseudo)instruction failing to schedule due to a "not - * last instruction" constraint */ + /* Don't add an empty tuple, unless the worklist has nothing + * but a (pseudo)instruction failing to schedule due to a "not + * last instruction" constraint */ - int some_instruction = __bitset_ffs(st.worklist, BITSET_WORDS(st.count)); - bool not_last = (some_instruction > 0) && - bi_must_not_last(st.instructions[some_instruction - 1]); + int some_instruction = __bitset_ffs(st.worklist, BITSET_WORDS(st.count)); + bool not_last = (some_instruction > 0) && + bi_must_not_last(st.instructions[some_instruction - 1]); - bool insert_empty = tuple_state.last && not_last; + bool insert_empty = tuple_state.last && not_last; - if (!(tuple->fma || tuple->add || insert_empty)) - break; + if (!(tuple->fma || tuple->add || insert_empty)) + break; - clause->tuple_count++; + clause->tuple_count++; - /* Adding enough tuple might overflow constants */ - if (!bi_space_for_more_constants(&clause_state)) - break; + /* Adding enough tuple might overflow constants */ + if (!bi_space_for_more_constants(&clause_state)) + break; #ifndef NDEBUG - /* Don't schedule more than 1 tuple if debugging */ - if ((bifrost_debug & BIFROST_DBG_NOSCHED) && !insert_empty) - break; + /* Don't schedule more than 1 tuple if debugging */ + if ((bifrost_debug & BIFROST_DBG_NOSCHED) && !insert_empty) + break; #endif - /* Link through the register state */ - STATIC_ASSERT(sizeof(prev_reads) == sizeof(tuple_state.reg.reads)); - memcpy(prev_reads, tuple_state.reg.reads, sizeof(prev_reads)); - nr_prev_reads = tuple_state.reg.nr_reads; - clause_state.tuple_count++; - } while(clause->tuple_count < 8); + /* Link through the register state */ + STATIC_ASSERT(sizeof(prev_reads) == sizeof(tuple_state.reg.reads)); + memcpy(prev_reads, tuple_state.reg.reads, sizeof(prev_reads)); + nr_prev_reads = tuple_state.reg.nr_reads; + clause_state.tuple_count++; + } while (clause->tuple_count < 8); - /* Don't schedule an empty clause */ - if (!clause->tuple_count) - return NULL; + /* Don't schedule an empty clause */ + if (!clause->tuple_count) + return NULL; - /* Before merging, rewrite away any tuples that read only zero */ - for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) { - bi_tuple *tuple = &clause->tuples[i]; - struct bi_const_state *st = &clause_state.consts[i]; + /* Before merging, rewrite away any tuples that read only zero */ + for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) { + bi_tuple *tuple = &clause->tuples[i]; + struct bi_const_state *st = &clause_state.consts[i]; - if (st->constant_count == 0 || st->constants[0] || st->constants[1] || st->pcrel) - continue; + if (st->constant_count == 0 || st->constants[0] || st->constants[1] || + st->pcrel) + continue; - bi_foreach_instr_in_tuple(tuple, ins) - bi_rewrite_zero(ins, false); + bi_foreach_instr_in_tuple(tuple, ins) + bi_rewrite_zero(ins, false); - /* Constant has been demoted to FAU, so don't pack it separately */ - st->constant_count = 0; + /* Constant has been demoted 
to FAU, so don't pack it separately */ + st->constant_count = 0; - /* Default */ - assert(tuple->fau_idx == BIR_FAU_ZERO); - } + /* Default */ + assert(tuple->fau_idx == BIR_FAU_ZERO); + } - uint64_t constant_pairs[8] = { 0 }; - unsigned pcrel_idx = ~0; - unsigned constant_words = - bi_merge_constants(clause_state.consts, constant_pairs, &pcrel_idx); + uint64_t constant_pairs[8] = {0}; + unsigned pcrel_idx = ~0; + unsigned constant_words = + bi_merge_constants(clause_state.consts, constant_pairs, &pcrel_idx); - constant_words = bi_apply_constant_modifiers(clause_state.consts, - constant_pairs, &pcrel_idx, clause->tuple_count, - constant_words); + constant_words = bi_apply_constant_modifiers( + clause_state.consts, constant_pairs, &pcrel_idx, clause->tuple_count, + constant_words); - clause->pcrel_idx = pcrel_idx; + clause->pcrel_idx = pcrel_idx; - for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) { - bi_tuple *tuple = &clause->tuples[i]; + for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) { + bi_tuple *tuple = &clause->tuples[i]; - /* If no constants, leave FAU as it is, possibly defaulting to 0 */ - if (clause_state.consts[i].constant_count == 0) - continue; + /* If no constants, leave FAU as it is, possibly defaulting to 0 */ + if (clause_state.consts[i].constant_count == 0) + continue; - /* FAU is already handled */ - assert(!tuple->fau_idx); + /* FAU is already handled */ + assert(!tuple->fau_idx); - unsigned word_idx = clause_state.consts[i].word_idx; - assert(word_idx <= 8); + unsigned word_idx = clause_state.consts[i].word_idx; + assert(word_idx <= 8); - /* We could try to merge regardless of bottom bits as well, but - * that's probably diminishing returns */ - uint64_t pair = constant_pairs[word_idx]; - unsigned lo = pair & 0xF; + /* We could try to merge regardless of bottom bits as well, but + * that's probably diminishing returns */ + uint64_t pair = constant_pairs[word_idx]; + unsigned lo = pair & 0xF; - tuple->fau_idx = bi_constant_field(word_idx) | lo; - bi_rewrite_constants_to_pass(tuple, pair, word_idx == pcrel_idx); - } + tuple->fau_idx = bi_constant_field(word_idx) | lo; + bi_rewrite_constants_to_pass(tuple, pair, word_idx == pcrel_idx); + } - clause->constant_count = constant_words; - memcpy(clause->constants, constant_pairs, sizeof(constant_pairs)); + clause->constant_count = constant_words; + memcpy(clause->constants, constant_pairs, sizeof(constant_pairs)); - /* Branches must be last, so this can be factored out */ - bi_instr *last = clause->tuples[max_tuples - 1].add; - clause->next_clause_prefetch = !last || (last->op != BI_OPCODE_JUMP); - clause->block = block; + /* Branches must be last, so this can be factored out */ + bi_instr *last = clause->tuples[max_tuples - 1].add; + clause->next_clause_prefetch = !last || (last->op != BI_OPCODE_JUMP); + clause->block = block; - clause->ftz = (clause_state.ftz == BI_FTZ_STATE_ENABLE); + clause->ftz = (clause_state.ftz == BI_FTZ_STATE_ENABLE); - /* We emit in reverse and emitted to the back of the tuples array, so - * move it up front for easy indexing */ - memmove(clause->tuples, - clause->tuples + (max_tuples - clause->tuple_count), - clause->tuple_count * sizeof(clause->tuples[0])); + /* We emit in reverse and emitted to the back of the tuples array, so + * move it up front for easy indexing */ + memmove(clause->tuples, clause->tuples + (max_tuples - clause->tuple_count), + clause->tuple_count * sizeof(clause->tuples[0])); - /* Use passthrough register for cross-tuple 
accesses. Note this is - * after the memmove, so this is forwards. Skip the first tuple since - * there is nothing before it to passthrough */ + /* Use passthrough register for cross-tuple accesses. Note this is + * after the memmove, so this is forwards. Skip the first tuple since + * there is nothing before it to passthrough */ - for (unsigned t = 1; t < clause->tuple_count; ++t) - bi_rewrite_passthrough(clause->tuples[t - 1], clause->tuples[t]); + for (unsigned t = 1; t < clause->tuple_count; ++t) + bi_rewrite_passthrough(clause->tuples[t - 1], clause->tuples[t]); - return clause; + return clause; } static void bi_schedule_block(bi_context *ctx, bi_block *block) { - list_inithead(&block->clauses); + list_inithead(&block->clauses); - /* Copy list to dynamic array */ - struct bi_worklist st = bi_initialize_worklist(block, - bifrost_debug & BIFROST_DBG_INORDER, - ctx->inputs->is_blend); + /* Copy list to dynamic array */ + struct bi_worklist st = bi_initialize_worklist( + block, bifrost_debug & BIFROST_DBG_INORDER, ctx->inputs->is_blend); - if (!st.count) { - bi_free_worklist(st); - return; - } + if (!st.count) { + bi_free_worklist(st); + return; + } - /* We need to track liveness during scheduling in order to determine whether we can use temporary (passthrough) registers */ - uint64_t live = block->reg_live_out; + /* We need to track liveness during scheduling in order to determine whether + * we can use temporary (passthrough) registers */ + uint64_t live = block->reg_live_out; - /* Schedule as many clauses as needed to fill the block */ - bi_clause *u = NULL; - while((u = bi_schedule_clause(ctx, block, st, &live))) - list_add(&u->link, &block->clauses); + /* Schedule as many clauses as needed to fill the block */ + bi_clause *u = NULL; + while ((u = bi_schedule_clause(ctx, block, st, &live))) + list_add(&u->link, &block->clauses); - /* Back-to-back bit affects only the last clause of a block, - * the rest are implicitly true */ - if (!list_is_empty(&block->clauses)) { - bi_clause *last_clause = list_last_entry(&block->clauses, bi_clause, link); - if (bi_reconverge_branches(block)) - last_clause->flow_control = BIFROST_FLOW_NBTB_UNCONDITIONAL; - } + /* Back-to-back bit affects only the last clause of a block, + * the rest are implicitly true */ + if (!list_is_empty(&block->clauses)) { + bi_clause *last_clause = + list_last_entry(&block->clauses, bi_clause, link); + if (bi_reconverge_branches(block)) + last_clause->flow_control = BIFROST_FLOW_NBTB_UNCONDITIONAL; + } - /* Reorder instructions to match the new schedule. First remove - * existing instructions and then recreate the list */ + /* Reorder instructions to match the new schedule. 
First remove + * existing instructions and then recreate the list */ - bi_foreach_instr_in_block_safe(block, ins) { - list_del(&ins->link); - } + bi_foreach_instr_in_block_safe(block, ins) { + list_del(&ins->link); + } - bi_foreach_clause_in_block(block, clause) { - for (unsigned i = 0; i < clause->tuple_count; ++i) { - bi_foreach_instr_in_tuple(&clause->tuples[i], ins) { - list_addtail(&ins->link, &block->instructions); - } - } - } + bi_foreach_clause_in_block(block, clause) { + for (unsigned i = 0; i < clause->tuple_count; ++i) { + bi_foreach_instr_in_tuple(&clause->tuples[i], ins) { + list_addtail(&ins->link, &block->instructions); + } + } + } - block->scheduled = true; + block->scheduled = true; #ifndef NDEBUG - unsigned i; - bool incomplete = false; + unsigned i; + bool incomplete = false; - BITSET_FOREACH_SET(i, st.worklist, st.count) { - bi_print_instr(st.instructions[i], stderr); - incomplete = true; - } + BITSET_FOREACH_SET(i, st.worklist, st.count) { + bi_print_instr(st.instructions[i], stderr); + incomplete = true; + } - if (incomplete) - unreachable("The above instructions failed to schedule."); + if (incomplete) + unreachable("The above instructions failed to schedule."); #endif - bi_free_worklist(st); + bi_free_worklist(st); } static bool -bi_check_fau_src(bi_instr *ins, unsigned s, uint32_t *constants, unsigned *cwords, bi_index *fau) +bi_check_fau_src(bi_instr *ins, unsigned s, uint32_t *constants, + unsigned *cwords, bi_index *fau) { - assert(s < ins->nr_srcs); - bi_index src = ins->src[s]; + assert(s < ins->nr_srcs); + bi_index src = ins->src[s]; - /* Staging registers can't have FAU accesses */ - if (bi_is_staging_src(ins, s)) - return (src.type != BI_INDEX_CONSTANT) && (src.type != BI_INDEX_FAU); + /* Staging registers can't have FAU accesses */ + if (bi_is_staging_src(ins, s)) + return (src.type != BI_INDEX_CONSTANT) && (src.type != BI_INDEX_FAU); - if (src.type == BI_INDEX_CONSTANT) { - /* Allow fast zero */ - if (src.value == 0 && bi_opcode_props[ins->op].fma && bi_reads_zero(ins)) - return true; + if (src.type == BI_INDEX_CONSTANT) { + /* Allow fast zero */ + if (src.value == 0 && bi_opcode_props[ins->op].fma && bi_reads_zero(ins)) + return true; - if (!bi_is_null(*fau)) - return false; + if (!bi_is_null(*fau)) + return false; - /* Else, try to inline a constant */ - for (unsigned i = 0; i < *cwords; ++i) { - if (src.value == constants[i]) - return true; - } + /* Else, try to inline a constant */ + for (unsigned i = 0; i < *cwords; ++i) { + if (src.value == constants[i]) + return true; + } - if (*cwords >= 2) - return false; + if (*cwords >= 2) + return false; - constants[(*cwords)++] = src.value; - } else if (src.type == BI_INDEX_FAU) { - if (*cwords != 0) - return false; + constants[(*cwords)++] = src.value; + } else if (src.type == BI_INDEX_FAU) { + if (*cwords != 0) + return false; - /* Can only read from one pair of FAU words */ - if (!bi_is_null(*fau) && (src.value != fau->value)) - return false; + /* Can only read from one pair of FAU words */ + if (!bi_is_null(*fau) && (src.value != fau->value)) + return false; - /* If there is a target, we'll need a PC-relative constant */ - if (ins->branch_target) - return false; + /* If there is a target, we'll need a PC-relative constant */ + if (ins->branch_target) + return false; - *fau = src; - } + *fau = src; + } - return true; + return true; } void bi_lower_fau(bi_context *ctx) { - bi_foreach_instr_global_safe(ctx, ins) { - bi_builder b = bi_init_builder(ctx, bi_before_instr(ins)); + 
bi_foreach_instr_global_safe(ctx, ins) { + bi_builder b = bi_init_builder(ctx, bi_before_instr(ins)); - uint32_t constants[2]; - unsigned cwords = 0; - bi_index fau = bi_null(); + uint32_t constants[2]; + unsigned cwords = 0; + bi_index fau = bi_null(); - /* ATEST must have the ATEST datum encoded, not any other - * uniform. See to it this is the case. */ - if (ins->op == BI_OPCODE_ATEST) - fau = ins->src[2]; + /* ATEST must have the ATEST datum encoded, not any other + * uniform. See to it this is the case. */ + if (ins->op == BI_OPCODE_ATEST) + fau = ins->src[2]; - /* Dual texturing requires the texture operation descriptor - * encoded as an immediate so we can fix up. - */ - if (ins->op == BI_OPCODE_TEXC_DUAL) { - assert(ins->src[3].type == BI_INDEX_CONSTANT); - constants[cwords++] = ins->src[3].value; - } + /* Dual texturing requires the texture operation descriptor + * encoded as an immediate so we can fix up. + */ + if (ins->op == BI_OPCODE_TEXC_DUAL) { + assert(ins->src[3].type == BI_INDEX_CONSTANT); + constants[cwords++] = ins->src[3].value; + } - /* Phis get split up into moves so are unrestricted */ - if (ins->op == BI_OPCODE_PHI) - continue; + /* Phis get split up into moves so are unrestricted */ + if (ins->op == BI_OPCODE_PHI) + continue; - bi_foreach_src(ins, s) { - if (bi_check_fau_src(ins, s, constants, &cwords, &fau)) continue; + bi_foreach_src(ins, s) { + if (bi_check_fau_src(ins, s, constants, &cwords, &fau)) + continue; - bi_index copy = bi_mov_i32(&b, ins->src[s]); - bi_replace_src(ins, s, copy); - } - } + bi_index copy = bi_mov_i32(&b, ins->src[s]); + bi_replace_src(ins, s, copy); + } + } } /* Only v7 allows specifying a dependency on the tilebuffer for the first @@ -2072,50 +2085,53 @@ bi_lower_fau(bi_context *ctx) static void bi_add_nop_for_atest(bi_context *ctx) { - /* Only needed on v6 */ - if (ctx->arch >= 7) - return; + /* Only needed on v6 */ + if (ctx->arch >= 7) + return; - if (list_is_empty(&ctx->blocks)) - return; + if (list_is_empty(&ctx->blocks)) + return; - /* Fetch the first clause of the shader */ - bi_block *block = list_first_entry(&ctx->blocks, bi_block, link); - bi_clause *clause = bi_next_clause(ctx, block, NULL); + /* Fetch the first clause of the shader */ + bi_block *block = list_first_entry(&ctx->blocks, bi_block, link); + bi_clause *clause = bi_next_clause(ctx, block, NULL); - if (!clause || !(clause->dependencies & ((1 << BIFROST_SLOT_ELDEST_DEPTH) | - (1 << BIFROST_SLOT_ELDEST_COLOUR)))) - return; + if (!clause || !(clause->dependencies & ((1 << BIFROST_SLOT_ELDEST_DEPTH) | + (1 << BIFROST_SLOT_ELDEST_COLOUR)))) + return; - /* Add a NOP so we can wait for the dependencies required by the first - * clause */ + /* Add a NOP so we can wait for the dependencies required by the first + * clause */ - bi_instr *I = rzalloc(ctx, bi_instr); - I->op = BI_OPCODE_NOP; + bi_instr *I = rzalloc(ctx, bi_instr); + I->op = BI_OPCODE_NOP; - bi_clause *new_clause = ralloc(ctx, bi_clause); - *new_clause = (bi_clause) { - .flow_control = BIFROST_FLOW_NBTB, - .next_clause_prefetch = true, - .block = clause->block, + bi_clause *new_clause = ralloc(ctx, bi_clause); + *new_clause = (bi_clause){ + .flow_control = BIFROST_FLOW_NBTB, + .next_clause_prefetch = true, + .block = clause->block, - .tuple_count = 1, - .tuples[0] = { .fma = I, }, - }; + .tuple_count = 1, + .tuples[0] = + { + .fma = I, + }, + }; - list_add(&new_clause->link, &clause->block->clauses); + list_add(&new_clause->link, &clause->block->clauses); } void bi_schedule(bi_context *ctx) { - /* Fed into 
both scheduling and DCE */ - bi_postra_liveness(ctx); + /* Fed into both scheduling and DCE */ + bi_postra_liveness(ctx); - bi_foreach_block(ctx, block) { - bi_schedule_block(ctx, block); - } + bi_foreach_block(ctx, block) { + bi_schedule_block(ctx, block); + } - bi_opt_dce_post_ra(ctx); - bi_add_nop_for_atest(ctx); + bi_opt_dce_post_ra(ctx); + bi_add_nop_for_atest(ctx); } diff --git a/src/panfrost/bifrost/bi_scoreboard.c b/src/panfrost/bifrost/bi_scoreboard.c index 04aa07b0c1f..735bcf4a677 100644 --- a/src/panfrost/bifrost/bi_scoreboard.c +++ b/src/panfrost/bifrost/bi_scoreboard.c @@ -54,9 +54,9 @@ */ #define BI_NUM_GENERAL_SLOTS 6 -#define BI_NUM_SLOTS 8 -#define BI_NUM_REGISTERS 64 -#define BI_SLOT_SERIAL 0 /* arbitrary */ +#define BI_NUM_SLOTS 8 +#define BI_NUM_REGISTERS 64 +#define BI_SLOT_SERIAL 0 /* arbitrary */ /* * Due to the crude scoreboarding we do, we need to serialize varying loads and @@ -65,26 +65,26 @@ static bool bi_should_serialize(bi_instr *I) { - /* For debug, serialize everything to disable scoreboard opts */ - if (bifrost_debug & BIFROST_DBG_NOSB) - return true; + /* For debug, serialize everything to disable scoreboard opts */ + if (bifrost_debug & BIFROST_DBG_NOSB) + return true; - /* Although nominally on the attribute unit, image loads have the same - * coherency requirements as general memory loads. Serialize them for - * now until we can do something more clever. - */ - if (I->op == BI_OPCODE_LD_ATTR_TEX) - return true; + /* Although nominally on the attribute unit, image loads have the same + * coherency requirements as general memory loads. Serialize them for + * now until we can do something more clever. + */ + if (I->op == BI_OPCODE_LD_ATTR_TEX) + return true; - switch (bi_opcode_props[I->op].message) { - case BIFROST_MESSAGE_VARYING: - case BIFROST_MESSAGE_LOAD: - case BIFROST_MESSAGE_STORE: - case BIFROST_MESSAGE_ATOMIC: - return true; - default: - return false; - } + switch (bi_opcode_props[I->op].message) { + case BIFROST_MESSAGE_VARYING: + case BIFROST_MESSAGE_LOAD: + case BIFROST_MESSAGE_STORE: + case BIFROST_MESSAGE_ATOMIC: + return true; + default: + return false; + } } /* Given a scoreboard model, choose a slot for a clause wrapping a given @@ -93,76 +93,77 @@ bi_should_serialize(bi_instr *I) static unsigned bi_choose_scoreboard_slot(bi_instr *message) { - /* ATEST, ZS_EMIT must be issued with slot #0 */ - if (message->op == BI_OPCODE_ATEST || message->op == BI_OPCODE_ZS_EMIT) - return 0; + /* ATEST, ZS_EMIT must be issued with slot #0 */ + if (message->op == BI_OPCODE_ATEST || message->op == BI_OPCODE_ZS_EMIT) + return 0; - /* BARRIER must be issued with slot #7 */ - if (message->op == BI_OPCODE_BARRIER) - return 7; + /* BARRIER must be issued with slot #7 */ + if (message->op == BI_OPCODE_BARRIER) + return 7; - /* For now, make serialization is easy */ - if (bi_should_serialize(message)) - return BI_SLOT_SERIAL; + /* For now, make serialization is easy */ + if (bi_should_serialize(message)) + return BI_SLOT_SERIAL; - return 0; + return 0; } static uint64_t bi_read_mask(bi_instr *I, bool staging_only) { - uint64_t mask = 0; + uint64_t mask = 0; - if (staging_only && !bi_opcode_props[I->op].sr_read) - return mask; + if (staging_only && !bi_opcode_props[I->op].sr_read) + return mask; - bi_foreach_src(I, s) { - if (I->src[s].type == BI_INDEX_REGISTER) { - unsigned reg = I->src[s].value; - unsigned count = bi_count_read_registers(I, s); + bi_foreach_src(I, s) { + if (I->src[s].type == BI_INDEX_REGISTER) { + unsigned reg = I->src[s].value; + unsigned 
count = bi_count_read_registers(I, s); - mask |= (BITFIELD64_MASK(count) << reg); - } + mask |= (BITFIELD64_MASK(count) << reg); + } - if (staging_only) - break; - } + if (staging_only) + break; + } - return mask; + return mask; } static uint64_t bi_write_mask(bi_instr *I) { - uint64_t mask = 0; + uint64_t mask = 0; - bi_foreach_dest(I, d) { - if (bi_is_null(I->dest[d])) continue; + bi_foreach_dest(I, d) { + if (bi_is_null(I->dest[d])) + continue; - assert(I->dest[d].type == BI_INDEX_REGISTER); + assert(I->dest[d].type == BI_INDEX_REGISTER); - unsigned reg = I->dest[d].value; - unsigned count = bi_count_write_registers(I, d); + unsigned reg = I->dest[d].value; + unsigned count = bi_count_write_registers(I, d); - mask |= (BITFIELD64_MASK(count) << reg); - } + mask |= (BITFIELD64_MASK(count) << reg); + } - /* Instructions like AXCHG.i32 unconditionally both read and write - * staging registers. Even if we discard the result, the write still - * happens logically and needs to be included in our calculations. - * Obscurely, ATOM_CX is sr_write but can ignore the staging register in - * certain circumstances; this does not require consideration. - */ - if (bi_opcode_props[I->op].sr_write && I->nr_dests && I->nr_srcs && - bi_is_null(I->dest[0]) && !bi_is_null(I->src[0])) { + /* Instructions like AXCHG.i32 unconditionally both read and write + * staging registers. Even if we discard the result, the write still + * happens logically and needs to be included in our calculations. + * Obscurely, ATOM_CX is sr_write but can ignore the staging register in + * certain circumstances; this does not require consideration. + */ + if (bi_opcode_props[I->op].sr_write && I->nr_dests && I->nr_srcs && + bi_is_null(I->dest[0]) && !bi_is_null(I->src[0])) { - unsigned reg = I->src[0].value; - unsigned count = bi_count_write_registers(I, 0); + unsigned reg = I->src[0].value; + unsigned count = bi_count_write_registers(I, 0); - mask |= (BITFIELD64_MASK(count) << reg); - } + mask |= (BITFIELD64_MASK(count) << reg); + } - return mask; + return mask; } /* Update the scoreboard model to assign an instruction to a given slot */ @@ -170,140 +171,143 @@ bi_write_mask(bi_instr *I) static void bi_push_clause(struct bi_scoreboard_state *st, bi_clause *clause) { - bi_instr *I = clause->message; - unsigned slot = clause->scoreboard_id; + bi_instr *I = clause->message; + unsigned slot = clause->scoreboard_id; - if (!I) - return; + if (!I) + return; - st->read[slot] |= bi_read_mask(I, true); + st->read[slot] |= bi_read_mask(I, true); - if (bi_opcode_props[I->op].sr_write) - st->write[slot] |= bi_write_mask(I); + if (bi_opcode_props[I->op].sr_write) + st->write[slot] |= bi_write_mask(I); } /* Adds a dependency on each slot writing any specified register */ static void -bi_depend_on_writers(bi_clause *clause, struct bi_scoreboard_state *st, uint64_t regmask) +bi_depend_on_writers(bi_clause *clause, struct bi_scoreboard_state *st, + uint64_t regmask) { - for (unsigned slot = 0; slot < ARRAY_SIZE(st->write); ++slot) { - if (!(st->write[slot] & regmask)) - continue; + for (unsigned slot = 0; slot < ARRAY_SIZE(st->write); ++slot) { + if (!(st->write[slot] & regmask)) + continue; - st->write[slot] = 0; - st->read[slot] = 0; + st->write[slot] = 0; + st->read[slot] = 0; - clause->dependencies |= BITFIELD_BIT(slot); - } + clause->dependencies |= BITFIELD_BIT(slot); + } } static void -bi_set_staging_barrier(bi_clause *clause, struct bi_scoreboard_state *st, uint64_t regmask) +bi_set_staging_barrier(bi_clause *clause, struct 
bi_scoreboard_state *st, + uint64_t regmask) { - for (unsigned slot = 0; slot < ARRAY_SIZE(st->read); ++slot) { - if (!(st->read[slot] & regmask)) - continue; + for (unsigned slot = 0; slot < ARRAY_SIZE(st->read); ++slot) { + if (!(st->read[slot] & regmask)) + continue; - st->read[slot] = 0; - clause->staging_barrier = true; - } + st->read[slot] = 0; + clause->staging_barrier = true; + } } /* Sets the dependencies for a given clause, updating the model */ static void -bi_set_dependencies(bi_block *block, bi_clause *clause, struct bi_scoreboard_state *st) +bi_set_dependencies(bi_block *block, bi_clause *clause, + struct bi_scoreboard_state *st) { - bi_foreach_instr_in_clause(block, clause, I) { - uint64_t read = bi_read_mask(I, false); - uint64_t written = bi_write_mask(I); + bi_foreach_instr_in_clause(block, clause, I) { + uint64_t read = bi_read_mask(I, false); + uint64_t written = bi_write_mask(I); - /* Read-after-write; write-after-write */ - bi_depend_on_writers(clause, st, read | written); + /* Read-after-write; write-after-write */ + bi_depend_on_writers(clause, st, read | written); - /* Write-after-read */ - bi_set_staging_barrier(clause, st, written); - } + /* Write-after-read */ + bi_set_staging_barrier(clause, st, written); + } - /* LD_VAR instructions must be serialized per-quad. Just always depend - * on any LD_VAR instructions. This isn't optimal, but doing better - * requires divergence-aware data flow analysis. - * - * Similarly, memory loads/stores need to be synchronized. For now, - * force them to be serialized. This is not optimal. - */ - if (clause->message && bi_should_serialize(clause->message)) - clause->dependencies |= BITFIELD_BIT(BI_SLOT_SERIAL); + /* LD_VAR instructions must be serialized per-quad. Just always depend + * on any LD_VAR instructions. This isn't optimal, but doing better + * requires divergence-aware data flow analysis. + * + * Similarly, memory loads/stores need to be synchronized. For now, + * force them to be serialized. This is not optimal. + */ + if (clause->message && bi_should_serialize(clause->message)) + clause->dependencies |= BITFIELD_BIT(BI_SLOT_SERIAL); - /* Barriers must wait on all slots to flush existing work. It might be - * possible to skip this with more information about the barrier. For - * now, be conservative. - */ - if (clause->message && clause->message->op == BI_OPCODE_BARRIER) - clause->dependencies |= BITFIELD_MASK(BI_NUM_GENERAL_SLOTS); + /* Barriers must wait on all slots to flush existing work. It might be + * possible to skip this with more information about the barrier. For + * now, be conservative. 
+ */ + if (clause->message && clause->message->op == BI_OPCODE_BARRIER) + clause->dependencies |= BITFIELD_MASK(BI_NUM_GENERAL_SLOTS); } static bool scoreboard_block_update(bi_block *blk) { - bool progress = false; + bool progress = false; - /* pending_in[s] = sum { p in pred[s] } ( pending_out[p] ) */ - bi_foreach_predecessor(blk, pred) { - for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) { - blk->scoreboard_in.read[i] |= (*pred)->scoreboard_out.read[i]; - blk->scoreboard_in.write[i] |= (*pred)->scoreboard_out.write[i]; - } - } + /* pending_in[s] = sum { p in pred[s] } ( pending_out[p] ) */ + bi_foreach_predecessor(blk, pred) { + for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) { + blk->scoreboard_in.read[i] |= (*pred)->scoreboard_out.read[i]; + blk->scoreboard_in.write[i] |= (*pred)->scoreboard_out.write[i]; + } + } - struct bi_scoreboard_state state = blk->scoreboard_in; + struct bi_scoreboard_state state = blk->scoreboard_in; - /* Assign locally */ + /* Assign locally */ - bi_foreach_clause_in_block(blk, clause) { - bi_set_dependencies(blk, clause, &state); - bi_push_clause(&state, clause); - } + bi_foreach_clause_in_block(blk, clause) { + bi_set_dependencies(blk, clause, &state); + bi_push_clause(&state, clause); + } - /* To figure out progress, diff scoreboard_out */ + /* To figure out progress, diff scoreboard_out */ - for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) - progress |= !!memcmp(&state, &blk->scoreboard_out, sizeof(state)); + for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) + progress |= !!memcmp(&state, &blk->scoreboard_out, sizeof(state)); - blk->scoreboard_out = state; + blk->scoreboard_out = state; - return progress; + return progress; } void bi_assign_scoreboard(bi_context *ctx) { - u_worklist worklist; - bi_worklist_init(ctx, &worklist); + u_worklist worklist; + bi_worklist_init(ctx, &worklist); - /* First, assign slots. */ - bi_foreach_block(ctx, block) { - bi_foreach_clause_in_block(block, clause) { - if (clause->message) { - unsigned slot = bi_choose_scoreboard_slot(clause->message); - clause->scoreboard_id = slot; - } - } + /* First, assign slots. 
*/ + bi_foreach_block(ctx, block) { + bi_foreach_clause_in_block(block, clause) { + if (clause->message) { + unsigned slot = bi_choose_scoreboard_slot(clause->message); + clause->scoreboard_id = slot; + } + } - bi_worklist_push_tail(&worklist, block); - } + bi_worklist_push_tail(&worklist, block); + } - /* Next, perform forward data flow analysis to calculate dependencies */ - while (!u_worklist_is_empty(&worklist)) { - /* Pop from the front for forward analysis */ - bi_block *blk = bi_worklist_pop_head(&worklist); + /* Next, perform forward data flow analysis to calculate dependencies */ + while (!u_worklist_is_empty(&worklist)) { + /* Pop from the front for forward analysis */ + bi_block *blk = bi_worklist_pop_head(&worklist); - if (scoreboard_block_update(blk)) { - bi_foreach_successor(blk, succ) - bi_worklist_push_tail(&worklist, succ); - } - } + if (scoreboard_block_update(blk)) { + bi_foreach_successor(blk, succ) + bi_worklist_push_tail(&worklist, succ); + } + } - u_worklist_fini(&worklist); + u_worklist_fini(&worklist); } diff --git a/src/panfrost/bifrost/bi_test.h b/src/panfrost/bifrost/bi_test.h index 4b54944c0c4..00247c10e25 100644 --- a/src/panfrost/bifrost/bi_test.h +++ b/src/panfrost/bifrost/bi_test.h @@ -27,38 +27,38 @@ #ifndef __BI_TEST_H #define __BI_TEST_H -#include #include +#include #include "compiler.h" /* Helper to generate a bi_builder suitable for creating test instructions */ static inline bi_block * bit_block(bi_context *ctx) { - bi_block *blk = rzalloc(ctx, bi_block); + bi_block *blk = rzalloc(ctx, bi_block); - util_dynarray_init(&blk->predecessors, blk); - list_addtail(&blk->link, &ctx->blocks); - list_inithead(&blk->instructions); + util_dynarray_init(&blk->predecessors, blk); + list_addtail(&blk->link, &ctx->blocks); + list_inithead(&blk->instructions); - blk->index = ctx->num_blocks++; + blk->index = ctx->num_blocks++; - return blk; + return blk; } static inline bi_builder * bit_builder(void *memctx) { - bi_context *ctx = rzalloc(memctx, bi_context); - list_inithead(&ctx->blocks); - ctx->inputs = rzalloc(memctx, struct panfrost_compile_inputs); + bi_context *ctx = rzalloc(memctx, bi_context); + list_inithead(&ctx->blocks); + ctx->inputs = rzalloc(memctx, struct panfrost_compile_inputs); - bi_block *blk = bit_block(ctx); + bi_block *blk = bit_block(ctx); - bi_builder *b = rzalloc(memctx, bi_builder); - b->shader = ctx; - b->cursor = bi_after_block(blk); - return b; + bi_builder *b = rzalloc(memctx, bi_builder); + b->shader = ctx; + b->cursor = bi_after_block(blk); + return b; } /* Helper to compare for logical equality of instructions. 
Need to skip over @@ -69,14 +69,15 @@ bit_instr_equal(bi_instr *A, bi_instr *B) { size_t skip = sizeof(struct list_head) + 2 * sizeof(bi_index *); - if (memcmp((uint8_t *) A + skip, (uint8_t *) B + skip, sizeof(bi_instr) - skip)) - return false; + if (memcmp((uint8_t *)A + skip, (uint8_t *)B + skip, + sizeof(bi_instr) - skip)) + return false; if (memcmp(A->dest, B->dest, sizeof(bi_index) * A->nr_dests)) - return false; + return false; if (memcmp(A->src, B->src, sizeof(bi_index) * A->nr_srcs)) - return false; + return false; return true; } @@ -87,8 +88,9 @@ bit_block_equal(bi_block *A, bi_block *B) if (list_length(&A->instructions) != list_length(&B->instructions)) return false; - list_pair_for_each_entry(bi_instr, insA, insB, - &A->instructions, &B->instructions, link) { + list_pair_for_each_entry(bi_instr, insA, insB, &A->instructions, + &B->instructions, link) + { if (!bit_instr_equal(insA, insB)) return false; } @@ -102,8 +104,9 @@ bit_shader_equal(bi_context *A, bi_context *B) if (list_length(&A->blocks) != list_length(&B->blocks)) return false; - list_pair_for_each_entry(bi_block, blockA, blockB, - &A->blocks, &B->blocks, link) { + list_pair_for_each_entry(bi_block, blockA, blockB, &A->blocks, &B->blocks, + link) + { if (!bit_block_equal(blockA, blockB)) return false; } @@ -111,30 +114,31 @@ bit_shader_equal(bi_context *A, bi_context *B) return true; } -#define ASSERT_SHADER_EQUAL(A, B) \ - if (!bit_shader_equal(A, B)) { \ - ADD_FAILURE(); \ - fprintf(stderr, "Pass produced unexpected results"); \ - fprintf(stderr, " Actual:\n"); \ - bi_print_shader(A, stderr); \ - fprintf(stderr, " Expected:\n"); \ - bi_print_shader(B, stderr); \ - fprintf(stderr, "\n"); \ - } \ +#define ASSERT_SHADER_EQUAL(A, B) \ + if (!bit_shader_equal(A, B)) { \ + ADD_FAILURE(); \ + fprintf(stderr, "Pass produced unexpected results"); \ + fprintf(stderr, " Actual:\n"); \ + bi_print_shader(A, stderr); \ + fprintf(stderr, " Expected:\n"); \ + bi_print_shader(B, stderr); \ + fprintf(stderr, "\n"); \ + } -#define INSTRUCTION_CASE(instr, expected, pass) do { \ - bi_builder *A = bit_builder(mem_ctx); \ - bi_builder *B = bit_builder(mem_ctx); \ - { \ - bi_builder *b = A; \ - instr; \ - } \ - { \ - bi_builder *b = B; \ - expected; \ - } \ - pass(A->shader); \ - ASSERT_SHADER_EQUAL(A->shader, B->shader); \ -} while(0) +#define INSTRUCTION_CASE(instr, expected, pass) \ + do { \ + bi_builder *A = bit_builder(mem_ctx); \ + bi_builder *B = bit_builder(mem_ctx); \ + { \ + bi_builder *b = A; \ + instr; \ + } \ + { \ + bi_builder *b = B; \ + expected; \ + } \ + pass(A->shader); \ + ASSERT_SHADER_EQUAL(A->shader, B->shader); \ + } while (0) #endif diff --git a/src/panfrost/bifrost/bi_validate.c b/src/panfrost/bifrost/bi_validate.c index 807aff18531..05b9cfa0347 100644 --- a/src/panfrost/bifrost/bi_validate.c +++ b/src/panfrost/bifrost/bi_validate.c @@ -21,8 +21,8 @@ * SOFTWARE. 
*/ -#include "compiler.h" #include "util/u_memory.h" +#include "compiler.h" /* Validatation doesn't make sense in release builds */ #ifndef NDEBUG @@ -35,21 +35,21 @@ bool bi_validate_initialization(bi_context *ctx) { - bool success = true; + bool success = true; - /* Calculate the live set */ - bi_block *entry = bi_entry_block(ctx); - bi_compute_liveness_ssa(ctx); + /* Calculate the live set */ + bi_block *entry = bi_entry_block(ctx); + bi_compute_liveness_ssa(ctx); - /* Validate that the live set is indeed empty */ - for (unsigned i = 0; i < ctx->ssa_alloc; ++i) { - if (BITSET_TEST(entry->ssa_live_in, i)) { - fprintf(stderr, "%u\n", i); - success = false; - } - } + /* Validate that the live set is indeed empty */ + for (unsigned i = 0; i < ctx->ssa_alloc; ++i) { + if (BITSET_TEST(entry->ssa_live_in, i)) { + fprintf(stderr, "%u\n", i); + success = false; + } + } - return success; + return success; } /* @@ -60,47 +60,46 @@ bi_validate_initialization(bi_context *ctx) static bool bi_validate_preload(bi_context *ctx) { - bool start = true; - uint64_t preloaded = 0; + bool start = true; + uint64_t preloaded = 0; - bi_foreach_block(ctx, block) { - bi_foreach_instr_in_block(block, I) { - /* No instruction should have a register destination */ - bi_foreach_dest(I, d) { - if (I->dest[d].type == BI_INDEX_REGISTER) - return false; - } + bi_foreach_block(ctx, block) { + bi_foreach_instr_in_block(block, I) { + /* No instruction should have a register destination */ + bi_foreach_dest(I, d) { + if (I->dest[d].type == BI_INDEX_REGISTER) + return false; + } - /* Preloads are register moves at the start */ - bool is_preload = - start && I->op == BI_OPCODE_MOV_I32 && - I->src[0].type == BI_INDEX_REGISTER; + /* Preloads are register moves at the start */ + bool is_preload = start && I->op == BI_OPCODE_MOV_I32 && + I->src[0].type == BI_INDEX_REGISTER; - /* After the first nonpreload, we're done preloading */ - start &= is_preload; + /* After the first nonpreload, we're done preloading */ + start &= is_preload; - /* Only preloads may have a register source */ - bi_foreach_src(I, s) { - if (I->src[s].type == BI_INDEX_REGISTER && !is_preload) - return false; - } + /* Only preloads may have a register source */ + bi_foreach_src(I, s) { + if (I->src[s].type == BI_INDEX_REGISTER && !is_preload) + return false; + } - /* Check uniqueness */ - if (is_preload) { - unsigned r = I->src[0].value; + /* Check uniqueness */ + if (is_preload) { + unsigned r = I->src[0].value; - if (preloaded & BITFIELD64_BIT(r)) - return false; + if (preloaded & BITFIELD64_BIT(r)) + return false; - preloaded |= BITFIELD64_BIT(r); - } - } + preloaded |= BITFIELD64_BIT(r); + } + } - /* Only the first block may preload */ - start = false; - } + /* Only the first block may preload */ + start = false; + } - return true; + return true; } /* @@ -111,38 +110,37 @@ bi_validate_preload(bi_context *ctx) static bool bi_validate_width(bi_context *ctx) { - bool succ = true; - uint8_t *width = calloc(ctx->ssa_alloc, sizeof(uint8_t)); + bool succ = true; + uint8_t *width = calloc(ctx->ssa_alloc, sizeof(uint8_t)); - bi_foreach_instr_global(ctx, I) { - bi_foreach_dest(I, d) { - assert(bi_is_ssa(I->dest[d])); + bi_foreach_instr_global(ctx, I) { + bi_foreach_dest(I, d) { + assert(bi_is_ssa(I->dest[d])); - unsigned v = I->dest[d].value; - assert(width[v] == 0 && "broken SSA"); + unsigned v = I->dest[d].value; + assert(width[v] == 0 && "broken SSA"); - width[v] = bi_count_write_registers(I, d); - } - } + width[v] = bi_count_write_registers(I, d); + } + } - 
bi_foreach_instr_global(ctx, I) { - bi_foreach_ssa_src(I, s) { - unsigned v = I->src[s].value; - unsigned n = bi_count_read_registers(I, s); + bi_foreach_instr_global(ctx, I) { + bi_foreach_ssa_src(I, s) { + unsigned v = I->src[s].value; + unsigned n = bi_count_read_registers(I, s); - if (width[v] != n) { - succ = false; - fprintf(stderr, - "source %u, expected width %u, got width %u\n", - s, n, width[v]); - bi_print_instr(I, stderr); - fprintf(stderr, "\n"); - } - } - } + if (width[v] != n) { + succ = false; + fprintf(stderr, "source %u, expected width %u, got width %u\n", s, + n, width[v]); + bi_print_instr(I, stderr); + fprintf(stderr, "\n"); + } + } + } - free(width); - return succ; + free(width); + return succ; } /* @@ -151,20 +149,20 @@ bi_validate_width(bi_context *ctx) static bool bi_validate_dest(bi_context *ctx) { - bool succ = true; + bool succ = true; - bi_foreach_instr_global(ctx, I) { - bi_foreach_dest(I, d) { - if (bi_is_null(I->dest[d])) { - succ = false; - fprintf(stderr, "expected dest %u", d); - bi_print_instr(I, stderr); - fprintf(stderr, "\n"); - } - } - } + bi_foreach_instr_global(ctx, I) { + bi_foreach_dest(I, d) { + if (bi_is_null(I->dest[d])) { + succ = false; + fprintf(stderr, "expected dest %u", d); + bi_print_instr(I, stderr); + fprintf(stderr, "\n"); + } + } + } - return succ; + return succ; } /* @@ -173,57 +171,57 @@ bi_validate_dest(bi_context *ctx) static bool bi_validate_phi_ordering(bi_context *ctx) { - bi_foreach_block(ctx, block) { - bool start = true; + bi_foreach_block(ctx, block) { + bool start = true; - bi_foreach_instr_in_block(block, I) { - if (start) - start = I->op == BI_OPCODE_PHI; - else if (I->op == BI_OPCODE_PHI) - return false; - } - } + bi_foreach_instr_in_block(block, I) { + if (start) + start = I->op == BI_OPCODE_PHI; + else if (I->op == BI_OPCODE_PHI) + return false; + } + } - return true; + return true; } void bi_validate(bi_context *ctx, const char *after) { - bool fail = false; + bool fail = false; - if (bifrost_debug & BIFROST_DBG_NOVALIDATE) - return; + if (bifrost_debug & BIFROST_DBG_NOVALIDATE) + return; - if (!bi_validate_initialization(ctx)) { - fprintf(stderr, "Uninitialized data read after %s\n", after); - fail = true; - } + if (!bi_validate_initialization(ctx)) { + fprintf(stderr, "Uninitialized data read after %s\n", after); + fail = true; + } - if (!bi_validate_preload(ctx)) { - fprintf(stderr, "Unexpected preload after %s\n", after); - fail = true; - } + if (!bi_validate_preload(ctx)) { + fprintf(stderr, "Unexpected preload after %s\n", after); + fail = true; + } - if (!bi_validate_width(ctx)) { - fprintf(stderr, "Unexpected vector with after %s\n", after); - fail = true; - } + if (!bi_validate_width(ctx)) { + fprintf(stderr, "Unexpected vector with after %s\n", after); + fail = true; + } - if (!bi_validate_dest(ctx)) { - fprintf(stderr, "Unexpected source/dest after %s\n", after); - fail = true; - } + if (!bi_validate_dest(ctx)) { + fprintf(stderr, "Unexpected source/dest after %s\n", after); + fail = true; + } - if (!bi_validate_phi_ordering(ctx)) { - fprintf(stderr, "Unexpected phi ordering after %s\n", after); - fail = true; - } + if (!bi_validate_phi_ordering(ctx)) { + fprintf(stderr, "Unexpected phi ordering after %s\n", after); + fail = true; + } - if (fail) { - bi_print_shader(ctx, stderr); - exit(1); - } + if (fail) { + bi_print_shader(ctx, stderr); + exit(1); + } } #endif /* NDEBUG */ diff --git a/src/panfrost/bifrost/bifrost.h b/src/panfrost/bifrost/bifrost.h index b5a9b7e49ab..2fa43f368b1 100644 --- 
a/src/panfrost/bifrost/bifrost.h +++ b/src/panfrost/bifrost/bifrost.h @@ -26,63 +26,63 @@ #ifndef __bifrost_h__ #define __bifrost_h__ -#include -#include -#include #include +#include +#include +#include #ifdef __cplusplus extern "C" { #endif -#define BIFROST_DBG_MSGS 0x0001 -#define BIFROST_DBG_SHADERS 0x0002 -#define BIFROST_DBG_SHADERDB 0x0004 -#define BIFROST_DBG_VERBOSE 0x0008 -#define BIFROST_DBG_INTERNAL 0x0010 -#define BIFROST_DBG_NOSCHED 0x0020 -#define BIFROST_DBG_INORDER 0x0040 -#define BIFROST_DBG_NOVALIDATE 0x0080 -#define BIFROST_DBG_NOOPT 0x0100 -#define BIFROST_DBG_NOIDVS 0x0200 -#define BIFROST_DBG_NOSB 0x0400 -#define BIFROST_DBG_NOPRELOAD 0x0800 -#define BIFROST_DBG_SPILL 0x1000 -#define BIFROST_DBG_NOPSCHED 0x2000 +#define BIFROST_DBG_MSGS 0x0001 +#define BIFROST_DBG_SHADERS 0x0002 +#define BIFROST_DBG_SHADERDB 0x0004 +#define BIFROST_DBG_VERBOSE 0x0008 +#define BIFROST_DBG_INTERNAL 0x0010 +#define BIFROST_DBG_NOSCHED 0x0020 +#define BIFROST_DBG_INORDER 0x0040 +#define BIFROST_DBG_NOVALIDATE 0x0080 +#define BIFROST_DBG_NOOPT 0x0100 +#define BIFROST_DBG_NOIDVS 0x0200 +#define BIFROST_DBG_NOSB 0x0400 +#define BIFROST_DBG_NOPRELOAD 0x0800 +#define BIFROST_DBG_SPILL 0x1000 +#define BIFROST_DBG_NOPSCHED 0x2000 extern int bifrost_debug; enum bifrost_message_type { - BIFROST_MESSAGE_NONE = 0, - BIFROST_MESSAGE_VARYING = 1, - BIFROST_MESSAGE_ATTRIBUTE = 2, - BIFROST_MESSAGE_TEX = 3, - BIFROST_MESSAGE_VARTEX = 4, - BIFROST_MESSAGE_LOAD = 5, - BIFROST_MESSAGE_STORE = 6, - BIFROST_MESSAGE_ATOMIC = 7, - BIFROST_MESSAGE_BARRIER = 8, - BIFROST_MESSAGE_BLEND = 9, - BIFROST_MESSAGE_TILE = 10, - /* type 11 reserved */ - BIFROST_MESSAGE_Z_STENCIL = 12, - BIFROST_MESSAGE_ATEST = 13, - BIFROST_MESSAGE_JOB = 14, - BIFROST_MESSAGE_64BIT = 15 + BIFROST_MESSAGE_NONE = 0, + BIFROST_MESSAGE_VARYING = 1, + BIFROST_MESSAGE_ATTRIBUTE = 2, + BIFROST_MESSAGE_TEX = 3, + BIFROST_MESSAGE_VARTEX = 4, + BIFROST_MESSAGE_LOAD = 5, + BIFROST_MESSAGE_STORE = 6, + BIFROST_MESSAGE_ATOMIC = 7, + BIFROST_MESSAGE_BARRIER = 8, + BIFROST_MESSAGE_BLEND = 9, + BIFROST_MESSAGE_TILE = 10, + /* type 11 reserved */ + BIFROST_MESSAGE_Z_STENCIL = 12, + BIFROST_MESSAGE_ATEST = 13, + BIFROST_MESSAGE_JOB = 14, + BIFROST_MESSAGE_64BIT = 15 }; enum bifrost_ftz { - BIFROST_FTZ_DISABLE = 0, - BIFROST_FTZ_DX11 = 1, - BIFROST_FTZ_ALWAYS = 2, - BIFROST_FTZ_ABRUPT = 3 + BIFROST_FTZ_DISABLE = 0, + BIFROST_FTZ_DX11 = 1, + BIFROST_FTZ_ALWAYS = 2, + BIFROST_FTZ_ABRUPT = 3 }; enum bifrost_exceptions { - BIFROST_EXCEPTIONS_ENABLED = 0, - BIFROST_EXCEPTIONS_DISABLED = 1, - BIFROST_EXCEPTIONS_PRECISE_DIVISION = 2, - BIFROST_EXCEPTIONS_PRECISE_SQRT = 3, + BIFROST_EXCEPTIONS_ENABLED = 0, + BIFROST_EXCEPTIONS_DISABLED = 1, + BIFROST_EXCEPTIONS_PRECISE_DIVISION = 2, + BIFROST_EXCEPTIONS_PRECISE_SQRT = 3, }; /* Describes clause flow control, with respect to control flow and branch @@ -102,182 +102,182 @@ enum bifrost_exceptions { */ enum bifrost_flow { - /* End-of-shader */ - BIFROST_FLOW_END = 0, + /* End-of-shader */ + BIFROST_FLOW_END = 0, - /* Non back-to-back, PC-encoded reconvergence */ - BIFROST_FLOW_NBTB_PC = 1, + /* Non back-to-back, PC-encoded reconvergence */ + BIFROST_FLOW_NBTB_PC = 1, - /* Non back-to-back, unconditional reconvergence */ - BIFROST_FLOW_NBTB_UNCONDITIONAL = 2, + /* Non back-to-back, unconditional reconvergence */ + BIFROST_FLOW_NBTB_UNCONDITIONAL = 2, - /* Non back-to-back, no reconvergence */ - BIFROST_FLOW_NBTB = 3, + /* Non back-to-back, no reconvergence */ + BIFROST_FLOW_NBTB = 3, - /* Back-to-back, unconditional 
reconvergence */ - BIFROST_FLOW_BTB_UNCONDITIONAL = 4, + /* Back-to-back, unconditional reconvergence */ + BIFROST_FLOW_BTB_UNCONDITIONAL = 4, - /* Back-to-back, no reconvergence */ - BIFROST_FLOW_BTB_NONE = 5, + /* Back-to-back, no reconvergence */ + BIFROST_FLOW_BTB_NONE = 5, - /* Write elision, unconditional reconvergence */ - BIFROST_FLOW_WE_UNCONDITIONAL = 6, + /* Write elision, unconditional reconvergence */ + BIFROST_FLOW_WE_UNCONDITIONAL = 6, - /* Write elision, no reconvergence */ - BIFROST_FLOW_WE = 7, + /* Write elision, no reconvergence */ + BIFROST_FLOW_WE = 7, }; enum bifrost_slot { - /* 0-5 are general purpose */ - BIFROST_SLOT_ELDEST_DEPTH = 6, - BIFROST_SLOT_ELDEST_COLOUR = 7, + /* 0-5 are general purpose */ + BIFROST_SLOT_ELDEST_DEPTH = 6, + BIFROST_SLOT_ELDEST_COLOUR = 7, }; struct bifrost_header { - /* Reserved */ - unsigned zero1 : 5; + /* Reserved */ + unsigned zero1 : 5; - /* Flush-to-zero mode, leave zero for GL */ - enum bifrost_ftz flush_to_zero : 2; + /* Flush-to-zero mode, leave zero for GL */ + enum bifrost_ftz flush_to_zero : 2; - /* Convert any infinite result of any floating-point operation to the - * biggest representable number */ - unsigned suppress_inf: 1; + /* Convert any infinite result of any floating-point operation to the + * biggest representable number */ + unsigned suppress_inf : 1; - /* Convert NaN to +0.0 */ - unsigned suppress_nan : 1; + /* Convert NaN to +0.0 */ + unsigned suppress_nan : 1; - /* Floating-point excception handling mode */ - enum bifrost_exceptions float_exceptions : 2; + /* Floating-point excception handling mode */ + enum bifrost_exceptions float_exceptions : 2; - /* Enum describing the flow control, which matters for handling - * divergence and reconvergence efficiently */ - enum bifrost_flow flow_control : 3; + /* Enum describing the flow control, which matters for handling + * divergence and reconvergence efficiently */ + enum bifrost_flow flow_control : 3; - /* Reserved */ - unsigned zero2 : 1; + /* Reserved */ + unsigned zero2 : 1; - /* Terminate discarded threads, rather than continuing execution. Set - * for fragment shaders for standard GL behaviour of DISCARD. Also in a - * fragment shader, this disables helper invocations, so cannot be used - * in a shader that requires derivatives or texture LOD computation */ - unsigned terminate_discarded_threads : 1; + /* Terminate discarded threads, rather than continuing execution. Set + * for fragment shaders for standard GL behaviour of DISCARD. Also in a + * fragment shader, this disables helper invocations, so cannot be used + * in a shader that requires derivatives or texture LOD computation */ + unsigned terminate_discarded_threads : 1; - /* If set, the hardware may prefetch the next clause. If false, the - * hardware may not. Clear for unconditional branches. */ - unsigned next_clause_prefetch : 1; + /* If set, the hardware may prefetch the next clause. If false, the + * hardware may not. Clear for unconditional branches. */ + unsigned next_clause_prefetch : 1; - /* If set, a barrier will be inserted after the clause waiting for all - * message passing instructions to read their staging registers, such - * that it is safe for the next clause to write them. */ - unsigned staging_barrier: 1; - unsigned staging_register : 6; + /* If set, a barrier will be inserted after the clause waiting for all + * message passing instructions to read their staging registers, such + * that it is safe for the next clause to write them. 
*/ + unsigned staging_barrier : 1; + unsigned staging_register : 6; - /* Slots to wait on and slot to be used for message passing - * instructions respectively */ - unsigned dependency_wait : 8; - unsigned dependency_slot : 3; + /* Slots to wait on and slot to be used for message passing + * instructions respectively */ + unsigned dependency_wait : 8; + unsigned dependency_slot : 3; - enum bifrost_message_type message_type : 5; - enum bifrost_message_type next_message_type : 5; + enum bifrost_message_type message_type : 5; + enum bifrost_message_type next_message_type : 5; } __attribute__((packed)); enum bifrost_packed_src { - BIFROST_SRC_PORT0 = 0, - BIFROST_SRC_PORT1 = 1, - BIFROST_SRC_PORT2 = 2, - BIFROST_SRC_STAGE = 3, - BIFROST_SRC_FAU_LO = 4, - BIFROST_SRC_FAU_HI = 5, - BIFROST_SRC_PASS_FMA = 6, - BIFROST_SRC_PASS_ADD = 7, + BIFROST_SRC_PORT0 = 0, + BIFROST_SRC_PORT1 = 1, + BIFROST_SRC_PORT2 = 2, + BIFROST_SRC_STAGE = 3, + BIFROST_SRC_FAU_LO = 4, + BIFROST_SRC_FAU_HI = 5, + BIFROST_SRC_PASS_FMA = 6, + BIFROST_SRC_PASS_ADD = 7, }; struct bifrost_fma_inst { - unsigned src0 : 3; - unsigned op : 20; + unsigned src0 : 3; + unsigned op : 20; } __attribute__((packed)); struct bifrost_add_inst { - unsigned src0 : 3; - unsigned op : 17; + unsigned src0 : 3; + unsigned op : 17; } __attribute__((packed)); enum branch_bit_size { - BR_SIZE_32 = 0, - BR_SIZE_16XX = 1, - BR_SIZE_16YY = 2, - // For the above combinations of bitsize and location, an extra bit is - // encoded via comparing the sources. The only possible source of ambiguity - // would be if the sources were the same, but then the branch condition - // would be always true or always false anyways, so we can ignore it. But - // this no longer works when comparing the y component to the x component, - // since it's valid to compare the y component of a source against its own - // x component. Instead, the extra bit is encoded via an extra bitsize. - BR_SIZE_16YX0 = 3, - BR_SIZE_16YX1 = 4, - BR_SIZE_32_AND_16X = 5, - BR_SIZE_32_AND_16Y = 6, - // Used for comparisons with zero and always-true, see below. I think this - // only works for integer comparisons. - BR_SIZE_ZERO = 7, + BR_SIZE_32 = 0, + BR_SIZE_16XX = 1, + BR_SIZE_16YY = 2, + // For the above combinations of bitsize and location, an extra bit is + // encoded via comparing the sources. The only possible source of ambiguity + // would be if the sources were the same, but then the branch condition + // would be always true or always false anyways, so we can ignore it. But + // this no longer works when comparing the y component to the x component, + // since it's valid to compare the y component of a source against its own + // x component. Instead, the extra bit is encoded via an extra bitsize. + BR_SIZE_16YX0 = 3, + BR_SIZE_16YX1 = 4, + BR_SIZE_32_AND_16X = 5, + BR_SIZE_32_AND_16Y = 6, + // Used for comparisons with zero and always-true, see below. I think this + // only works for integer comparisons. 
+ BR_SIZE_ZERO = 7, }; struct bifrost_regs { - unsigned fau_idx : 8; - unsigned reg3 : 6; - unsigned reg2 : 6; - unsigned reg0 : 5; - unsigned reg1 : 6; - unsigned ctrl : 4; + unsigned fau_idx : 8; + unsigned reg3 : 6; + unsigned reg2 : 6; + unsigned reg0 : 5; + unsigned reg1 : 6; + unsigned ctrl : 4; } __attribute__((packed)); -#define BIFROST_FMTC_CONSTANTS 0b0011 -#define BIFROST_FMTC_FINAL 0b0111 +#define BIFROST_FMTC_CONSTANTS 0b0011 +#define BIFROST_FMTC_FINAL 0b0111 struct bifrost_fmt_constant { - unsigned pos : 4; - unsigned tag : 4; - uint64_t imm_1 : 60; - uint64_t imm_2 : 60; + unsigned pos : 4; + unsigned tag : 4; + uint64_t imm_1 : 60; + uint64_t imm_2 : 60; } __attribute__((packed)); /* Clause formats, encoded in a table */ enum bi_clause_subword { - /* Literal 3-bit values */ - BI_CLAUSE_SUBWORD_LITERAL_0 = 0, - /* etc */ - BI_CLAUSE_SUBWORD_LITERAL_7 = 7, + /* Literal 3-bit values */ + BI_CLAUSE_SUBWORD_LITERAL_0 = 0, + /* etc */ + BI_CLAUSE_SUBWORD_LITERAL_7 = 7, - /* The value of the corresponding tuple in the corresponding bits */ - BI_CLAUSE_SUBWORD_TUPLE_0 = 8, - /* etc */ - BI_CLAUSE_SUBWORD_TUPLE_7 = 15, + /* The value of the corresponding tuple in the corresponding bits */ + BI_CLAUSE_SUBWORD_TUPLE_0 = 8, + /* etc */ + BI_CLAUSE_SUBWORD_TUPLE_7 = 15, - /* Clause header */ - BI_CLAUSE_SUBWORD_HEADER = 16, + /* Clause header */ + BI_CLAUSE_SUBWORD_HEADER = 16, - /* Leave zero, but semantically distinct from literal 0 */ - BI_CLAUSE_SUBWORD_RESERVED = 17, + /* Leave zero, but semantically distinct from literal 0 */ + BI_CLAUSE_SUBWORD_RESERVED = 17, - /* Embedded constant 0 */ - BI_CLAUSE_SUBWORD_CONSTANT = 18, + /* Embedded constant 0 */ + BI_CLAUSE_SUBWORD_CONSTANT = 18, - /* M bits controlling modifier for the constant */ - BI_CLAUSE_SUBWORD_M = 19, + /* M bits controlling modifier for the constant */ + BI_CLAUSE_SUBWORD_M = 19, - /* Z bit: 1 to begin encoding constants, 0 to terminate the clause */ - BI_CLAUSE_SUBWORD_Z = 20, + /* Z bit: 1 to begin encoding constants, 0 to terminate the clause */ + BI_CLAUSE_SUBWORD_Z = 20, - /* Upper 3-bits of a given tuple and zero extended */ - BI_CLAUSE_SUBWORD_UPPER_0 = 32, - /* etc */ - BI_CLAUSE_SUBWORD_UPPER_7 = BI_CLAUSE_SUBWORD_UPPER_0 + 7, + /* Upper 3-bits of a given tuple and zero extended */ + BI_CLAUSE_SUBWORD_UPPER_0 = 32, + /* etc */ + BI_CLAUSE_SUBWORD_UPPER_7 = BI_CLAUSE_SUBWORD_UPPER_0 + 7, - /* Upper 3-bits of two tuples, concatenated and zero-extended */ - BI_CLAUSE_SUBWORD_UPPER_23 = BI_CLAUSE_SUBWORD_UPPER_0 + 23, - BI_CLAUSE_SUBWORD_UPPER_56 = BI_CLAUSE_SUBWORD_UPPER_0 + 56, + /* Upper 3-bits of two tuples, concatenated and zero-extended */ + BI_CLAUSE_SUBWORD_UPPER_23 = BI_CLAUSE_SUBWORD_UPPER_0 + 23, + BI_CLAUSE_SUBWORD_UPPER_56 = BI_CLAUSE_SUBWORD_UPPER_0 + 56, }; #define L(x) ((enum bi_clause_subword)(BI_CLAUSE_SUBWORD_LITERAL_0 + x)) @@ -290,15 +290,15 @@ enum bi_clause_subword { #define R BI_CLAUSE_SUBWORD_RESERVED struct bi_clause_format { - unsigned format; /* format number */ - unsigned pos; /* index in the clause */ - enum bi_clause_subword tag_1; /* 2-bits */ - enum bi_clause_subword tag_2; /* 3-bits */ - enum bi_clause_subword tag_3; /* 3-bits */ - enum bi_clause_subword s0_s3; /* 60 bits */ - enum bi_clause_subword s4; /* 15 bits */ - enum bi_clause_subword s5_s6; /* 30 bits */ - enum bi_clause_subword s7; /* 15 bits */ + unsigned format; /* format number */ + unsigned pos; /* index in the clause */ + enum bi_clause_subword tag_1; /* 2-bits */ + enum bi_clause_subword tag_2; /* 3-bits */ + enum 
bi_clause_subword tag_3; /* 3-bits */ + enum bi_clause_subword s0_s3; /* 60 bits */ + enum bi_clause_subword s4; /* 15 bits */ + enum bi_clause_subword s5_s6; /* 30 bits */ + enum bi_clause_subword s7; /* 15 bits */ }; /* clang-format off */ @@ -341,46 +341,46 @@ static const struct bi_clause_format bi_clause_formats[] = { * set (and ignored) as a placeholder to differentiate from reserved. */ enum bifrost_reg_mode { - BIFROST_R_WL_FMA = 1, - BIFROST_R_WH_FMA = 2, - BIFROST_R_W_FMA = 3, - BIFROST_R_WL_ADD = 4, - BIFROST_R_WH_ADD = 5, - BIFROST_R_W_ADD = 6, - BIFROST_WL_WL_ADD = 7, - BIFROST_WL_WH_ADD = 8, - BIFROST_WL_W_ADD = 9, - BIFROST_WH_WL_ADD = 10, - BIFROST_WH_WH_ADD = 11, - BIFROST_WH_W_ADD = 12, - BIFROST_W_WL_ADD = 13, - BIFROST_W_WH_ADD = 14, - BIFROST_W_W_ADD = 15, - BIFROST_IDLE_1 = 16, - BIFROST_I_W_FMA = 17, - BIFROST_I_WL_FMA = 18, - BIFROST_I_WH_FMA = 19, - BIFROST_R_I = 20, - BIFROST_I_W_ADD = 21, - BIFROST_I_WL_ADD = 22, - BIFROST_I_WH_ADD = 23, - BIFROST_WL_WH_MIX = 24, - BIFROST_WH_WL_MIX = 26, - BIFROST_IDLE = 27, + BIFROST_R_WL_FMA = 1, + BIFROST_R_WH_FMA = 2, + BIFROST_R_W_FMA = 3, + BIFROST_R_WL_ADD = 4, + BIFROST_R_WH_ADD = 5, + BIFROST_R_W_ADD = 6, + BIFROST_WL_WL_ADD = 7, + BIFROST_WL_WH_ADD = 8, + BIFROST_WL_W_ADD = 9, + BIFROST_WH_WL_ADD = 10, + BIFROST_WH_WH_ADD = 11, + BIFROST_WH_W_ADD = 12, + BIFROST_W_WL_ADD = 13, + BIFROST_W_WH_ADD = 14, + BIFROST_W_W_ADD = 15, + BIFROST_IDLE_1 = 16, + BIFROST_I_W_FMA = 17, + BIFROST_I_WL_FMA = 18, + BIFROST_I_WH_FMA = 19, + BIFROST_R_I = 20, + BIFROST_I_W_ADD = 21, + BIFROST_I_WL_ADD = 22, + BIFROST_I_WH_ADD = 23, + BIFROST_WL_WH_MIX = 24, + BIFROST_WH_WL_MIX = 26, + BIFROST_IDLE = 27, }; enum bifrost_reg_op { - BIFROST_OP_IDLE = 0, - BIFROST_OP_READ = 1, - BIFROST_OP_WRITE = 2, - BIFROST_OP_WRITE_LO = 3, - BIFROST_OP_WRITE_HI = 4, + BIFROST_OP_IDLE = 0, + BIFROST_OP_READ = 1, + BIFROST_OP_WRITE = 2, + BIFROST_OP_WRITE_LO = 3, + BIFROST_OP_WRITE_HI = 4, }; struct bifrost_reg_ctrl_23 { - enum bifrost_reg_op slot2; - enum bifrost_reg_op slot3; - bool slot3_fma; + enum bifrost_reg_op slot2; + enum bifrost_reg_op slot3; + bool slot3_fma; }; /* clang-format off */ @@ -420,201 +420,201 @@ static const struct bifrost_reg_ctrl_23 bifrost_reg_ctrl_lut[32] = { * compiler and stored as a constant */ enum bifrost_texture_operation_mode { - /* Dual texturing */ - BIFROST_TEXTURE_OPERATION_DUAL = 1, + /* Dual texturing */ + BIFROST_TEXTURE_OPERATION_DUAL = 1, - /* Single texturing */ - BIFROST_TEXTURE_OPERATION_SINGLE = 3, + /* Single texturing */ + BIFROST_TEXTURE_OPERATION_SINGLE = 3, }; enum bifrost_index { - /* Both texture/sampler index immediate */ - BIFROST_INDEX_IMMEDIATE_SHARED = 0, + /* Both texture/sampler index immediate */ + BIFROST_INDEX_IMMEDIATE_SHARED = 0, - /* Sampler index immediate, texture index from staging */ - BIFROST_INDEX_IMMEDIATE_SAMPLER = 1, + /* Sampler index immediate, texture index from staging */ + BIFROST_INDEX_IMMEDIATE_SAMPLER = 1, - /* Texture index immediate, sampler index from staging */ - BIFROST_INDEX_IMMEDIATE_TEXTURE = 2, + /* Texture index immediate, sampler index from staging */ + BIFROST_INDEX_IMMEDIATE_TEXTURE = 2, - /* Both indices from (separate) staging registers */ - BIFROST_INDEX_REGISTER = 3, + /* Both indices from (separate) staging registers */ + BIFROST_INDEX_REGISTER = 3, }; enum bifrost_tex_op { - /* Given explicit derivatives, compute a gradient descriptor */ - BIFROST_TEX_OP_GRDESC_DER = 4, + /* Given explicit derivatives, compute a gradient descriptor */ + 
BIFROST_TEX_OP_GRDESC_DER = 4, - /* Given implicit derivatives (texture coordinates in a fragment - * shader), compute a gradient descriptor */ - BIFROST_TEX_OP_GRDESC = 5, + /* Given implicit derivatives (texture coordinates in a fragment + * shader), compute a gradient descriptor */ + BIFROST_TEX_OP_GRDESC = 5, - /* Fetch a texel. Takes a staging register with LOD level / face index - * packed 16:16 */ - BIFROST_TEX_OP_FETCH = 6, + /* Fetch a texel. Takes a staging register with LOD level / face index + * packed 16:16 */ + BIFROST_TEX_OP_FETCH = 6, - /* Filtered texture */ - BIFROST_TEX_OP_TEX = 7, + /* Filtered texture */ + BIFROST_TEX_OP_TEX = 7, }; enum bifrost_lod_mode { - /* Takes two staging registers forming a 64-bit gradient descriptor - * (computed by a previous GRDESC or GRDESC_DER operation) */ - BIFROST_LOD_MODE_GRDESC = 3, + /* Takes two staging registers forming a 64-bit gradient descriptor + * (computed by a previous GRDESC or GRDESC_DER operation) */ + BIFROST_LOD_MODE_GRDESC = 3, - /* Take a staging register with 8:8 fixed-point in bottom 16-bits - * specifying an explicit LOD */ - BIFROST_LOD_MODE_EXPLICIT = 4, + /* Take a staging register with 8:8 fixed-point in bottom 16-bits + * specifying an explicit LOD */ + BIFROST_LOD_MODE_EXPLICIT = 4, - /* Takes a staging register with bottom 16-bits as 8:8 fixed-point LOD - * bias and top 16-bit as 8:8 fixed-point lower bound (generally left - * zero), added and clamped to a computed LOD */ - BIFROST_LOD_MODE_BIAS = 5, + /* Takes a staging register with bottom 16-bits as 8:8 fixed-point LOD + * bias and top 16-bit as 8:8 fixed-point lower bound (generally left + * zero), added and clamped to a computed LOD */ + BIFROST_LOD_MODE_BIAS = 5, - /* Set LOD to zero */ - BIFROST_LOD_MODE_ZERO = 6, + /* Set LOD to zero */ + BIFROST_LOD_MODE_ZERO = 6, - /* Compute LOD */ - BIFROST_LOD_MODE_COMPUTE = 7, + /* Compute LOD */ + BIFROST_LOD_MODE_COMPUTE = 7, }; enum bifrost_texture_format { - /* 16-bit floating point, with optional clamping */ - BIFROST_TEXTURE_FORMAT_F16 = 0, - BIFROST_TEXTURE_FORMAT_F16_POS = 1, - BIFROST_TEXTURE_FORMAT_F16_PM1 = 2, - BIFROST_TEXTURE_FORMAT_F16_1 = 3, + /* 16-bit floating point, with optional clamping */ + BIFROST_TEXTURE_FORMAT_F16 = 0, + BIFROST_TEXTURE_FORMAT_F16_POS = 1, + BIFROST_TEXTURE_FORMAT_F16_PM1 = 2, + BIFROST_TEXTURE_FORMAT_F16_1 = 3, - /* 32-bit floating point, with optional clamping */ - BIFROST_TEXTURE_FORMAT_F32 = 4, - BIFROST_TEXTURE_FORMAT_F32_POS = 5, - BIFROST_TEXTURE_FORMAT_F32_PM1 = 6, - BIFROST_TEXTURE_FORMAT_F32_1 = 7, + /* 32-bit floating point, with optional clamping */ + BIFROST_TEXTURE_FORMAT_F32 = 4, + BIFROST_TEXTURE_FORMAT_F32_POS = 5, + BIFROST_TEXTURE_FORMAT_F32_PM1 = 6, + BIFROST_TEXTURE_FORMAT_F32_1 = 7, }; enum bifrost_texture_format_full { - /* Transclude bifrost_texture_format from above */ + /* Transclude bifrost_texture_format from above */ - /* Integers, unclamped */ - BIFROST_TEXTURE_FORMAT_U16 = 12, - BIFROST_TEXTURE_FORMAT_S16 = 13, - BIFROST_TEXTURE_FORMAT_U32 = 14, - BIFROST_TEXTURE_FORMAT_S32 = 15, + /* Integers, unclamped */ + BIFROST_TEXTURE_FORMAT_U16 = 12, + BIFROST_TEXTURE_FORMAT_S16 = 13, + BIFROST_TEXTURE_FORMAT_U32 = 14, + BIFROST_TEXTURE_FORMAT_S32 = 15, }; enum bifrost_texture_fetch { - /* Default texelFetch */ - BIFROST_TEXTURE_FETCH_TEXEL = 1, + /* Default texelFetch */ + BIFROST_TEXTURE_FETCH_TEXEL = 1, - /* Deprecated, fetches 4x U32 of a U8 x 4 texture. Do not use. 
*/ - BIFROST_TEXTURE_FETCH_GATHER4_RGBA = 3, + /* Deprecated, fetches 4x U32 of a U8 x 4 texture. Do not use. */ + BIFROST_TEXTURE_FETCH_GATHER4_RGBA = 3, - /* Gathers */ - BIFROST_TEXTURE_FETCH_GATHER4_R = 4, - BIFROST_TEXTURE_FETCH_GATHER4_G = 5, - BIFROST_TEXTURE_FETCH_GATHER4_B = 6, - BIFROST_TEXTURE_FETCH_GATHER4_A = 7 + /* Gathers */ + BIFROST_TEXTURE_FETCH_GATHER4_R = 4, + BIFROST_TEXTURE_FETCH_GATHER4_G = 5, + BIFROST_TEXTURE_FETCH_GATHER4_B = 6, + BIFROST_TEXTURE_FETCH_GATHER4_A = 7 }; struct bifrost_texture_operation { - /* If immediate_indices is set: - * - immediate sampler index - * - index used as texture index - * Otherwise: - * - bifrost_single_index in lower 2 bits - * - 0x3 in upper 2 bits (single-texturing) - */ - unsigned sampler_index_or_mode : 4; - unsigned index : 7; - bool immediate_indices : 1; - enum bifrost_tex_op op : 3; + /* If immediate_indices is set: + * - immediate sampler index + * - index used as texture index + * Otherwise: + * - bifrost_single_index in lower 2 bits + * - 0x3 in upper 2 bits (single-texturing) + */ + unsigned sampler_index_or_mode : 4; + unsigned index : 7; + bool immediate_indices : 1; + enum bifrost_tex_op op : 3; - /* If set for TEX/FETCH, loads texel offsets and multisample index from - * a staging register containing offset_x:offset_y:offset_z:ms_index - * packed 8:8:8:8. Offsets must be in [-31, +31]. If set for - * GRDESC(_DER), disable LOD bias. */ - bool offset_or_bias_disable : 1; + /* If set for TEX/FETCH, loads texel offsets and multisample index from + * a staging register containing offset_x:offset_y:offset_z:ms_index + * packed 8:8:8:8. Offsets must be in [-31, +31]. If set for + * GRDESC(_DER), disable LOD bias. */ + bool offset_or_bias_disable : 1; - /* If set for TEX/FETCH, loads fp32 shadow comparison value from a - * staging register. Implies fetch_component = gather4_r. If set for - * GRDESC(_DER), disables LOD clamping. */ - bool shadow_or_clamp_disable : 1; + /* If set for TEX/FETCH, loads fp32 shadow comparison value from a + * staging register. Implies fetch_component = gather4_r. If set for + * GRDESC(_DER), disables LOD clamping. */ + bool shadow_or_clamp_disable : 1; - /* If set, loads an uint32 array index from a staging register. */ - bool array : 1; + /* If set, loads an uint32 array index from a staging register. 
*/ + bool array : 1; - /* Texture dimension, or 0 for a cubemap */ - unsigned dimension : 2; + /* Texture dimension, or 0 for a cubemap */ + unsigned dimension : 2; - /* Method to compute LOD value or for a FETCH, the - * bifrost_texture_fetch component specification */ - enum bifrost_lod_mode lod_or_fetch : 3; + /* Method to compute LOD value or for a FETCH, the + * bifrost_texture_fetch component specification */ + enum bifrost_lod_mode lod_or_fetch : 3; - /* Reserved */ - unsigned zero : 1; + /* Reserved */ + unsigned zero : 1; - /* Register format for the result */ - enum bifrost_texture_format_full format : 4; + /* Register format for the result */ + enum bifrost_texture_format_full format : 4; - /* Write mask for the result */ - unsigned mask : 4; + /* Write mask for the result */ + unsigned mask : 4; } __attribute__((packed)); struct bifrost_dual_texture_operation { - unsigned primary_sampler_index : 2; - unsigned mode : 2; /* 0x1 for dual */ - unsigned primary_texture_index : 2; - unsigned secondary_sampler_index : 2; - unsigned secondary_texture_index : 2; + unsigned primary_sampler_index : 2; + unsigned mode : 2; /* 0x1 for dual */ + unsigned primary_texture_index : 2; + unsigned secondary_sampler_index : 2; + unsigned secondary_texture_index : 2; - /* Leave zero for dual texturing */ - unsigned reserved : 1; - unsigned index_mode_zero : 1; + /* Leave zero for dual texturing */ + unsigned reserved : 1; + unsigned index_mode_zero : 1; - /* Base staging register to write the secondary results to */ - unsigned secondary_register : 6; + /* Base staging register to write the secondary results to */ + unsigned secondary_register : 6; - /* Format/mask for each texture */ - enum bifrost_texture_format secondary_format : 3; - unsigned secondary_mask : 4; + /* Format/mask for each texture */ + enum bifrost_texture_format secondary_format : 3; + unsigned secondary_mask : 4; - enum bifrost_texture_format primary_format : 3; - unsigned primary_mask : 4; + enum bifrost_texture_format primary_format : 3; + unsigned primary_mask : 4; } __attribute__((packed)); static inline uint32_t bi_dual_tex_as_u32(struct bifrost_dual_texture_operation desc) { - uint32_t desc_u; - memcpy(&desc_u, &desc, sizeof(desc)); + uint32_t desc_u; + memcpy(&desc_u, &desc, sizeof(desc)); - return desc_u; + return desc_u; } -#define BIFROST_MEGA_SAMPLE 128 -#define BIFROST_ALL_SAMPLES 255 +#define BIFROST_MEGA_SAMPLE 128 +#define BIFROST_ALL_SAMPLES 255 #define BIFROST_CURRENT_PIXEL 255 struct bifrost_pixel_indices { - unsigned sample : 8; - unsigned rt : 8; - unsigned x : 8; - unsigned y : 8; + unsigned sample : 8; + unsigned rt : 8; + unsigned x : 8; + unsigned y : 8; } __attribute__((packed)); enum bi_constmod { - BI_CONSTMOD_NONE, - BI_CONSTMOD_PC_LO, - BI_CONSTMOD_PC_HI, - BI_CONSTMOD_PC_LO_HI + BI_CONSTMOD_NONE, + BI_CONSTMOD_PC_LO, + BI_CONSTMOD_PC_HI, + BI_CONSTMOD_PC_LO_HI }; struct bi_constants { - /* Raw constant values */ - uint64_t raw[6]; + /* Raw constant values */ + uint64_t raw[6]; - /* Associated modifier derived from M values */ - enum bi_constmod mods[6]; + /* Associated modifier derived from M values */ + enum bi_constmod mods[6]; }; /* FAU selectors for constants are out-of-order, construct the top bits @@ -623,12 +623,10 @@ struct bi_constants { static inline unsigned bi_constant_field(unsigned idx) { - const unsigned values[] = { - 4, 5, 6, 7, 2, 3 - }; + const unsigned values[] = {4, 5, 6, 7, 2, 3}; - assert(idx <= 5); - return values[idx] << 4; + assert(idx <= 5); + return values[idx] << 4; } 
#ifdef __cplusplus diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c index 50f0cd37699..1aafda25483 100644 --- a/src/panfrost/bifrost/bifrost_compile.c +++ b/src/panfrost/bifrost/bifrost_compile.c @@ -26,20 +26,19 @@ */ #include "compiler/glsl/glsl_to_nir.h" -#include "compiler/nir_types.h" #include "compiler/nir/nir_builder.h" #include "compiler/nir/nir_schedule.h" +#include "compiler/nir_types.h" #include "util/u_debug.h" -#include "disassemble.h" -#include "valhall/va_compiler.h" #include "valhall/disassemble.h" -#include "bifrost_compile.h" -#include "compiler.h" #include "valhall/va_compiler.h" -#include "bi_quirks.h" #include "bi_builder.h" +#include "bi_quirks.h" +#include "bifrost_compile.h" #include "bifrost_nir.h" +#include "compiler.h" +#include "disassemble.h" /* clang-format off */ static const struct debug_named_value bifrost_debug_options[] = { @@ -61,7 +60,8 @@ static const struct debug_named_value bifrost_debug_options[] = { }; /* clang-format on */ -DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG", bifrost_debug_options, 0) +DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG", + bifrost_debug_options, 0) /* How many bytes are prefetched by the Bifrost shader core. From the final * clause of the shader, this range must be valid instructions or zero. */ @@ -69,35 +69,36 @@ DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG", bifrost_debug_o int bifrost_debug = 0; -#define DBG(fmt, ...) \ - do { if (bifrost_debug & BIFROST_DBG_MSGS) \ - fprintf(stderr, "%s:%d: "fmt, \ - __func__, __LINE__, ##__VA_ARGS__); } while (0) +#define DBG(fmt, ...) \ + do { \ + if (bifrost_debug & BIFROST_DBG_MSGS) \ + fprintf(stderr, "%s:%d: " fmt, __func__, __LINE__, ##__VA_ARGS__); \ + } while (0) static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list); static bi_index bi_preload(bi_builder *b, unsigned reg) { - if (bi_is_null(b->shader->preloaded[reg])) { - /* Insert at the beginning of the shader */ - bi_builder b_ = *b; - b_.cursor = bi_before_block(bi_start_block(&b->shader->blocks)); + if (bi_is_null(b->shader->preloaded[reg])) { + /* Insert at the beginning of the shader */ + bi_builder b_ = *b; + b_.cursor = bi_before_block(bi_start_block(&b->shader->blocks)); - /* Cache the result */ - b->shader->preloaded[reg] = bi_mov_i32(&b_, bi_register(reg)); - } + /* Cache the result */ + b->shader->preloaded[reg] = bi_mov_i32(&b_, bi_register(reg)); + } - return b->shader->preloaded[reg]; + return b->shader->preloaded[reg]; } static bi_index bi_coverage(bi_builder *b) { - if (bi_is_null(b->shader->coverage)) - b->shader->coverage = bi_preload(b, 60); + if (bi_is_null(b->shader->coverage)) + b->shader->coverage = bi_preload(b, 60); - return b->shader->coverage; + return b->shader->coverage; } /* @@ -108,44 +109,44 @@ bi_coverage(bi_builder *b) static inline bi_index bi_vertex_id(bi_builder *b) { - return bi_preload(b, (b->shader->arch >= 9) ? 60 : 61); + return bi_preload(b, (b->shader->arch >= 9) ? 60 : 61); } static inline bi_index bi_instance_id(bi_builder *b) { - return bi_preload(b, (b->shader->arch >= 9) ? 61 : 62); + return bi_preload(b, (b->shader->arch >= 9) ? 
61 : 62); } static void bi_emit_jump(bi_builder *b, nir_jump_instr *instr) { - bi_instr *branch = bi_jump(b, bi_zero()); + bi_instr *branch = bi_jump(b, bi_zero()); - switch (instr->type) { - case nir_jump_break: - branch->branch_target = b->shader->break_block; - break; - case nir_jump_continue: - branch->branch_target = b->shader->continue_block; - break; - default: - unreachable("Unhandled jump type"); - } + switch (instr->type) { + case nir_jump_break: + branch->branch_target = b->shader->break_block; + break; + case nir_jump_continue: + branch->branch_target = b->shader->continue_block; + break; + default: + unreachable("Unhandled jump type"); + } - bi_block_add_successor(b->shader->current_block, branch->branch_target); - b->shader->current_block->unconditional_jumps = true; + bi_block_add_successor(b->shader->current_block, branch->branch_target); + b->shader->current_block->unconditional_jumps = true; } /* Builds a 64-bit hash table key for an index */ static uint64_t bi_index_to_key(bi_index idx) { - static_assert(sizeof(idx) <= sizeof(uint64_t), "too much padding"); + static_assert(sizeof(idx) <= sizeof(uint64_t), "too much padding"); - uint64_t key = 0; - memcpy(&key, &idx, sizeof(idx)); - return key; + uint64_t key = 0; + memcpy(&key, &idx, sizeof(idx)); + return key; } /* @@ -156,32 +157,31 @@ bi_index_to_key(bi_index idx) static bi_index bi_extract(bi_builder *b, bi_index vec, unsigned channel) { - bi_index *components = - _mesa_hash_table_u64_search(b->shader->allocated_vec, - bi_index_to_key(vec)); + bi_index *components = _mesa_hash_table_u64_search(b->shader->allocated_vec, + bi_index_to_key(vec)); - /* No extract needed for scalars. - * - * This is a bit imprecise, but actual bugs (missing splits for vectors) - * should be caught by the following assertion. It is too difficult to - * ensure bi_extract is only called for real vectors. - */ - if (components == NULL && channel == 0) - return vec; + /* No extract needed for scalars. + * + * This is a bit imprecise, but actual bugs (missing splits for vectors) + * should be caught by the following assertion. It is too difficult to + * ensure bi_extract is only called for real vectors. 
+ */ + if (components == NULL && channel == 0) + return vec; - assert(components != NULL && "missing bi_cache_collect()"); - return components[channel]; + assert(components != NULL && "missing bi_cache_collect()"); + return components[channel]; } static void bi_cache_collect(bi_builder *b, bi_index dst, bi_index *s, unsigned n) { - /* Lifetime of a hash table entry has to be at least as long as the table */ - bi_index *channels = ralloc_array(b->shader, bi_index, n); - memcpy(channels, s, sizeof(bi_index) * n); + /* Lifetime of a hash table entry has to be at least as long as the table */ + bi_index *channels = ralloc_array(b->shader, bi_index, n); + memcpy(channels, s, sizeof(bi_index) * n); - _mesa_hash_table_u64_insert(b->shader->allocated_vec, - bi_index_to_key(dst), channels); + _mesa_hash_table_u64_insert(b->shader->allocated_vec, bi_index_to_key(dst), + channels); } /* @@ -193,28 +193,28 @@ bi_cache_collect(bi_builder *b, bi_index dst, bi_index *s, unsigned n) static void bi_emit_split_i32(bi_builder *b, bi_index dests[4], bi_index vec, unsigned n) { - /* Setup the destinations */ - for (unsigned i = 0; i < n; ++i) { - dests[i] = bi_temp(b->shader); - } + /* Setup the destinations */ + for (unsigned i = 0; i < n; ++i) { + dests[i] = bi_temp(b->shader); + } - /* Emit the split */ - if (n == 1) { - bi_mov_i32_to(b, dests[0], vec); - } else { - bi_instr *I = bi_split_i32_to(b, n, vec); + /* Emit the split */ + if (n == 1) { + bi_mov_i32_to(b, dests[0], vec); + } else { + bi_instr *I = bi_split_i32_to(b, n, vec); - bi_foreach_dest(I, j) - I->dest[j] = dests[j]; - } + bi_foreach_dest(I, j) + I->dest[j] = dests[j]; + } } static void bi_emit_cached_split_i32(bi_builder *b, bi_index vec, unsigned n) { - bi_index dests[4] = { bi_null(), bi_null(), bi_null(), bi_null() }; - bi_emit_split_i32(b, dests, vec, n); - bi_cache_collect(b, vec, dests, n); + bi_index dests[4] = {bi_null(), bi_null(), bi_null(), bi_null()}; + bi_emit_split_i32(b, dests, vec, n); + bi_cache_collect(b, vec, dests, n); } /* @@ -224,161 +224,164 @@ bi_emit_cached_split_i32(bi_builder *b, bi_index vec, unsigned n) static void bi_emit_cached_split(bi_builder *b, bi_index vec, unsigned bits) { - bi_emit_cached_split_i32(b, vec, DIV_ROUND_UP(bits, 32)); + bi_emit_cached_split_i32(b, vec, DIV_ROUND_UP(bits, 32)); } static void bi_split_dest(bi_builder *b, nir_dest dest) { - bi_emit_cached_split(b, bi_dest_index(&dest), - nir_dest_bit_size(dest) * - nir_dest_num_components(dest)); + bi_emit_cached_split( + b, bi_dest_index(&dest), + nir_dest_bit_size(dest) * nir_dest_num_components(dest)); } static bi_instr * bi_emit_collect_to(bi_builder *b, bi_index dst, bi_index *chan, unsigned n) { - /* Special case: COLLECT of a single value is a scalar move */ - if (n == 1) - return bi_mov_i32_to(b, dst, chan[0]); + /* Special case: COLLECT of a single value is a scalar move */ + if (n == 1) + return bi_mov_i32_to(b, dst, chan[0]); - bi_instr *I = bi_collect_i32_to(b, dst, n); + bi_instr *I = bi_collect_i32_to(b, dst, n); - bi_foreach_src(I, i) - I->src[i] = chan[i]; + bi_foreach_src(I, i) + I->src[i] = chan[i]; - bi_cache_collect(b, dst, chan, n); - return I; + bi_cache_collect(b, dst, chan, n); + return I; } static bi_instr * bi_collect_v2i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1) { - return bi_emit_collect_to(b, dst, (bi_index[]) { s0, s1 }, 2); + return bi_emit_collect_to(b, dst, (bi_index[]){s0, s1}, 2); } static bi_instr * -bi_collect_v3i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1, bi_index s2) 
+bi_collect_v3i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1, + bi_index s2) { - return bi_emit_collect_to(b, dst, (bi_index[]) { s0, s1, s2 }, 3); + return bi_emit_collect_to(b, dst, (bi_index[]){s0, s1, s2}, 3); } static bi_index bi_collect_v2i32(bi_builder *b, bi_index s0, bi_index s1) { - bi_index dst = bi_temp(b->shader); - bi_collect_v2i32_to(b, dst, s0, s1); - return dst; + bi_index dst = bi_temp(b->shader); + bi_collect_v2i32_to(b, dst, s0, s1); + return dst; } static bi_index bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr) { - switch (intr->intrinsic) { - case nir_intrinsic_load_barycentric_centroid: - case nir_intrinsic_load_barycentric_sample: - return bi_preload(b, 61); + switch (intr->intrinsic) { + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_sample: + return bi_preload(b, 61); - /* Need to put the sample ID in the top 16-bits */ - case nir_intrinsic_load_barycentric_at_sample: - return bi_mkvec_v2i16(b, bi_half(bi_dontcare(b), false), - bi_half(bi_src_index(&intr->src[0]), false)); + /* Need to put the sample ID in the top 16-bits */ + case nir_intrinsic_load_barycentric_at_sample: + return bi_mkvec_v2i16(b, bi_half(bi_dontcare(b), false), + bi_half(bi_src_index(&intr->src[0]), false)); - /* Interpret as 8:8 signed fixed point positions in pixels along X and - * Y axes respectively, relative to top-left of pixel. In NIR, (0, 0) - * is the center of the pixel so we first fixup and then convert. For - * fp16 input: - * - * f2i16(((x, y) + (0.5, 0.5)) * 2**8) = - * f2i16((256 * (x, y)) + (128, 128)) = - * V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128)) - * - * For fp32 input, that lacks enough precision for MSAA 16x, but the - * idea is the same. FIXME: still doesn't pass - */ - case nir_intrinsic_load_barycentric_at_offset: { - bi_index offset = bi_src_index(&intr->src[0]); - bi_index f16 = bi_null(); - unsigned sz = nir_src_bit_size(intr->src[0]); + /* Interpret as 8:8 signed fixed point positions in pixels along X and + * Y axes respectively, relative to top-left of pixel. In NIR, (0, 0) + * is the center of the pixel so we first fixup and then convert. For + * fp16 input: + * + * f2i16(((x, y) + (0.5, 0.5)) * 2**8) = + * f2i16((256 * (x, y)) + (128, 128)) = + * V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128)) + * + * For fp32 input, that lacks enough precision for MSAA 16x, but the + * idea is the same. FIXME: still doesn't pass + */ + case nir_intrinsic_load_barycentric_at_offset: { + bi_index offset = bi_src_index(&intr->src[0]); + bi_index f16 = bi_null(); + unsigned sz = nir_src_bit_size(intr->src[0]); - if (sz == 16) { - f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0), - bi_imm_f16(128.0)); - } else { - assert(sz == 32); - bi_index f[2]; - for (unsigned i = 0; i < 2; ++i) { - f[i] = bi_fadd_rscale_f32(b, - bi_extract(b, offset, i), - bi_imm_f32(0.5), bi_imm_u32(8), - BI_SPECIAL_NONE); - } + if (sz == 16) { + f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0), bi_imm_f16(128.0)); + } else { + assert(sz == 32); + bi_index f[2]; + for (unsigned i = 0; i < 2; ++i) { + f[i] = + bi_fadd_rscale_f32(b, bi_extract(b, offset, i), bi_imm_f32(0.5), + bi_imm_u32(8), BI_SPECIAL_NONE); + } - f16 = bi_v2f32_to_v2f16(b, f[0], f[1]); - } + f16 = bi_v2f32_to_v2f16(b, f[0], f[1]); + } - return bi_v2f16_to_v2s16(b, f16); - } + return bi_v2f16_to_v2s16(b, f16); + } - case nir_intrinsic_load_barycentric_pixel: - default: - return b->shader->arch >= 9 ? 
bi_preload(b, 61) : bi_dontcare(b); - } + case nir_intrinsic_load_barycentric_pixel: + default: + return b->shader->arch >= 9 ? bi_preload(b, 61) : bi_dontcare(b); + } } static enum bi_sample bi_interp_for_intrinsic(nir_intrinsic_op op) { - switch (op) { - case nir_intrinsic_load_barycentric_centroid: - return BI_SAMPLE_CENTROID; - case nir_intrinsic_load_barycentric_sample: - case nir_intrinsic_load_barycentric_at_sample: - return BI_SAMPLE_SAMPLE; - case nir_intrinsic_load_barycentric_at_offset: - return BI_SAMPLE_EXPLICIT; - case nir_intrinsic_load_barycentric_pixel: - default: - return BI_SAMPLE_CENTER; - } + switch (op) { + case nir_intrinsic_load_barycentric_centroid: + return BI_SAMPLE_CENTROID; + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_at_sample: + return BI_SAMPLE_SAMPLE; + case nir_intrinsic_load_barycentric_at_offset: + return BI_SAMPLE_EXPLICIT; + case nir_intrinsic_load_barycentric_pixel: + default: + return BI_SAMPLE_CENTER; + } } /* auto, 64-bit omitted */ static enum bi_register_format bi_reg_fmt_for_nir(nir_alu_type T) { - switch (T) { - case nir_type_float16: return BI_REGISTER_FORMAT_F16; - case nir_type_float32: return BI_REGISTER_FORMAT_F32; - case nir_type_int16: return BI_REGISTER_FORMAT_S16; - case nir_type_uint16: return BI_REGISTER_FORMAT_U16; - case nir_type_int32: return BI_REGISTER_FORMAT_S32; - case nir_type_uint32: return BI_REGISTER_FORMAT_U32; - default: unreachable("Invalid type for register format"); - } + switch (T) { + case nir_type_float16: + return BI_REGISTER_FORMAT_F16; + case nir_type_float32: + return BI_REGISTER_FORMAT_F32; + case nir_type_int16: + return BI_REGISTER_FORMAT_S16; + case nir_type_uint16: + return BI_REGISTER_FORMAT_U16; + case nir_type_int32: + return BI_REGISTER_FORMAT_S32; + case nir_type_uint32: + return BI_REGISTER_FORMAT_U32; + default: + unreachable("Invalid type for register format"); + } } /* Checks if the _IMM variant of an intrinsic can be used, returning in imm the * immediate to be used (which applies even if _IMM can't be used) */ static bool -bi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate, unsigned max) +bi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate, + unsigned max) { - nir_src *offset = nir_get_io_offset_src(instr); + nir_src *offset = nir_get_io_offset_src(instr); - if (!nir_src_is_const(*offset)) - return false; + if (!nir_src_is_const(*offset)) + return false; - *immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset); - return (*immediate) < max; + *immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset); + return (*immediate) < max; } -static void -bi_make_vec_to(bi_builder *b, bi_index final_dst, - bi_index *src, - unsigned *channel, - unsigned count, - unsigned bitsize); +static void bi_make_vec_to(bi_builder *b, bi_index final_dst, bi_index *src, + unsigned *channel, unsigned count, unsigned bitsize); /* Bifrost's load instructions lack a component offset despite operating in * terms of vec4 slots. 
Usually I/O vectorization avoids nonzero components, @@ -388,59 +391,59 @@ bi_make_vec_to(bi_builder *b, bi_index final_dst, static void bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp) { - unsigned component = nir_intrinsic_component(instr); - unsigned nr = instr->num_components; - unsigned total = nr + component; - unsigned bitsize = nir_dest_bit_size(instr->dest); + unsigned component = nir_intrinsic_component(instr); + unsigned nr = instr->num_components; + unsigned total = nr + component; + unsigned bitsize = nir_dest_bit_size(instr->dest); - assert(total <= 4 && "should be vec4"); - bi_emit_cached_split(b, tmp, total * bitsize); + assert(total <= 4 && "should be vec4"); + bi_emit_cached_split(b, tmp, total * bitsize); - if (component == 0) - return; + if (component == 0) + return; - bi_index srcs[] = { tmp, tmp, tmp }; - unsigned channels[] = { component, component + 1, component + 2 }; + bi_index srcs[] = {tmp, tmp, tmp}; + unsigned channels[] = {component, component + 1, component + 2}; - bi_make_vec_to(b, bi_dest_index(&instr->dest), - srcs, channels, nr, nir_dest_bit_size(instr->dest)); + bi_make_vec_to(b, bi_dest_index(&instr->dest), srcs, channels, nr, + nir_dest_bit_size(instr->dest)); } static void bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr) { - nir_alu_type T = nir_intrinsic_dest_type(instr); - enum bi_register_format regfmt = bi_reg_fmt_for_nir(T); - nir_src *offset = nir_get_io_offset_src(instr); - unsigned component = nir_intrinsic_component(instr); - enum bi_vecsize vecsize = (instr->num_components + component - 1); - unsigned imm_index = 0; - unsigned base = nir_intrinsic_base(instr); - bool constant = nir_src_is_const(*offset); - bool immediate = bi_is_intr_immediate(instr, &imm_index, 16); - bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader); - bi_instr *I; + nir_alu_type T = nir_intrinsic_dest_type(instr); + enum bi_register_format regfmt = bi_reg_fmt_for_nir(T); + nir_src *offset = nir_get_io_offset_src(instr); + unsigned component = nir_intrinsic_component(instr); + enum bi_vecsize vecsize = (instr->num_components + component - 1); + unsigned imm_index = 0; + unsigned base = nir_intrinsic_base(instr); + bool constant = nir_src_is_const(*offset); + bool immediate = bi_is_intr_immediate(instr, &imm_index, 16); + bi_index dest = + (component == 0) ? 
bi_dest_index(&instr->dest) : bi_temp(b->shader); + bi_instr *I; - if (immediate) { - I = bi_ld_attr_imm_to(b, dest, bi_vertex_id(b), - bi_instance_id(b), regfmt, vecsize, - imm_index); - } else { - bi_index idx = bi_src_index(&instr->src[0]); + if (immediate) { + I = bi_ld_attr_imm_to(b, dest, bi_vertex_id(b), bi_instance_id(b), regfmt, + vecsize, imm_index); + } else { + bi_index idx = bi_src_index(&instr->src[0]); - if (constant) - idx = bi_imm_u32(imm_index); - else if (base != 0) - idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false); + if (constant) + idx = bi_imm_u32(imm_index); + else if (base != 0) + idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false); - I = bi_ld_attr_to(b, dest, bi_vertex_id(b), bi_instance_id(b), - idx, regfmt, vecsize); - } + I = bi_ld_attr_to(b, dest, bi_vertex_id(b), bi_instance_id(b), idx, + regfmt, vecsize); + } - if (b->shader->arch >= 9) - I->table = PAN_TABLE_ATTRIBUTE; + if (b->shader->arch >= 9) + I->table = PAN_TABLE_ATTRIBUTE; - bi_copy_component(b, instr, dest); + bi_copy_component(b, instr, dest); } /* @@ -452,17 +455,17 @@ bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr) static unsigned bi_varying_base_bytes(bi_context *ctx, nir_intrinsic_instr *intr) { - nir_io_semantics sem = nir_intrinsic_io_semantics(intr); - uint32_t mask = ctx->inputs->fixed_varying_mask; + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + uint32_t mask = ctx->inputs->fixed_varying_mask; - if (sem.location >= VARYING_SLOT_VAR0) { - unsigned nr_special = util_bitcount(mask); - unsigned general_index = (sem.location - VARYING_SLOT_VAR0); + if (sem.location >= VARYING_SLOT_VAR0) { + unsigned nr_special = util_bitcount(mask); + unsigned general_index = (sem.location - VARYING_SLOT_VAR0); - return 16 * (nr_special + general_index); - } else { - return 16 * (util_bitcount(mask & BITFIELD_MASK(sem.location))); - } + return 16 * (nr_special + general_index); + } else { + return 16 * (util_bitcount(mask & BITFIELD_MASK(sem.location))); + } } /* @@ -472,290 +475,275 @@ bi_varying_base_bytes(bi_context *ctx, nir_intrinsic_instr *intr) static unsigned bi_varying_offset(bi_context *ctx, nir_intrinsic_instr *intr) { - nir_src *src = nir_get_io_offset_src(intr); - assert(nir_src_is_const(*src) && "assumes immediate offset"); + nir_src *src = nir_get_io_offset_src(intr); + assert(nir_src_is_const(*src) && "assumes immediate offset"); - return bi_varying_base_bytes(ctx, intr) + (nir_src_as_uint(*src) * 16); + return bi_varying_base_bytes(ctx, intr) + (nir_src_as_uint(*src) * 16); } static void bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr) { - enum bi_sample sample = BI_SAMPLE_CENTER; - enum bi_update update = BI_UPDATE_STORE; - enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO; - bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input; - bi_index src0 = bi_null(); + enum bi_sample sample = BI_SAMPLE_CENTER; + enum bi_update update = BI_UPDATE_STORE; + enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO; + bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input; + bi_index src0 = bi_null(); - unsigned component = nir_intrinsic_component(instr); - enum bi_vecsize vecsize = (instr->num_components + component - 1); - bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader); + unsigned component = nir_intrinsic_component(instr); + enum bi_vecsize vecsize = (instr->num_components + component - 1); + bi_index dest = + (component == 0) ? 
bi_dest_index(&instr->dest) : bi_temp(b->shader); - unsigned sz = nir_dest_bit_size(instr->dest); + unsigned sz = nir_dest_bit_size(instr->dest); - if (smooth) { - nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]); - assert(parent); + if (smooth) { + nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]); + assert(parent); - sample = bi_interp_for_intrinsic(parent->intrinsic); - src0 = bi_varying_src0_for_barycentric(b, parent); + sample = bi_interp_for_intrinsic(parent->intrinsic); + src0 = bi_varying_src0_for_barycentric(b, parent); - assert(sz == 16 || sz == 32); - regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16 - : BI_REGISTER_FORMAT_F32; - } else { - assert(sz == 32); - regfmt = BI_REGISTER_FORMAT_U32; + assert(sz == 16 || sz == 32); + regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16 : BI_REGISTER_FORMAT_F32; + } else { + assert(sz == 32); + regfmt = BI_REGISTER_FORMAT_U32; - /* Valhall can't have bi_null() here, although the source is - * logically unused for flat varyings - */ - if (b->shader->arch >= 9) - src0 = bi_preload(b, 61); + /* Valhall can't have bi_null() here, although the source is + * logically unused for flat varyings + */ + if (b->shader->arch >= 9) + src0 = bi_preload(b, 61); - /* Gather info as we go */ - b->shader->info.bifrost->uses_flat_shading = true; - } + /* Gather info as we go */ + b->shader->info.bifrost->uses_flat_shading = true; + } - enum bi_source_format source_format = - smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32; + enum bi_source_format source_format = + smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32; - nir_src *offset = nir_get_io_offset_src(instr); - unsigned imm_index = 0; - bool immediate = bi_is_intr_immediate(instr, &imm_index, 20); - bi_instr *I = NULL; + nir_src *offset = nir_get_io_offset_src(instr); + unsigned imm_index = 0; + bool immediate = bi_is_intr_immediate(instr, &imm_index, 20); + bi_instr *I = NULL; - if (b->shader->malloc_idvs && immediate) { - /* Immediate index given in bytes. */ - bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, - sample, source_format, update, vecsize, - bi_varying_offset(b->shader, instr)); - } else if (immediate && smooth) { - I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update, - vecsize, imm_index); - } else if (immediate && !smooth) { - I = bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt, - vecsize, imm_index); - } else { - bi_index idx = bi_src_index(offset); - unsigned base = nir_intrinsic_base(instr); + if (b->shader->malloc_idvs && immediate) { + /* Immediate index given in bytes. */ + bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format, + update, vecsize, + bi_varying_offset(b->shader, instr)); + } else if (immediate && smooth) { + I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update, vecsize, + imm_index); + } else if (immediate && !smooth) { + I = bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt, vecsize, + imm_index); + } else { + bi_index idx = bi_src_index(offset); + unsigned base = nir_intrinsic_base(instr); - if (b->shader->malloc_idvs) { - /* Index needs to be in bytes, but NIR gives the index - * in slots. For now assume 16 bytes per element. - */ - bi_index idx_bytes = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4)); - unsigned vbase = bi_varying_base_bytes(b->shader, instr); + if (b->shader->malloc_idvs) { + /* Index needs to be in bytes, but NIR gives the index + * in slots. For now assume 16 bytes per element. 
+ */ + bi_index idx_bytes = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4)); + unsigned vbase = bi_varying_base_bytes(b->shader, instr); - if (vbase != 0) - idx_bytes = bi_iadd_u32(b, idx, bi_imm_u32(vbase), false); + if (vbase != 0) + idx_bytes = bi_iadd_u32(b, idx, bi_imm_u32(vbase), false); - bi_ld_var_buf_to(b, sz, dest, src0, idx_bytes, regfmt, - sample, source_format, update, - vecsize); - } else if (smooth) { - if (base != 0) - idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false); + bi_ld_var_buf_to(b, sz, dest, src0, idx_bytes, regfmt, sample, + source_format, update, vecsize); + } else if (smooth) { + if (base != 0) + idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false); - I = bi_ld_var_to(b, dest, src0, idx, regfmt, sample, - update, vecsize); - } else { - if (base != 0) - idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false); + I = bi_ld_var_to(b, dest, src0, idx, regfmt, sample, update, vecsize); + } else { + if (base != 0) + idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false); - I = bi_ld_var_flat_to(b, dest, idx, - BI_FUNCTION_NONE, regfmt, - vecsize); - } - } + I = bi_ld_var_flat_to(b, dest, idx, BI_FUNCTION_NONE, regfmt, vecsize); + } + } - /* Valhall usually uses machine-allocated IDVS. If this is disabled, use - * a simple Midgard-style ABI. - */ - if (b->shader->arch >= 9 && I != NULL) - I->table = PAN_TABLE_ATTRIBUTE; + /* Valhall usually uses machine-allocated IDVS. If this is disabled, use + * a simple Midgard-style ABI. + */ + if (b->shader->arch >= 9 && I != NULL) + I->table = PAN_TABLE_ATTRIBUTE; - bi_copy_component(b, instr, dest); + bi_copy_component(b, instr, dest); } static bi_index -bi_make_vec8_helper(bi_builder *b, bi_index *src, unsigned *channel, unsigned count) +bi_make_vec8_helper(bi_builder *b, bi_index *src, unsigned *channel, + unsigned count) { - assert(1 <= count && count <= 4); + assert(1 <= count && count <= 4); - bi_index bytes[4] = { - bi_imm_u8(0), - bi_imm_u8(0), - bi_imm_u8(0), - bi_imm_u8(0) - }; + bi_index bytes[4] = {bi_imm_u8(0), bi_imm_u8(0), bi_imm_u8(0), bi_imm_u8(0)}; - for (unsigned i = 0; i < count; ++i) { - unsigned chan = channel ? channel[i] : 0; + for (unsigned i = 0; i < count; ++i) { + unsigned chan = channel ? channel[i] : 0; - bytes[i] = bi_byte(bi_extract(b, src[i], chan >> 2), chan & 3); - } + bytes[i] = bi_byte(bi_extract(b, src[i], chan >> 2), chan & 3); + } - if (b->shader->arch >= 9) { - bi_index vec = bi_zero(); + if (b->shader->arch >= 9) { + bi_index vec = bi_zero(); - if (count >= 3) - vec = bi_mkvec_v2i8(b, bytes[2], bytes[3], vec); + if (count >= 3) + vec = bi_mkvec_v2i8(b, bytes[2], bytes[3], vec); - return bi_mkvec_v2i8(b, bytes[0], bytes[1], vec); - } else { - return bi_mkvec_v4i8(b, bytes[0], bytes[1], bytes[2], bytes[3]); - } + return bi_mkvec_v2i8(b, bytes[0], bytes[1], vec); + } else { + return bi_mkvec_v4i8(b, bytes[0], bytes[1], bytes[2], bytes[3]); + } } static bi_index -bi_make_vec16_helper(bi_builder *b, bi_index *src, unsigned *channel, unsigned count) +bi_make_vec16_helper(bi_builder *b, bi_index *src, unsigned *channel, + unsigned count) { - unsigned chan0 = channel ? channel[0] : 0; - bi_index w0 = bi_extract(b, src[0], chan0 >> 1); - bi_index h0 = bi_half(w0, chan0 & 1); + unsigned chan0 = channel ? 
channel[0] : 0; + bi_index w0 = bi_extract(b, src[0], chan0 >> 1); + bi_index h0 = bi_half(w0, chan0 & 1); - /* Zero extend */ - if (count == 1) - return bi_mkvec_v2i16(b, h0, bi_imm_u16(0)); + /* Zero extend */ + if (count == 1) + return bi_mkvec_v2i16(b, h0, bi_imm_u16(0)); - /* Else, create a vector */ - assert(count == 2); + /* Else, create a vector */ + assert(count == 2); - unsigned chan1 = channel ? channel[1] : 0; - bi_index w1 = bi_extract(b, src[1], chan1 >> 1); - bi_index h1 = bi_half(w1, chan1 & 1); + unsigned chan1 = channel ? channel[1] : 0; + bi_index w1 = bi_extract(b, src[1], chan1 >> 1); + bi_index h1 = bi_half(w1, chan1 & 1); - if (bi_is_word_equiv(w0, w1) && (chan0 & 1) == 0 && ((chan1 & 1) == 1)) - return bi_mov_i32(b, w0); - else if (bi_is_word_equiv(w0, w1)) - return bi_swz_v2i16(b, bi_swz_16(w0, chan0 & 1, chan1 & 1)); - else - return bi_mkvec_v2i16(b, h0, h1); + if (bi_is_word_equiv(w0, w1) && (chan0 & 1) == 0 && ((chan1 & 1) == 1)) + return bi_mov_i32(b, w0); + else if (bi_is_word_equiv(w0, w1)) + return bi_swz_v2i16(b, bi_swz_16(w0, chan0 & 1, chan1 & 1)); + else + return bi_mkvec_v2i16(b, h0, h1); } static void -bi_make_vec_to(bi_builder *b, bi_index dst, - bi_index *src, - unsigned *channel, - unsigned count, - unsigned bitsize) +bi_make_vec_to(bi_builder *b, bi_index dst, bi_index *src, unsigned *channel, + unsigned count, unsigned bitsize) { - assert(bitsize == 8 || bitsize == 16 || bitsize == 32); - unsigned shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2; - unsigned chan_per_word = 1 << shift; + assert(bitsize == 8 || bitsize == 16 || bitsize == 32); + unsigned shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2; + unsigned chan_per_word = 1 << shift; - assert(DIV_ROUND_UP(count * bitsize, 32) <= BI_MAX_SRCS && - "unnecessarily large vector should have been lowered"); + assert(DIV_ROUND_UP(count * bitsize, 32) <= BI_MAX_SRCS && + "unnecessarily large vector should have been lowered"); - bi_index srcs[BI_MAX_VEC]; + bi_index srcs[BI_MAX_VEC]; - for (unsigned i = 0; i < count; i += chan_per_word) { - unsigned rem = MIN2(count - i, chan_per_word); - unsigned *channel_offset = channel ? (channel + i) : NULL; + for (unsigned i = 0; i < count; i += chan_per_word) { + unsigned rem = MIN2(count - i, chan_per_word); + unsigned *channel_offset = channel ? (channel + i) : NULL; - if (bitsize == 32) - srcs[i] = bi_extract(b, src[i], channel_offset ? *channel_offset : 0); - else if (bitsize == 16) - srcs[i >> 1] = bi_make_vec16_helper(b, src + i, channel_offset, rem); - else - srcs[i >> 2] = bi_make_vec8_helper(b, src + i, channel_offset, rem); - } + if (bitsize == 32) + srcs[i] = bi_extract(b, src[i], channel_offset ? 
*channel_offset : 0); + else if (bitsize == 16) + srcs[i >> 1] = bi_make_vec16_helper(b, src + i, channel_offset, rem); + else + srcs[i >> 2] = bi_make_vec8_helper(b, src + i, channel_offset, rem); + } - bi_emit_collect_to(b, dst, srcs, DIV_ROUND_UP(count, chan_per_word)); + bi_emit_collect_to(b, dst, srcs, DIV_ROUND_UP(count, chan_per_word)); } static inline bi_instr * bi_load_ubo_to(bi_builder *b, unsigned bitsize, bi_index dest0, bi_index src0, - bi_index src1) + bi_index src1) { - bi_instr *I; + bi_instr *I; - if (b->shader->arch >= 9) { - I = bi_ld_buffer_to(b, bitsize, dest0, src0, src1); - I->seg = BI_SEG_UBO; - } else { - I = bi_load_to(b, bitsize, dest0, src0, src1, BI_SEG_UBO, 0); - } + if (b->shader->arch >= 9) { + I = bi_ld_buffer_to(b, bitsize, dest0, src0, src1); + I->seg = BI_SEG_UBO; + } else { + I = bi_load_to(b, bitsize, dest0, src0, src1, BI_SEG_UBO, 0); + } - bi_emit_cached_split(b, dest0, bitsize); - return I; + bi_emit_cached_split(b, dest0, bitsize); + return I; } static bi_instr * bi_load_sysval_to(bi_builder *b, bi_index dest, int sysval, - unsigned nr_components, unsigned offset) + unsigned nr_components, unsigned offset) { - unsigned sysval_ubo = b->shader->inputs->fixed_sysval_ubo >= 0 ? - b->shader->inputs->fixed_sysval_ubo : - b->shader->nir->info.num_ubos; - unsigned uniform = - pan_lookup_sysval(b->shader->sysval_to_id, - b->shader->info.sysvals, - sysval); - unsigned idx = (uniform * 16) + offset; + unsigned sysval_ubo = b->shader->inputs->fixed_sysval_ubo >= 0 + ? b->shader->inputs->fixed_sysval_ubo + : b->shader->nir->info.num_ubos; + unsigned uniform = pan_lookup_sysval(b->shader->sysval_to_id, + b->shader->info.sysvals, sysval); + unsigned idx = (uniform * 16) + offset; - return bi_load_ubo_to(b, nr_components * 32, dest, - bi_imm_u32(idx), bi_imm_u32(sysval_ubo)); + return bi_load_ubo_to(b, nr_components * 32, dest, bi_imm_u32(idx), + bi_imm_u32(sysval_ubo)); } static void bi_load_sysval_nir(bi_builder *b, nir_intrinsic_instr *intr, - unsigned nr_components, unsigned offset) + unsigned nr_components, unsigned offset) { - bi_load_sysval_to(b, bi_dest_index(&intr->dest), - panfrost_sysval_for_instr(&intr->instr, NULL), - nr_components, offset); + bi_load_sysval_to(b, bi_dest_index(&intr->dest), + panfrost_sysval_for_instr(&intr->instr, NULL), + nr_components, offset); } static bi_index -bi_load_sysval(bi_builder *b, int sysval, - unsigned nr_components, unsigned offset) +bi_load_sysval(bi_builder *b, int sysval, unsigned nr_components, + unsigned offset) { - bi_index tmp = bi_temp(b->shader); - bi_load_sysval_to(b, tmp, sysval, nr_components, offset); - return tmp; + bi_index tmp = bi_temp(b->shader); + bi_load_sysval_to(b, tmp, sysval, nr_components, offset); + return tmp; } static void bi_load_sample_id_to(bi_builder *b, bi_index dst) { - /* r61[16:23] contains the sampleID, mask it out. Upper bits - * seem to read garbage (despite being architecturally defined - * as zero), so use a 5-bit mask instead of 8-bits */ + /* r61[16:23] contains the sampleID, mask it out. 
Upper bits + * seem to read garbage (despite being architecturally defined + * as zero), so use a 5-bit mask instead of 8-bits */ - bi_rshift_and_i32_to(b, dst, bi_preload(b, 61), bi_imm_u32(0x1f), - bi_imm_u8(16), false); + bi_rshift_and_i32_to(b, dst, bi_preload(b, 61), bi_imm_u32(0x1f), + bi_imm_u8(16), false); } static bi_index bi_load_sample_id(bi_builder *b) { - bi_index sample_id = bi_temp(b->shader); - bi_load_sample_id_to(b, sample_id); - return sample_id; + bi_index sample_id = bi_temp(b->shader); + bi_load_sample_id_to(b, sample_id); + return sample_id; } static bi_index bi_pixel_indices(bi_builder *b, unsigned rt) { - /* We want to load the current pixel. */ - struct bifrost_pixel_indices pix = { - .y = BIFROST_CURRENT_PIXEL, - .rt = rt - }; + /* We want to load the current pixel. */ + struct bifrost_pixel_indices pix = {.y = BIFROST_CURRENT_PIXEL, .rt = rt}; - uint32_t indices_u32 = 0; - memcpy(&indices_u32, &pix, sizeof(indices_u32)); - bi_index indices = bi_imm_u32(indices_u32); + uint32_t indices_u32 = 0; + memcpy(&indices_u32, &pix, sizeof(indices_u32)); + bi_index indices = bi_imm_u32(indices_u32); - /* Sample index above is left as zero. For multisampling, we need to - * fill in the actual sample ID in the lower byte */ + /* Sample index above is left as zero. For multisampling, we need to + * fill in the actual sample ID in the lower byte */ - if (b->shader->inputs->blend.nr_samples > 1) - indices = bi_iadd_u32(b, indices, bi_load_sample_id(b), false); + if (b->shader->inputs->blend.nr_samples > 1) + indices = bi_iadd_u32(b, indices, bi_load_sample_id(b), false); - return indices; + return indices; } /* Source color is passed through r0-r3, or r4-r7 for the second source when @@ -764,68 +752,64 @@ bi_pixel_indices(bi_builder *b, unsigned rt) static void bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr) { - nir_io_semantics sem = nir_intrinsic_io_semantics(instr); - unsigned base = (sem.location == VARYING_SLOT_VAR0) ? 4 : 0; - unsigned size = nir_alu_type_get_type_size(nir_intrinsic_dest_type(instr)); - assert(size == 16 || size == 32); + nir_io_semantics sem = nir_intrinsic_io_semantics(instr); + unsigned base = (sem.location == VARYING_SLOT_VAR0) ? 4 : 0; + unsigned size = nir_alu_type_get_type_size(nir_intrinsic_dest_type(instr)); + assert(size == 16 || size == 32); - bi_index srcs[] = { - bi_preload(b, base + 0), bi_preload(b, base + 1), - bi_preload(b, base + 2), bi_preload(b, base + 3) - }; + bi_index srcs[] = {bi_preload(b, base + 0), bi_preload(b, base + 1), + bi_preload(b, base + 2), bi_preload(b, base + 3)}; - bi_emit_collect_to(b, bi_dest_index(&instr->dest), srcs, size == 32 ? 4 : 2); + bi_emit_collect_to(b, bi_dest_index(&instr->dest), srcs, size == 32 ? 4 : 2); } static void -bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, - bi_index rgba2, nir_alu_type T2, unsigned rt) +bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, bi_index rgba2, + nir_alu_type T2, unsigned rt) { - /* Reads 2 or 4 staging registers to cover the input */ - unsigned size = nir_alu_type_get_type_size(T); - unsigned size_2 = nir_alu_type_get_type_size(T2); - unsigned sr_count = (size <= 16) ? 2 : 4; - unsigned sr_count_2 = (size_2 <= 16) ? 
2 : 4; - const struct panfrost_compile_inputs *inputs = b->shader->inputs; - uint64_t blend_desc = inputs->blend.bifrost_blend_desc; - enum bi_register_format regfmt = bi_reg_fmt_for_nir(T); + /* Reads 2 or 4 staging registers to cover the input */ + unsigned size = nir_alu_type_get_type_size(T); + unsigned size_2 = nir_alu_type_get_type_size(T2); + unsigned sr_count = (size <= 16) ? 2 : 4; + unsigned sr_count_2 = (size_2 <= 16) ? 2 : 4; + const struct panfrost_compile_inputs *inputs = b->shader->inputs; + uint64_t blend_desc = inputs->blend.bifrost_blend_desc; + enum bi_register_format regfmt = bi_reg_fmt_for_nir(T); - /* Workaround for NIR-to-TGSI */ - if (b->shader->nir->info.fs.untyped_color_outputs) - regfmt = BI_REGISTER_FORMAT_AUTO; + /* Workaround for NIR-to-TGSI */ + if (b->shader->nir->info.fs.untyped_color_outputs) + regfmt = BI_REGISTER_FORMAT_AUTO; - if (inputs->is_blend && inputs->blend.nr_samples > 1) { - /* Conversion descriptor comes from the compile inputs, pixel - * indices derived at run time based on sample ID */ - bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_coverage(b), - bi_imm_u32(blend_desc >> 32), - regfmt, BI_VECSIZE_V4); - } else if (b->shader->inputs->is_blend) { - uint64_t blend_desc = b->shader->inputs->blend.bifrost_blend_desc; + if (inputs->is_blend && inputs->blend.nr_samples > 1) { + /* Conversion descriptor comes from the compile inputs, pixel + * indices derived at run time based on sample ID */ + bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_coverage(b), + bi_imm_u32(blend_desc >> 32), regfmt, BI_VECSIZE_V4); + } else if (b->shader->inputs->is_blend) { + uint64_t blend_desc = b->shader->inputs->blend.bifrost_blend_desc; - /* Blend descriptor comes from the compile inputs */ - /* Put the result in r0 */ + /* Blend descriptor comes from the compile inputs */ + /* Put the result in r0 */ - bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b), - bi_imm_u32(blend_desc), - bi_imm_u32(blend_desc >> 32), - bi_null(), regfmt, sr_count, 0); - } else { - /* Blend descriptor comes from the FAU RAM. By convention, the - * return address on Bifrost is stored in r48 and will be used - * by the blend shader to jump back to the fragment shader */ + bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b), + bi_imm_u32(blend_desc), bi_imm_u32(blend_desc >> 32), + bi_null(), regfmt, sr_count, 0); + } else { + /* Blend descriptor comes from the FAU RAM. 
By convention, the + * return address on Bifrost is stored in r48 and will be used + * by the blend shader to jump back to the fragment shader */ - bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b), - bi_fau(BIR_FAU_BLEND_0 + rt, false), - bi_fau(BIR_FAU_BLEND_0 + rt, true), - rgba2, regfmt, sr_count, sr_count_2); - } + bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b), + bi_fau(BIR_FAU_BLEND_0 + rt, false), + bi_fau(BIR_FAU_BLEND_0 + rt, true), rgba2, regfmt, sr_count, + sr_count_2); + } - assert(rt < 8); - b->shader->info.bifrost->blend[rt].type = T; + assert(rt < 8); + b->shader->info.bifrost->blend[rt].type = T; - if (T2) - b->shader->info.bifrost->blend_src1_type = T2; + if (T2) + b->shader->info.bifrost->blend_src1_type = T2; } /* Blend shaders do not need to run ATEST since they are dependent on a @@ -838,116 +822,115 @@ bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, static bool bi_skip_atest(bi_context *ctx, bool emit_zs) { - return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend; + return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend; } static void bi_emit_atest(bi_builder *b, bi_index alpha) { - b->shader->coverage = bi_atest(b, bi_coverage(b), alpha, - bi_fau(BIR_FAU_ATEST_PARAM, false)); - b->shader->emitted_atest = true; + b->shader->coverage = + bi_atest(b, bi_coverage(b), alpha, bi_fau(BIR_FAU_ATEST_PARAM, false)); + b->shader->emitted_atest = true; } static void bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr) { - bool combined = instr->intrinsic == - nir_intrinsic_store_combined_output_pan; + bool combined = instr->intrinsic == nir_intrinsic_store_combined_output_pan; - unsigned writeout = combined ? nir_intrinsic_component(instr) : - PAN_WRITEOUT_C; + unsigned writeout = + combined ? nir_intrinsic_component(instr) : PAN_WRITEOUT_C; - bool emit_blend = writeout & (PAN_WRITEOUT_C); - bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S); + bool emit_blend = writeout & (PAN_WRITEOUT_C); + bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S); - unsigned loc = nir_intrinsic_io_semantics(instr).location; - bi_index src0 = bi_src_index(&instr->src[0]); + unsigned loc = nir_intrinsic_io_semantics(instr).location; + bi_index src0 = bi_src_index(&instr->src[0]); - /* By ISA convention, the coverage mask is stored in R60. The store - * itself will be handled by a subsequent ATEST instruction */ - if (loc == FRAG_RESULT_SAMPLE_MASK) { - bi_index orig = bi_coverage(b); - bi_index msaa = bi_load_sysval(b, PAN_SYSVAL_MULTISAMPLED, 1, 0); - bi_index new = bi_lshift_and_i32(b, orig, bi_extract(b, src0, 0), bi_imm_u8(0)); + /* By ISA convention, the coverage mask is stored in R60. The store + * itself will be handled by a subsequent ATEST instruction */ + if (loc == FRAG_RESULT_SAMPLE_MASK) { + bi_index orig = bi_coverage(b); + bi_index msaa = bi_load_sysval(b, PAN_SYSVAL_MULTISAMPLED, 1, 0); + bi_index new = + bi_lshift_and_i32(b, orig, bi_extract(b, src0, 0), bi_imm_u8(0)); - b->shader->coverage = - bi_mux_i32(b, orig, new, msaa, BI_MUX_INT_ZERO); - return; - } + b->shader->coverage = bi_mux_i32(b, orig, new, msaa, BI_MUX_INT_ZERO); + return; + } - /* Emit ATEST if we have to, note ATEST requires a floating-point alpha - * value, but render target #0 might not be floating point. However the - * alpha value is only used for alpha-to-coverage, a stage which is - * skipped for pure integer framebuffers, so the issue is moot. 
*/ + /* Emit ATEST if we have to, note ATEST requires a floating-point alpha + * value, but render target #0 might not be floating point. However the + * alpha value is only used for alpha-to-coverage, a stage which is + * skipped for pure integer framebuffers, so the issue is moot. */ - if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) { - nir_alu_type T = nir_intrinsic_src_type(instr); + if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) { + nir_alu_type T = nir_intrinsic_src_type(instr); - bi_index rgba = bi_src_index(&instr->src[0]); - bi_index alpha = - (T == nir_type_float16) ? bi_half(bi_extract(b, rgba, 1), true) : - (T == nir_type_float32) ? bi_extract(b, rgba, 3) : - bi_dontcare(b); + bi_index rgba = bi_src_index(&instr->src[0]); + bi_index alpha = (T == nir_type_float16) + ? bi_half(bi_extract(b, rgba, 1), true) + : (T == nir_type_float32) ? bi_extract(b, rgba, 3) + : bi_dontcare(b); - /* Don't read out-of-bounds */ - if (nir_src_num_components(instr->src[0]) < 4) - alpha = bi_imm_f32(1.0); + /* Don't read out-of-bounds */ + if (nir_src_num_components(instr->src[0]) < 4) + alpha = bi_imm_f32(1.0); - bi_emit_atest(b, alpha); - } + bi_emit_atest(b, alpha); + } - if (emit_zs) { - bi_index z = bi_dontcare(b), s = bi_dontcare(b); + if (emit_zs) { + bi_index z = bi_dontcare(b), s = bi_dontcare(b); - if (writeout & PAN_WRITEOUT_Z) - z = bi_src_index(&instr->src[2]); + if (writeout & PAN_WRITEOUT_Z) + z = bi_src_index(&instr->src[2]); - if (writeout & PAN_WRITEOUT_S) - s = bi_src_index(&instr->src[3]); + if (writeout & PAN_WRITEOUT_S) + s = bi_src_index(&instr->src[3]); - b->shader->coverage = bi_zs_emit(b, z, s, bi_coverage(b), - writeout & PAN_WRITEOUT_S, - writeout & PAN_WRITEOUT_Z); - } + b->shader->coverage = + bi_zs_emit(b, z, s, bi_coverage(b), writeout & PAN_WRITEOUT_S, + writeout & PAN_WRITEOUT_Z); + } - if (emit_blend) { - unsigned rt = loc ? (loc - FRAG_RESULT_DATA0) : 0; - bool dual = (writeout & PAN_WRITEOUT_2); - bi_index color = bi_src_index(&instr->src[0]); - bi_index color2 = dual ? bi_src_index(&instr->src[4]) : bi_null(); - nir_alu_type T2 = dual ? nir_intrinsic_dest_type(instr) : 0; + if (emit_blend) { + unsigned rt = loc ? (loc - FRAG_RESULT_DATA0) : 0; + bool dual = (writeout & PAN_WRITEOUT_2); + bi_index color = bi_src_index(&instr->src[0]); + bi_index color2 = dual ? bi_src_index(&instr->src[4]) : bi_null(); + nir_alu_type T2 = dual ? 
nir_intrinsic_dest_type(instr) : 0; - /* Explicit copy since BLEND inputs are precoloured to R0-R3, - * TODO: maybe schedule around this or implement in RA as a - * spill */ - bool has_mrt = (b->shader->nir->info.outputs_written >> FRAG_RESULT_DATA1); + /* Explicit copy since BLEND inputs are precoloured to R0-R3, + * TODO: maybe schedule around this or implement in RA as a + * spill */ + bool has_mrt = + (b->shader->nir->info.outputs_written >> FRAG_RESULT_DATA1); - if (has_mrt) { - bi_index srcs[4] = { color, color, color, color }; - unsigned channels[4] = { 0, 1, 2, 3 }; - color = bi_temp(b->shader); - bi_make_vec_to(b, color, srcs, channels, - nir_src_num_components(instr->src[0]), - nir_alu_type_get_type_size(nir_intrinsic_src_type(instr))); - } + if (has_mrt) { + bi_index srcs[4] = {color, color, color, color}; + unsigned channels[4] = {0, 1, 2, 3}; + color = bi_temp(b->shader); + bi_make_vec_to( + b, color, srcs, channels, nir_src_num_components(instr->src[0]), + nir_alu_type_get_type_size(nir_intrinsic_src_type(instr))); + } - bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr), - color2, T2, rt); - } + bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr), color2, T2, rt); + } - if (b->shader->inputs->is_blend) { - /* Jump back to the fragment shader, return address is stored - * in r48 (see above). On Valhall, only jump if the address is - * nonzero. The check is free there and it implements the "jump - * to 0 terminates the blend shader" that's automatic on - * Bifrost. - */ - if (b->shader->arch >= 8) - bi_branchzi(b, bi_preload(b, 48), bi_preload(b, 48), BI_CMPF_NE); - else - bi_jump(b, bi_preload(b, 48)); - } + if (b->shader->inputs->is_blend) { + /* Jump back to the fragment shader, return address is stored + * in r48 (see above). On Valhall, only jump if the address is + * nonzero. The check is free there and it implements the "jump + * to 0 terminates the blend shader" that's automatic on + * Bifrost. 
+ */ + if (b->shader->arch >= 8) + bi_branchzi(b, bi_preload(b, 48), bi_preload(b, 48), BI_CMPF_NE); + else + bi_jump(b, bi_preload(b, 48)); + } } /** @@ -958,315 +941,311 @@ bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr) static bool bi_should_remove_store(nir_intrinsic_instr *intr, enum bi_idvs_mode idvs) { - nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); - switch (sem.location) { - case VARYING_SLOT_POS: - case VARYING_SLOT_PSIZ: - return idvs == BI_IDVS_VARYING; - default: - return idvs == BI_IDVS_POSITION; - } + switch (sem.location) { + case VARYING_SLOT_POS: + case VARYING_SLOT_PSIZ: + return idvs == BI_IDVS_VARYING; + default: + return idvs == BI_IDVS_POSITION; + } } static bool bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data) { - enum bi_idvs_mode *idvs = data; + enum bi_idvs_mode *idvs = data; - if (instr->type != nir_instr_type_intrinsic) - return false; + if (instr->type != nir_instr_type_intrinsic) + return false; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_store_output) - return false; + if (intr->intrinsic != nir_intrinsic_store_output) + return false; - if (bi_should_remove_store(intr, *idvs)) { - nir_instr_remove(instr); - return true; - } + if (bi_should_remove_store(intr, *idvs)) { + nir_instr_remove(instr); + return true; + } - return false; + return false; } static void bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr) { - /* In principle we can do better for 16-bit. At the moment we require - * 32-bit to permit the use of .auto, in order to force .u32 for flat - * varyings, to handle internal TGSI shaders that set flat in the VS - * but smooth in the FS */ + /* In principle we can do better for 16-bit. At the moment we require + * 32-bit to permit the use of .auto, in order to force .u32 for flat + * varyings, to handle internal TGSI shaders that set flat in the VS + * but smooth in the FS */ - ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr); - ASSERTED unsigned T_size = nir_alu_type_get_type_size(T); - assert(T_size == 32 || (b->shader->arch >= 9 && T_size == 16)); - enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO; + ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr); + ASSERTED unsigned T_size = nir_alu_type_get_type_size(T); + assert(T_size == 32 || (b->shader->arch >= 9 && T_size == 16)); + enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO; - unsigned imm_index = 0; - bool immediate = bi_is_intr_immediate(instr, &imm_index, 16); + unsigned imm_index = 0; + bool immediate = bi_is_intr_immediate(instr, &imm_index, 16); - /* Only look at the total components needed. In effect, we fill in all - * the intermediate "holes" in the write mask, since we can't mask off - * stores. Since nir_lower_io_to_temporaries ensures each varying is - * written at most once, anything that's masked out is undefined, so it - * doesn't matter what we write there. So we may as well do the - * simplest thing possible. */ - unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr)); - assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0)); + /* Only look at the total components needed. In effect, we fill in all + * the intermediate "holes" in the write mask, since we can't mask off + * stores. 
Since nir_lower_io_to_temporaries ensures each varying is + * written at most once, anything that's masked out is undefined, so it + * doesn't matter what we write there. So we may as well do the + * simplest thing possible. */ + unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr)); + assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0)); - bi_index data = bi_src_index(&instr->src[0]); + bi_index data = bi_src_index(&instr->src[0]); - /* To keep the vector dimensions consistent, we need to drop some - * components. This should be coalesced. - * - * TODO: This is ugly and maybe inefficient. Would we rather - * introduce a TRIM.i32 pseudoinstruction? - */ - if (nr < nir_intrinsic_src_components(instr, 0)) { - assert(T_size == 32 && "todo: 16-bit trim"); + /* To keep the vector dimensions consistent, we need to drop some + * components. This should be coalesced. + * + * TODO: This is ugly and maybe inefficient. Would we rather + * introduce a TRIM.i32 pseudoinstruction? + */ + if (nr < nir_intrinsic_src_components(instr, 0)) { + assert(T_size == 32 && "todo: 16-bit trim"); - bi_index chans[4] = { bi_null(), bi_null(), bi_null(), bi_null() }; - unsigned src_comps = nir_intrinsic_src_components(instr, 0); + bi_index chans[4] = {bi_null(), bi_null(), bi_null(), bi_null()}; + unsigned src_comps = nir_intrinsic_src_components(instr, 0); - bi_emit_split_i32(b, chans, data, src_comps); + bi_emit_split_i32(b, chans, data, src_comps); - bi_index tmp = bi_temp(b->shader); - bi_instr *collect = bi_collect_i32_to(b, tmp, nr); + bi_index tmp = bi_temp(b->shader); + bi_instr *collect = bi_collect_i32_to(b, tmp, nr); - bi_foreach_src(collect, w) - collect->src[w] = chans[w]; + bi_foreach_src(collect, w) + collect->src[w] = chans[w]; - data = tmp; - } + data = tmp; + } - bool psiz = (nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PSIZ); + bool psiz = + (nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PSIZ); - bi_index a[4] = { bi_null() }; + bi_index a[4] = {bi_null()}; - if (b->shader->arch <= 8 && b->shader->idvs == BI_IDVS_POSITION) { - /* Bifrost position shaders have a fast path */ - assert(T == nir_type_float16 || T == nir_type_float32); - unsigned regfmt = (T == nir_type_float16) ? 0 : 1; - unsigned identity = (b->shader->arch == 6) ? 0x688 : 0; - unsigned snap4 = 0x5E; - uint32_t format = identity | (snap4 << 12) | (regfmt << 24); + if (b->shader->arch <= 8 && b->shader->idvs == BI_IDVS_POSITION) { + /* Bifrost position shaders have a fast path */ + assert(T == nir_type_float16 || T == nir_type_float32); + unsigned regfmt = (T == nir_type_float16) ? 0 : 1; + unsigned identity = (b->shader->arch == 6) ? 
0x688 : 0; + unsigned snap4 = 0x5E; + uint32_t format = identity | (snap4 << 12) | (regfmt << 24); - bi_st_cvt(b, data, bi_preload(b, 58), bi_preload(b, 59), - bi_imm_u32(format), regfmt, nr - 1); - } else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) { - bi_index index = bi_preload(b, 59); + bi_st_cvt(b, data, bi_preload(b, 58), bi_preload(b, 59), + bi_imm_u32(format), regfmt, nr - 1); + } else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) { + bi_index index = bi_preload(b, 59); - if (psiz) { - assert(T_size == 16 && "should've been lowered"); - index = bi_iadd_imm_i32(b, index, 4); - } + if (psiz) { + assert(T_size == 16 && "should've been lowered"); + index = bi_iadd_imm_i32(b, index, 4); + } - bi_index address = bi_lea_buf_imm(b, index); - bi_emit_split_i32(b, a, address, 2); + bi_index address = bi_lea_buf_imm(b, index); + bi_emit_split_i32(b, a, address, 2); - bool varying = (b->shader->idvs == BI_IDVS_VARYING); + bool varying = (b->shader->idvs == BI_IDVS_VARYING); - bi_store(b, nr * nir_src_bit_size(instr->src[0]), - data, a[0], a[1], - varying ? BI_SEG_VARY : BI_SEG_POS, - varying ? bi_varying_offset(b->shader, instr) : 0); - } else if (immediate) { - bi_index address = bi_lea_attr_imm(b, - bi_vertex_id(b), bi_instance_id(b), - regfmt, imm_index); - bi_emit_split_i32(b, a, address, 3); + bi_store(b, nr * nir_src_bit_size(instr->src[0]), data, a[0], a[1], + varying ? BI_SEG_VARY : BI_SEG_POS, + varying ? bi_varying_offset(b->shader, instr) : 0); + } else if (immediate) { + bi_index address = bi_lea_attr_imm(b, bi_vertex_id(b), bi_instance_id(b), + regfmt, imm_index); + bi_emit_split_i32(b, a, address, 3); - bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1); - } else { - bi_index idx = - bi_iadd_u32(b, - bi_src_index(nir_get_io_offset_src(instr)), - bi_imm_u32(nir_intrinsic_base(instr)), - false); - bi_index address = bi_lea_attr(b, - bi_vertex_id(b), bi_instance_id(b), - idx, regfmt); - bi_emit_split_i32(b, a, address, 3); + bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1); + } else { + bi_index idx = bi_iadd_u32(b, bi_src_index(nir_get_io_offset_src(instr)), + bi_imm_u32(nir_intrinsic_base(instr)), false); + bi_index address = + bi_lea_attr(b, bi_vertex_id(b), bi_instance_id(b), idx, regfmt); + bi_emit_split_i32(b, a, address, 3); - bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1); - } + bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1); + } } static void bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr) { - nir_src *offset = nir_get_io_offset_src(instr); + nir_src *offset = nir_get_io_offset_src(instr); - bool offset_is_const = nir_src_is_const(*offset); - bi_index dyn_offset = bi_src_index(offset); - uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0; + bool offset_is_const = nir_src_is_const(*offset); + bi_index dyn_offset = bi_src_index(offset); + uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0; - bi_load_ubo_to(b, instr->num_components * nir_dest_bit_size(instr->dest), - bi_dest_index(&instr->dest), offset_is_const ? - bi_imm_u32(const_offset) : dyn_offset, - bi_src_index(&instr->src[0])); + bi_load_ubo_to(b, instr->num_components * nir_dest_bit_size(instr->dest), + bi_dest_index(&instr->dest), + offset_is_const ? 
bi_imm_u32(const_offset) : dyn_offset, + bi_src_index(&instr->src[0])); } static void bi_emit_load_push_constant(bi_builder *b, nir_intrinsic_instr *instr) { - assert(b->shader->inputs->no_ubo_to_push && "can't mix push constant forms"); + assert(b->shader->inputs->no_ubo_to_push && "can't mix push constant forms"); - nir_src *offset = &instr->src[0]; - assert(nir_src_is_const(*offset) && "no indirect push constants"); - uint32_t base = nir_intrinsic_base(instr) + nir_src_as_uint(*offset); - assert((base & 3) == 0 && "unaligned push constants"); + nir_src *offset = &instr->src[0]; + assert(nir_src_is_const(*offset) && "no indirect push constants"); + uint32_t base = nir_intrinsic_base(instr) + nir_src_as_uint(*offset); + assert((base & 3) == 0 && "unaligned push constants"); - unsigned bits = nir_dest_bit_size(instr->dest) * - nir_dest_num_components(instr->dest); + unsigned bits = + nir_dest_bit_size(instr->dest) * nir_dest_num_components(instr->dest); - unsigned n = DIV_ROUND_UP(bits, 32); - assert(n <= 4); - bi_index channels[4] = { bi_null() }; + unsigned n = DIV_ROUND_UP(bits, 32); + assert(n <= 4); + bi_index channels[4] = {bi_null()}; - for (unsigned i = 0; i < n; ++i) { - unsigned word = (base >> 2) + i; + for (unsigned i = 0; i < n; ++i) { + unsigned word = (base >> 2) + i; - channels[i] = bi_fau(BIR_FAU_UNIFORM | (word >> 1), word & 1); - } + channels[i] = bi_fau(BIR_FAU_UNIFORM | (word >> 1), word & 1); + } - bi_emit_collect_to(b, bi_dest_index(&instr->dest), channels, n); + bi_emit_collect_to(b, bi_dest_index(&instr->dest), channels, n); } static bi_index bi_addr_high(bi_builder *b, nir_src *src) { - return (nir_src_bit_size(*src) == 64) ? - bi_extract(b, bi_src_index(src), 1) : bi_zero(); + return (nir_src_bit_size(*src) == 64) ? bi_extract(b, bi_src_index(src), 1) + : bi_zero(); } static void -bi_handle_segment(bi_builder *b, bi_index *addr_lo, bi_index *addr_hi, enum bi_seg seg, int16_t *offset) +bi_handle_segment(bi_builder *b, bi_index *addr_lo, bi_index *addr_hi, + enum bi_seg seg, int16_t *offset) { - /* Not needed on Bifrost or for global accesses */ - if (b->shader->arch < 9 || seg == BI_SEG_NONE) - return; + /* Not needed on Bifrost or for global accesses */ + if (b->shader->arch < 9 || seg == BI_SEG_NONE) + return; - /* There is no segment modifier on Valhall. Instead, we need to - * emit the arithmetic ourselves. We do have an offset - * available, which saves an instruction for constant offsets. - */ - bool wls = (seg == BI_SEG_WLS); - assert(wls || (seg == BI_SEG_TL)); + /* There is no segment modifier on Valhall. Instead, we need to + * emit the arithmetic ourselves. We do have an offset + * available, which saves an instruction for constant offsets. + */ + bool wls = (seg == BI_SEG_WLS); + assert(wls || (seg == BI_SEG_TL)); - enum bir_fau fau = wls ? BIR_FAU_WLS_PTR : BIR_FAU_TLS_PTR; + enum bir_fau fau = wls ? 
BIR_FAU_WLS_PTR : BIR_FAU_TLS_PTR; - bi_index base_lo = bi_fau(fau, false); + bi_index base_lo = bi_fau(fau, false); - if (offset && addr_lo->type == BI_INDEX_CONSTANT && addr_lo->value == (int16_t) addr_lo->value) { - *offset = addr_lo->value; - *addr_lo = base_lo; - } else { - *addr_lo = bi_iadd_u32(b, base_lo, *addr_lo, false); - } + if (offset && addr_lo->type == BI_INDEX_CONSTANT && + addr_lo->value == (int16_t)addr_lo->value) { + *offset = addr_lo->value; + *addr_lo = base_lo; + } else { + *addr_lo = bi_iadd_u32(b, base_lo, *addr_lo, false); + } - /* Do not allow overflow for WLS or TLS */ - *addr_hi = bi_fau(fau, true); + /* Do not allow overflow for WLS or TLS */ + *addr_hi = bi_fau(fau, true); } static void bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg) { - int16_t offset = 0; - unsigned bits = instr->num_components * nir_dest_bit_size(instr->dest); - bi_index dest = bi_dest_index(&instr->dest); - bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[0]), 0); - bi_index addr_hi = bi_addr_high(b, &instr->src[0]); + int16_t offset = 0; + unsigned bits = instr->num_components * nir_dest_bit_size(instr->dest); + bi_index dest = bi_dest_index(&instr->dest); + bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[0]), 0); + bi_index addr_hi = bi_addr_high(b, &instr->src[0]); - bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset); + bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset); - bi_load_to(b, bits, dest, addr_lo, addr_hi, seg, offset); - bi_emit_cached_split(b, dest, bits); + bi_load_to(b, bits, dest, addr_lo, addr_hi, seg, offset); + bi_emit_cached_split(b, dest, bits); } static void bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg) { - /* Require contiguous masks, gauranteed by nir_lower_wrmasks */ - assert(nir_intrinsic_write_mask(instr) == - BITFIELD_MASK(instr->num_components)); + /* Require contiguous masks, gauranteed by nir_lower_wrmasks */ + assert(nir_intrinsic_write_mask(instr) == + BITFIELD_MASK(instr->num_components)); - int16_t offset = 0; - bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[1]), 0); - bi_index addr_hi = bi_addr_high(b, &instr->src[1]); + int16_t offset = 0; + bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[1]), 0); + bi_index addr_hi = bi_addr_high(b, &instr->src[1]); - bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset); + bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset); - bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]), - bi_src_index(&instr->src[0]), - addr_lo, addr_hi, seg, offset); + bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]), + bi_src_index(&instr->src[0]), addr_lo, addr_hi, seg, offset); } /* Exchanges the staging register with memory */ static void -bi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg, enum bi_seg seg) +bi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg, + enum bi_seg seg) { - assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS); + assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS); - unsigned sz = nir_src_bit_size(*arg); - assert(sz == 32 || sz == 64); + unsigned sz = nir_src_bit_size(*arg); + assert(sz == 32 || sz == 64); - bi_index data = bi_src_index(arg); + bi_index data = bi_src_index(arg); - bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1); + bi_index addr_hi = (seg == BI_SEG_WLS) ? 
bi_zero() : bi_extract(b, addr, 1); - if (b->shader->arch >= 9) - bi_handle_segment(b, &addr, &addr_hi, seg, NULL); - else if (seg == BI_SEG_WLS) - addr_hi = bi_zero(); + if (b->shader->arch >= 9) + bi_handle_segment(b, &addr, &addr_hi, seg, NULL); + else if (seg == BI_SEG_WLS) + addr_hi = bi_zero(); - bi_axchg_to(b, sz, dst, data, bi_extract(b, addr, 0), addr_hi, seg); + bi_axchg_to(b, sz, dst, data, bi_extract(b, addr, 0), addr_hi, seg); } /* Exchanges the second staging register with memory if comparison with first * staging register passes */ static void -bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1, nir_src *arg_2, enum bi_seg seg) +bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1, + nir_src *arg_2, enum bi_seg seg) { - assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS); + assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS); - /* hardware is swapped from NIR */ - bi_index src0 = bi_src_index(arg_2); - bi_index src1 = bi_src_index(arg_1); + /* hardware is swapped from NIR */ + bi_index src0 = bi_src_index(arg_2); + bi_index src1 = bi_src_index(arg_1); - unsigned sz = nir_src_bit_size(*arg_1); - assert(sz == 32 || sz == 64); + unsigned sz = nir_src_bit_size(*arg_1); + assert(sz == 32 || sz == 64); - bi_index data_words[] = { - bi_extract(b, src0, 0), - sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src0, 1), + bi_index data_words[] = { + bi_extract(b, src0, 0), + sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src0, 1), - /* 64-bit */ - bi_extract(b, src1, 0), - sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src1, 1), - }; + /* 64-bit */ + bi_extract(b, src1, 0), + sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src1, 1), + }; - bi_index in = bi_temp(b->shader); - bi_emit_collect_to(b, in, data_words, 2 * (sz / 32)); - bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1); + bi_index in = bi_temp(b->shader); + bi_emit_collect_to(b, in, data_words, 2 * (sz / 32)); + bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1); - if (b->shader->arch >= 9) - bi_handle_segment(b, &addr, &addr_hi, seg, NULL); - else if (seg == BI_SEG_WLS) - addr_hi = bi_zero(); + if (b->shader->arch >= 9) + bi_handle_segment(b, &addr, &addr_hi, seg, NULL); + else if (seg == BI_SEG_WLS) + addr_hi = bi_zero(); - bi_index out = bi_acmpxchg(b, sz, in, bi_extract(b, addr, 0), addr_hi, seg); - bi_emit_cached_split(b, out, sz); + bi_index out = bi_acmpxchg(b, sz, in, bi_extract(b, addr, 0), addr_hi, seg); + bi_emit_cached_split(b, out, sz); - bi_index inout_words[] = { - bi_extract(b, out, 0), - sz == 64 ? bi_extract(b, out, 1) : bi_null() - }; + bi_index inout_words[] = {bi_extract(b, out, 0), + sz == 64 ? 
bi_extract(b, out, 1) : bi_null()}; - bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32); + bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32); } /* Extracts an atomic opcode */ @@ -1274,50 +1253,50 @@ bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1, static enum bi_atom_opc bi_atom_opc_for_nir(nir_intrinsic_op op) { - switch (op) { - case nir_intrinsic_global_atomic_add: - case nir_intrinsic_shared_atomic_add: - case nir_intrinsic_image_atomic_add: - return BI_ATOM_OPC_AADD; + switch (op) { + case nir_intrinsic_global_atomic_add: + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_image_atomic_add: + return BI_ATOM_OPC_AADD; - case nir_intrinsic_global_atomic_imin: - case nir_intrinsic_shared_atomic_imin: - case nir_intrinsic_image_atomic_imin: - return BI_ATOM_OPC_ASMIN; + case nir_intrinsic_global_atomic_imin: + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_image_atomic_imin: + return BI_ATOM_OPC_ASMIN; - case nir_intrinsic_global_atomic_umin: - case nir_intrinsic_shared_atomic_umin: - case nir_intrinsic_image_atomic_umin: - return BI_ATOM_OPC_AUMIN; + case nir_intrinsic_global_atomic_umin: + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_image_atomic_umin: + return BI_ATOM_OPC_AUMIN; - case nir_intrinsic_global_atomic_imax: - case nir_intrinsic_shared_atomic_imax: - case nir_intrinsic_image_atomic_imax: - return BI_ATOM_OPC_ASMAX; + case nir_intrinsic_global_atomic_imax: + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_image_atomic_imax: + return BI_ATOM_OPC_ASMAX; - case nir_intrinsic_global_atomic_umax: - case nir_intrinsic_shared_atomic_umax: - case nir_intrinsic_image_atomic_umax: - return BI_ATOM_OPC_AUMAX; + case nir_intrinsic_global_atomic_umax: + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_image_atomic_umax: + return BI_ATOM_OPC_AUMAX; - case nir_intrinsic_global_atomic_and: - case nir_intrinsic_shared_atomic_and: - case nir_intrinsic_image_atomic_and: - return BI_ATOM_OPC_AAND; + case nir_intrinsic_global_atomic_and: + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_image_atomic_and: + return BI_ATOM_OPC_AAND; - case nir_intrinsic_global_atomic_or: - case nir_intrinsic_shared_atomic_or: - case nir_intrinsic_image_atomic_or: - return BI_ATOM_OPC_AOR; + case nir_intrinsic_global_atomic_or: + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_image_atomic_or: + return BI_ATOM_OPC_AOR; - case nir_intrinsic_global_atomic_xor: - case nir_intrinsic_shared_atomic_xor: - case nir_intrinsic_image_atomic_xor: - return BI_ATOM_OPC_AXOR; + case nir_intrinsic_global_atomic_xor: + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_image_atomic_xor: + return BI_ATOM_OPC_AXOR; - default: - unreachable("Unexpected computational atomic"); - } + default: + unreachable("Unexpected computational atomic"); + } } /* Optimized unary atomics are available with an implied #1 argument */ @@ -1325,30 +1304,30 @@ bi_atom_opc_for_nir(nir_intrinsic_op op) static bool bi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out) { - /* Check we have a compatible constant */ - if (arg.type != BI_INDEX_CONSTANT) - return false; + /* Check we have a compatible constant */ + if (arg.type != BI_INDEX_CONSTANT) + return false; - if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD))) - return false; + if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD))) + return false; - /* Check for a compatible operation */ - switch (op) { - case 
BI_ATOM_OPC_AADD: - *out = (arg.value == 1) ? BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC; - return true; - case BI_ATOM_OPC_ASMAX: - *out = BI_ATOM_OPC_ASMAX1; - return true; - case BI_ATOM_OPC_AUMAX: - *out = BI_ATOM_OPC_AUMAX1; - return true; - case BI_ATOM_OPC_AOR: - *out = BI_ATOM_OPC_AOR1; - return true; - default: - return false; - } + /* Check for a compatible operation */ + switch (op) { + case BI_ATOM_OPC_AADD: + *out = (arg.value == 1) ? BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC; + return true; + case BI_ATOM_OPC_ASMAX: + *out = BI_ATOM_OPC_ASMAX1; + return true; + case BI_ATOM_OPC_AUMAX: + *out = BI_ATOM_OPC_AUMAX1; + return true; + case BI_ATOM_OPC_AOR: + *out = BI_ATOM_OPC_AOR1; + return true; + default: + return false; + } } /* @@ -1364,172 +1343,173 @@ static bi_index bi_emit_image_coord(bi_builder *b, bi_index coord, unsigned src_idx, unsigned coord_comps, bool is_array) { - assert(coord_comps > 0 && coord_comps <= 3); + assert(coord_comps > 0 && coord_comps <= 3); - if (src_idx == 0) { - if (coord_comps == 1 || (coord_comps == 2 && is_array)) - return bi_extract(b, coord, 0); - else - return bi_mkvec_v2i16(b, - bi_half(bi_extract(b, coord, 0), false), - bi_half(bi_extract(b, coord, 1), false)); - } else { - if (coord_comps == 3 && b->shader->arch >= 9) - return bi_mkvec_v2i16(b, bi_imm_u16(0), - bi_half(bi_extract(b, coord, 2), false)); - else if (coord_comps == 2 && is_array && b->shader->arch >= 9) - return bi_mkvec_v2i16(b, bi_imm_u16(0), - bi_half(bi_extract(b, coord, 1), false)); - else if (coord_comps == 3) - return bi_extract(b, coord, 2); - else if (coord_comps == 2 && is_array) - return bi_extract(b, coord, 1); - else - return bi_zero(); - } + if (src_idx == 0) { + if (coord_comps == 1 || (coord_comps == 2 && is_array)) + return bi_extract(b, coord, 0); + else + return bi_mkvec_v2i16(b, bi_half(bi_extract(b, coord, 0), false), + bi_half(bi_extract(b, coord, 1), false)); + } else { + if (coord_comps == 3 && b->shader->arch >= 9) + return bi_mkvec_v2i16(b, bi_imm_u16(0), + bi_half(bi_extract(b, coord, 2), false)); + else if (coord_comps == 2 && is_array && b->shader->arch >= 9) + return bi_mkvec_v2i16(b, bi_imm_u16(0), + bi_half(bi_extract(b, coord, 1), false)); + else if (coord_comps == 3) + return bi_extract(b, coord, 2); + else if (coord_comps == 2 && is_array) + return bi_extract(b, coord, 1); + else + return bi_zero(); + } } static bi_index bi_emit_image_index(bi_builder *b, nir_intrinsic_instr *instr) { - nir_src src = instr->src[0]; - bi_index index = bi_src_index(&src); - bi_context *ctx = b->shader; + nir_src src = instr->src[0]; + bi_index index = bi_src_index(&src); + bi_context *ctx = b->shader; - /* Images come after vertex attributes, so handle an explicit offset */ - unsigned offset = (ctx->stage == MESA_SHADER_VERTEX) ? - util_bitcount64(ctx->nir->info.inputs_read) : 0; + /* Images come after vertex attributes, so handle an explicit offset */ + unsigned offset = (ctx->stage == MESA_SHADER_VERTEX) + ? 
util_bitcount64(ctx->nir->info.inputs_read) + : 0; - if (offset == 0) - return index; - else if (nir_src_is_const(src)) - return bi_imm_u32(nir_src_as_uint(src) + offset); - else - return bi_iadd_u32(b, index, bi_imm_u32(offset), false); + if (offset == 0) + return index; + else if (nir_src_is_const(src)) + return bi_imm_u32(nir_src_as_uint(src) + offset); + else + return bi_iadd_u32(b, index, bi_imm_u32(offset), false); } static void bi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr) { - enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); - unsigned coord_comps = nir_image_intrinsic_coord_components(instr); - bool array = nir_intrinsic_image_array(instr); - ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim); + enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); + unsigned coord_comps = nir_image_intrinsic_coord_components(instr); + bool array = nir_intrinsic_image_array(instr); + ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim); - bi_index coords = bi_src_index(&instr->src[1]); - bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array); - bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array); - bi_index dest = bi_dest_index(&instr->dest); - enum bi_register_format regfmt = bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr)); - enum bi_vecsize vecsize = instr->num_components - 1; + bi_index coords = bi_src_index(&instr->src[1]); + bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array); + bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array); + bi_index dest = bi_dest_index(&instr->dest); + enum bi_register_format regfmt = + bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr)); + enum bi_vecsize vecsize = instr->num_components - 1; - /* TODO: MSAA */ - assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported"); + /* TODO: MSAA */ + assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported"); - if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) { - bi_instr *I = bi_ld_tex_imm_to(b, dest, xy, zw, regfmt, vecsize, - nir_src_as_uint(instr->src[0])); + if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) { + bi_instr *I = bi_ld_tex_imm_to(b, dest, xy, zw, regfmt, vecsize, + nir_src_as_uint(instr->src[0])); - I->table = PAN_TABLE_IMAGE; - } else if (b->shader->arch >= 9) { - unreachable("Indirect images on Valhall not yet supported"); - } else { - bi_ld_attr_tex_to(b, dest, xy, zw, - bi_emit_image_index(b, instr), regfmt, - vecsize); - } + I->table = PAN_TABLE_IMAGE; + } else if (b->shader->arch >= 9) { + unreachable("Indirect images on Valhall not yet supported"); + } else { + bi_ld_attr_tex_to(b, dest, xy, zw, bi_emit_image_index(b, instr), regfmt, + vecsize); + } - bi_split_dest(b, instr->dest); + bi_split_dest(b, instr->dest); } static bi_index bi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr) { - enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); - bool array = nir_intrinsic_image_array(instr); - ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim); - unsigned coord_comps = nir_image_intrinsic_coord_components(instr); + enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); + bool array = nir_intrinsic_image_array(instr); + ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim); + unsigned coord_comps = nir_image_intrinsic_coord_components(instr); - /* TODO: MSAA */ - assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported"); + /* TODO: MSAA */ + 
assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported"); - enum bi_register_format type = (instr->intrinsic == nir_intrinsic_image_store) ? - bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)) : - BI_REGISTER_FORMAT_AUTO; + enum bi_register_format type = + (instr->intrinsic == nir_intrinsic_image_store) + ? bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)) + : BI_REGISTER_FORMAT_AUTO; - bi_index coords = bi_src_index(&instr->src[1]); - bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array); - bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array); - bi_index dest = bi_temp(b->shader); + bi_index coords = bi_src_index(&instr->src[1]); + bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array); + bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array); + bi_index dest = bi_temp(b->shader); - if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) { - bi_instr *I = bi_lea_tex_imm_to(b, dest, xy, zw, false, - nir_src_as_uint(instr->src[0])); + if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) { + bi_instr *I = bi_lea_tex_imm_to(b, dest, xy, zw, false, + nir_src_as_uint(instr->src[0])); - I->table = PAN_TABLE_IMAGE; - } else if (b->shader->arch >= 9) { - unreachable("Indirect images on Valhall not yet supported"); - } else { - bi_instr *I = bi_lea_attr_tex_to(b, dest, xy, zw, - bi_emit_image_index(b, instr), type); + I->table = PAN_TABLE_IMAGE; + } else if (b->shader->arch >= 9) { + unreachable("Indirect images on Valhall not yet supported"); + } else { + bi_instr *I = bi_lea_attr_tex_to(b, dest, xy, zw, + bi_emit_image_index(b, instr), type); - /* LEA_ATTR_TEX defaults to the secondary attribute table, but - * our ABI has all images in the primary attribute table - */ - I->table = BI_TABLE_ATTRIBUTE_1; - } + /* LEA_ATTR_TEX defaults to the secondary attribute table, but + * our ABI has all images in the primary attribute table + */ + I->table = BI_TABLE_ATTRIBUTE_1; + } - bi_emit_cached_split(b, dest, 3 * 32); - return dest; + bi_emit_cached_split(b, dest, 3 * 32); + return dest; } static void bi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr) { - bi_index a[4] = { bi_null() }; - bi_emit_split_i32(b, a, bi_emit_lea_image(b, instr), 3); + bi_index a[4] = {bi_null()}; + bi_emit_split_i32(b, a, bi_emit_lea_image(b, instr), 3); - /* Due to SPIR-V limitations, the source type is not fully reliable: it - * reports uint32 even for write_imagei. This causes an incorrect - * u32->s32->u32 roundtrip which incurs an unwanted clamping. Use auto32 - * instead, which will match per the OpenCL spec. Of course this does - * not work for 16-bit stores, but those are not available in OpenCL. - */ - nir_alu_type T = nir_intrinsic_src_type(instr); - assert(nir_alu_type_get_type_size(T) == 32); + /* Due to SPIR-V limitations, the source type is not fully reliable: it + * reports uint32 even for write_imagei. This causes an incorrect + * u32->s32->u32 roundtrip which incurs an unwanted clamping. Use auto32 + * instead, which will match per the OpenCL spec. Of course this does + * not work for 16-bit stores, but those are not available in OpenCL. 
+ */ + nir_alu_type T = nir_intrinsic_src_type(instr); + assert(nir_alu_type_get_type_size(T) == 32); - bi_st_cvt(b, bi_src_index(&instr->src[3]), a[0], a[1], a[2], - BI_REGISTER_FORMAT_AUTO, - instr->num_components - 1); + bi_st_cvt(b, bi_src_index(&instr->src[3]), a[0], a[1], a[2], + BI_REGISTER_FORMAT_AUTO, instr->num_components - 1); } static void -bi_emit_atomic_i32_to(bi_builder *b, bi_index dst, - bi_index addr, bi_index arg, nir_intrinsic_op intrinsic) +bi_emit_atomic_i32_to(bi_builder *b, bi_index dst, bi_index addr, bi_index arg, + nir_intrinsic_op intrinsic) { - enum bi_atom_opc opc = bi_atom_opc_for_nir(intrinsic); - enum bi_atom_opc post_opc = opc; - bool bifrost = b->shader->arch <= 8; + enum bi_atom_opc opc = bi_atom_opc_for_nir(intrinsic); + enum bi_atom_opc post_opc = opc; + bool bifrost = b->shader->arch <= 8; - /* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't - * take any vector but can still output in RETURN mode */ - bi_index tmp_dest = bifrost ? bi_temp(b->shader) : dst; - unsigned sr_count = bifrost ? 2 : 1; + /* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't + * take any vector but can still output in RETURN mode */ + bi_index tmp_dest = bifrost ? bi_temp(b->shader) : dst; + unsigned sr_count = bifrost ? 2 : 1; - /* Generate either ATOM or ATOM1 as required */ - if (bi_promote_atom_c1(opc, arg, &opc)) { - bi_atom1_return_i32_to(b, tmp_dest, bi_extract(b, addr, 0), - bi_extract(b, addr, 1), opc, sr_count); - } else { - bi_atom_return_i32_to(b, tmp_dest, arg, bi_extract(b, addr, 0), - bi_extract(b, addr, 1), opc, sr_count); - } + /* Generate either ATOM or ATOM1 as required */ + if (bi_promote_atom_c1(opc, arg, &opc)) { + bi_atom1_return_i32_to(b, tmp_dest, bi_extract(b, addr, 0), + bi_extract(b, addr, 1), opc, sr_count); + } else { + bi_atom_return_i32_to(b, tmp_dest, arg, bi_extract(b, addr, 0), + bi_extract(b, addr, 1), opc, sr_count); + } - if (bifrost) { - /* Post-process it */ - bi_emit_cached_split_i32(b, tmp_dest, 2); - bi_atom_post_i32_to(b, dst, bi_extract(b, tmp_dest, 0), bi_extract(b, tmp_dest, 1), post_opc); - } + if (bifrost) { + /* Post-process it */ + bi_emit_cached_split_i32(b, tmp_dest, 2); + bi_atom_post_i32_to(b, dst, bi_extract(b, tmp_dest, 0), + bi_extract(b, tmp_dest, 1), post_opc); + } } /* gl_FragCoord.xy = u16_to_f32(R59.xy) + 0.5 @@ -1540,475 +1520,474 @@ bi_emit_atomic_i32_to(bi_builder *b, bi_index dst, static void bi_emit_load_frag_coord(bi_builder *b, nir_intrinsic_instr *instr) { - bi_index src[4] = {}; + bi_index src[4] = {}; - for (unsigned i = 0; i < 2; ++i) { - src[i] = bi_fadd_f32(b, - bi_u16_to_f32(b, bi_half(bi_preload(b, 59), i)), - bi_imm_f32(0.5f)); - } + for (unsigned i = 0; i < 2; ++i) { + src[i] = bi_fadd_f32(b, bi_u16_to_f32(b, bi_half(bi_preload(b, 59), i)), + bi_imm_f32(0.5f)); + } - for (unsigned i = 0; i < 2; ++i) { - src[2 + i] = bi_ld_var_special(b, bi_zero(), - BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER, - BI_UPDATE_CLOBBER, - (i == 0) ? BI_VARYING_NAME_FRAG_Z : - BI_VARYING_NAME_FRAG_W, - BI_VECSIZE_NONE); - } + for (unsigned i = 0; i < 2; ++i) { + src[2 + i] = bi_ld_var_special( + b, bi_zero(), BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER, + BI_UPDATE_CLOBBER, + (i == 0) ? 
BI_VARYING_NAME_FRAG_Z : BI_VARYING_NAME_FRAG_W, + BI_VECSIZE_NONE); + } - bi_make_vec_to(b, bi_dest_index(&instr->dest), src, NULL, 4, 32); + bi_make_vec_to(b, bi_dest_index(&instr->dest), src, NULL, 4, 32); } static void bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr) { - bi_index dest = bi_dest_index(&instr->dest); - nir_alu_type T = nir_intrinsic_dest_type(instr); - enum bi_register_format regfmt = bi_reg_fmt_for_nir(T); - unsigned rt = b->shader->inputs->blend.rt; - unsigned size = nir_dest_bit_size(instr->dest); - unsigned nr = instr->num_components; + bi_index dest = bi_dest_index(&instr->dest); + nir_alu_type T = nir_intrinsic_dest_type(instr); + enum bi_register_format regfmt = bi_reg_fmt_for_nir(T); + unsigned rt = b->shader->inputs->blend.rt; + unsigned size = nir_dest_bit_size(instr->dest); + unsigned nr = instr->num_components; - /* Get the render target */ - if (!b->shader->inputs->is_blend) { - nir_io_semantics sem = nir_intrinsic_io_semantics(instr); - unsigned loc = sem.location; - assert(loc >= FRAG_RESULT_DATA0); - rt = (loc - FRAG_RESULT_DATA0); - } + /* Get the render target */ + if (!b->shader->inputs->is_blend) { + nir_io_semantics sem = nir_intrinsic_io_semantics(instr); + unsigned loc = sem.location; + assert(loc >= FRAG_RESULT_DATA0); + rt = (loc - FRAG_RESULT_DATA0); + } - bi_index desc = b->shader->inputs->is_blend ? - bi_imm_u32(b->shader->inputs->blend.bifrost_blend_desc >> 32) : - b->shader->inputs->bifrost.static_rt_conv ? - bi_imm_u32(b->shader->inputs->bifrost.rt_conv[rt]) : - bi_load_sysval(b, PAN_SYSVAL(RT_CONVERSION, rt | (size << 4)), 1, 0); + bi_index desc = + b->shader->inputs->is_blend + ? bi_imm_u32(b->shader->inputs->blend.bifrost_blend_desc >> 32) + : b->shader->inputs->bifrost.static_rt_conv + ? bi_imm_u32(b->shader->inputs->bifrost.rt_conv[rt]) + : bi_load_sysval(b, PAN_SYSVAL(RT_CONVERSION, rt | (size << 4)), 1, 0); - bi_ld_tile_to(b, dest, bi_pixel_indices(b, rt), bi_coverage(b), desc, - regfmt, nr - 1); - bi_emit_cached_split(b, dest, size * nr); + bi_ld_tile_to(b, dest, bi_pixel_indices(b, rt), bi_coverage(b), desc, regfmt, + nr - 1); + bi_emit_cached_split(b, dest, size * nr); } static void bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr) { - bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ? - bi_dest_index(&instr->dest) : bi_null(); - gl_shader_stage stage = b->shader->stage; + bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest + ? 
bi_dest_index(&instr->dest) + : bi_null(); + gl_shader_stage stage = b->shader->stage; - switch (instr->intrinsic) { - case nir_intrinsic_load_barycentric_pixel: - case nir_intrinsic_load_barycentric_centroid: - case nir_intrinsic_load_barycentric_sample: - case nir_intrinsic_load_barycentric_at_sample: - case nir_intrinsic_load_barycentric_at_offset: - /* handled later via load_vary */ - break; - case nir_intrinsic_load_interpolated_input: - case nir_intrinsic_load_input: - if (b->shader->inputs->is_blend) - bi_emit_load_blend_input(b, instr); - else if (stage == MESA_SHADER_FRAGMENT) - bi_emit_load_vary(b, instr); - else if (stage == MESA_SHADER_VERTEX) - bi_emit_load_attr(b, instr); - else - unreachable("Unsupported shader stage"); - break; + switch (instr->intrinsic) { + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: + /* handled later via load_vary */ + break; + case nir_intrinsic_load_interpolated_input: + case nir_intrinsic_load_input: + if (b->shader->inputs->is_blend) + bi_emit_load_blend_input(b, instr); + else if (stage == MESA_SHADER_FRAGMENT) + bi_emit_load_vary(b, instr); + else if (stage == MESA_SHADER_VERTEX) + bi_emit_load_attr(b, instr); + else + unreachable("Unsupported shader stage"); + break; - case nir_intrinsic_store_output: - if (stage == MESA_SHADER_FRAGMENT) - bi_emit_fragment_out(b, instr); - else if (stage == MESA_SHADER_VERTEX) - bi_emit_store_vary(b, instr); - else - unreachable("Unsupported shader stage"); - break; + case nir_intrinsic_store_output: + if (stage == MESA_SHADER_FRAGMENT) + bi_emit_fragment_out(b, instr); + else if (stage == MESA_SHADER_VERTEX) + bi_emit_store_vary(b, instr); + else + unreachable("Unsupported shader stage"); + break; - case nir_intrinsic_store_combined_output_pan: - assert(stage == MESA_SHADER_FRAGMENT); - bi_emit_fragment_out(b, instr); - break; + case nir_intrinsic_store_combined_output_pan: + assert(stage == MESA_SHADER_FRAGMENT); + bi_emit_fragment_out(b, instr); + break; - case nir_intrinsic_load_ubo: - bi_emit_load_ubo(b, instr); - break; + case nir_intrinsic_load_ubo: + bi_emit_load_ubo(b, instr); + break; - case nir_intrinsic_load_push_constant: - bi_emit_load_push_constant(b, instr); - break; + case nir_intrinsic_load_push_constant: + bi_emit_load_push_constant(b, instr); + break; - case nir_intrinsic_load_global: - case nir_intrinsic_load_global_constant: - bi_emit_load(b, instr, BI_SEG_NONE); - break; + case nir_intrinsic_load_global: + case nir_intrinsic_load_global_constant: + bi_emit_load(b, instr, BI_SEG_NONE); + break; - case nir_intrinsic_store_global: - bi_emit_store(b, instr, BI_SEG_NONE); - break; + case nir_intrinsic_store_global: + bi_emit_store(b, instr, BI_SEG_NONE); + break; - case nir_intrinsic_load_scratch: - bi_emit_load(b, instr, BI_SEG_TL); - break; + case nir_intrinsic_load_scratch: + bi_emit_load(b, instr, BI_SEG_TL); + break; - case nir_intrinsic_store_scratch: - bi_emit_store(b, instr, BI_SEG_TL); - break; + case nir_intrinsic_store_scratch: + bi_emit_store(b, instr, BI_SEG_TL); + break; - case nir_intrinsic_load_shared: - bi_emit_load(b, instr, BI_SEG_WLS); - break; + case nir_intrinsic_load_shared: + bi_emit_load(b, instr, BI_SEG_WLS); + break; - case nir_intrinsic_store_shared: - bi_emit_store(b, instr, BI_SEG_WLS); - break; + case nir_intrinsic_store_shared: + bi_emit_store(b, instr, BI_SEG_WLS); + break; - 
/* Blob doesn't seem to do anything for memory barriers, note +BARRIER - * is illegal in fragment shaders */ - case nir_intrinsic_memory_barrier: - case nir_intrinsic_memory_barrier_buffer: - case nir_intrinsic_memory_barrier_image: - case nir_intrinsic_memory_barrier_shared: - case nir_intrinsic_group_memory_barrier: - break; + /* Blob doesn't seem to do anything for memory barriers, note +BARRIER + * is illegal in fragment shaders */ + case nir_intrinsic_memory_barrier: + case nir_intrinsic_memory_barrier_buffer: + case nir_intrinsic_memory_barrier_image: + case nir_intrinsic_memory_barrier_shared: + case nir_intrinsic_group_memory_barrier: + break; - case nir_intrinsic_control_barrier: - assert(b->shader->stage != MESA_SHADER_FRAGMENT); - bi_barrier(b); - break; + case nir_intrinsic_control_barrier: + assert(b->shader->stage != MESA_SHADER_FRAGMENT); + bi_barrier(b); + break; - case nir_intrinsic_scoped_barrier: - assert(b->shader->stage != MESA_SHADER_FRAGMENT); - assert(nir_intrinsic_memory_scope(instr) > NIR_SCOPE_SUBGROUP && - "todo: subgroup barriers (different divergence rules)"); + case nir_intrinsic_scoped_barrier: + assert(b->shader->stage != MESA_SHADER_FRAGMENT); + assert(nir_intrinsic_memory_scope(instr) > NIR_SCOPE_SUBGROUP && + "todo: subgroup barriers (different divergence rules)"); - bi_barrier(b); - break; + bi_barrier(b); + break; - case nir_intrinsic_shared_atomic_add: - case nir_intrinsic_shared_atomic_imin: - case nir_intrinsic_shared_atomic_umin: - case nir_intrinsic_shared_atomic_imax: - case nir_intrinsic_shared_atomic_umax: - case nir_intrinsic_shared_atomic_and: - case nir_intrinsic_shared_atomic_or: - case nir_intrinsic_shared_atomic_xor: { - assert(nir_src_bit_size(instr->src[1]) == 32); + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_shared_atomic_xor: { + assert(nir_src_bit_size(instr->src[1]) == 32); - bi_index addr = bi_src_index(&instr->src[0]); - bi_index addr_hi; + bi_index addr = bi_src_index(&instr->src[0]); + bi_index addr_hi; - if (b->shader->arch >= 9) { - bi_handle_segment(b, &addr, &addr_hi, BI_SEG_WLS, NULL); - addr = bi_collect_v2i32(b, addr, addr_hi); - } else { - addr = bi_seg_add_i64(b, addr, bi_zero(), false, BI_SEG_WLS); - bi_emit_cached_split(b, addr, 64); - } + if (b->shader->arch >= 9) { + bi_handle_segment(b, &addr, &addr_hi, BI_SEG_WLS, NULL); + addr = bi_collect_v2i32(b, addr, addr_hi); + } else { + addr = bi_seg_add_i64(b, addr, bi_zero(), false, BI_SEG_WLS); + bi_emit_cached_split(b, addr, 64); + } - bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]), - instr->intrinsic); - bi_split_dest(b, instr->dest); - break; - } + bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]), + instr->intrinsic); + bi_split_dest(b, instr->dest); + break; + } - case nir_intrinsic_image_atomic_add: - case nir_intrinsic_image_atomic_imin: - case nir_intrinsic_image_atomic_umin: - case nir_intrinsic_image_atomic_imax: - case nir_intrinsic_image_atomic_umax: - case nir_intrinsic_image_atomic_and: - case nir_intrinsic_image_atomic_or: - case nir_intrinsic_image_atomic_xor: - assert(nir_src_bit_size(instr->src[3]) == 32); + case nir_intrinsic_image_atomic_add: + case nir_intrinsic_image_atomic_imin: + case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_image_atomic_imax: + 
case nir_intrinsic_image_atomic_umax: + case nir_intrinsic_image_atomic_and: + case nir_intrinsic_image_atomic_or: + case nir_intrinsic_image_atomic_xor: + assert(nir_src_bit_size(instr->src[3]) == 32); - bi_emit_atomic_i32_to(b, dst, - bi_emit_lea_image(b, instr), - bi_src_index(&instr->src[3]), - instr->intrinsic); - bi_split_dest(b, instr->dest); - break; + bi_emit_atomic_i32_to(b, dst, bi_emit_lea_image(b, instr), + bi_src_index(&instr->src[3]), instr->intrinsic); + bi_split_dest(b, instr->dest); + break; - case nir_intrinsic_global_atomic_add: - case nir_intrinsic_global_atomic_imin: - case nir_intrinsic_global_atomic_umin: - case nir_intrinsic_global_atomic_imax: - case nir_intrinsic_global_atomic_umax: - case nir_intrinsic_global_atomic_and: - case nir_intrinsic_global_atomic_or: - case nir_intrinsic_global_atomic_xor: - assert(nir_src_bit_size(instr->src[1]) == 32); + case nir_intrinsic_global_atomic_add: + case nir_intrinsic_global_atomic_imin: + case nir_intrinsic_global_atomic_umin: + case nir_intrinsic_global_atomic_imax: + case nir_intrinsic_global_atomic_umax: + case nir_intrinsic_global_atomic_and: + case nir_intrinsic_global_atomic_or: + case nir_intrinsic_global_atomic_xor: + assert(nir_src_bit_size(instr->src[1]) == 32); - bi_emit_atomic_i32_to(b, dst, - bi_src_index(&instr->src[0]), - bi_src_index(&instr->src[1]), - instr->intrinsic); + bi_emit_atomic_i32_to(b, dst, bi_src_index(&instr->src[0]), + bi_src_index(&instr->src[1]), instr->intrinsic); - bi_split_dest(b, instr->dest); - break; + bi_split_dest(b, instr->dest); + break; - case nir_intrinsic_image_load: - bi_emit_image_load(b, instr); - break; + case nir_intrinsic_image_load: + bi_emit_image_load(b, instr); + break; - case nir_intrinsic_image_store: - bi_emit_image_store(b, instr); - break; + case nir_intrinsic_image_store: + bi_emit_image_store(b, instr); + break; - case nir_intrinsic_global_atomic_exchange: - bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), - &instr->src[1], BI_SEG_NONE); - bi_split_dest(b, instr->dest); - break; + case nir_intrinsic_global_atomic_exchange: + bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1], + BI_SEG_NONE); + bi_split_dest(b, instr->dest); + break; - case nir_intrinsic_image_atomic_exchange: - bi_emit_axchg_to(b, dst, bi_emit_lea_image(b, instr), - &instr->src[3], BI_SEG_NONE); - bi_split_dest(b, instr->dest); - break; + case nir_intrinsic_image_atomic_exchange: + bi_emit_axchg_to(b, dst, bi_emit_lea_image(b, instr), &instr->src[3], + BI_SEG_NONE); + bi_split_dest(b, instr->dest); + break; - case nir_intrinsic_shared_atomic_exchange: - bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), - &instr->src[1], BI_SEG_WLS); - bi_split_dest(b, instr->dest); - break; + case nir_intrinsic_shared_atomic_exchange: + bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1], + BI_SEG_WLS); + bi_split_dest(b, instr->dest); + break; - case nir_intrinsic_global_atomic_comp_swap: - bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), - &instr->src[1], &instr->src[2], BI_SEG_NONE); - bi_split_dest(b, instr->dest); - break; + case nir_intrinsic_global_atomic_comp_swap: + bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1], + &instr->src[2], BI_SEG_NONE); + bi_split_dest(b, instr->dest); + break; - case nir_intrinsic_image_atomic_comp_swap: - bi_emit_acmpxchg_to(b, dst, bi_emit_lea_image(b, instr), - &instr->src[3], &instr->src[4], BI_SEG_NONE); - bi_split_dest(b, instr->dest); - break; + case nir_intrinsic_image_atomic_comp_swap: 
+ bi_emit_acmpxchg_to(b, dst, bi_emit_lea_image(b, instr), &instr->src[3], + &instr->src[4], BI_SEG_NONE); + bi_split_dest(b, instr->dest); + break; - case nir_intrinsic_shared_atomic_comp_swap: - bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), - &instr->src[1], &instr->src[2], BI_SEG_WLS); - bi_split_dest(b, instr->dest); - break; + case nir_intrinsic_shared_atomic_comp_swap: + bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1], + &instr->src[2], BI_SEG_WLS); + bi_split_dest(b, instr->dest); + break; - case nir_intrinsic_load_frag_coord: - bi_emit_load_frag_coord(b, instr); - break; + case nir_intrinsic_load_frag_coord: + bi_emit_load_frag_coord(b, instr); + break; - case nir_intrinsic_load_output: - bi_emit_ld_tile(b, instr); - break; + case nir_intrinsic_load_output: + bi_emit_ld_tile(b, instr); + break; - case nir_intrinsic_discard_if: - bi_discard_b32(b, bi_src_index(&instr->src[0])); - break; + case nir_intrinsic_discard_if: + bi_discard_b32(b, bi_src_index(&instr->src[0])); + break; - case nir_intrinsic_discard: - bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ); - break; + case nir_intrinsic_discard: + bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ); + break; - case nir_intrinsic_load_ssbo_address: - case nir_intrinsic_load_xfb_address: - bi_load_sysval_nir(b, instr, 2, 0); - break; + case nir_intrinsic_load_ssbo_address: + case nir_intrinsic_load_xfb_address: + bi_load_sysval_nir(b, instr, 2, 0); + break; - case nir_intrinsic_load_work_dim: - case nir_intrinsic_load_num_vertices: - case nir_intrinsic_load_first_vertex: - case nir_intrinsic_load_draw_id: - bi_load_sysval_nir(b, instr, 1, 0); - break; + case nir_intrinsic_load_work_dim: + case nir_intrinsic_load_num_vertices: + case nir_intrinsic_load_first_vertex: + case nir_intrinsic_load_draw_id: + bi_load_sysval_nir(b, instr, 1, 0); + break; - case nir_intrinsic_load_base_vertex: - bi_load_sysval_nir(b, instr, 1, 4); - break; + case nir_intrinsic_load_base_vertex: + bi_load_sysval_nir(b, instr, 1, 4); + break; - case nir_intrinsic_load_base_instance: - case nir_intrinsic_get_ssbo_size: - bi_load_sysval_nir(b, instr, 1, 8); - break; + case nir_intrinsic_load_base_instance: + case nir_intrinsic_get_ssbo_size: + bi_load_sysval_nir(b, instr, 1, 8); + break; - case nir_intrinsic_load_viewport_scale: - case nir_intrinsic_load_viewport_offset: - case nir_intrinsic_load_num_workgroups: - case nir_intrinsic_load_workgroup_size: - bi_load_sysval_nir(b, instr, 3, 0); - break; + case nir_intrinsic_load_viewport_scale: + case nir_intrinsic_load_viewport_offset: + case nir_intrinsic_load_num_workgroups: + case nir_intrinsic_load_workgroup_size: + bi_load_sysval_nir(b, instr, 3, 0); + break; - case nir_intrinsic_image_size: - bi_load_sysval_nir(b, instr, - nir_dest_num_components(instr->dest), 0); - break; + case nir_intrinsic_image_size: + bi_load_sysval_nir(b, instr, nir_dest_num_components(instr->dest), 0); + break; - case nir_intrinsic_load_blend_const_color_rgba: - bi_load_sysval_nir(b, instr, - nir_dest_num_components(instr->dest), 0); - break; + case nir_intrinsic_load_blend_const_color_rgba: + bi_load_sysval_nir(b, instr, nir_dest_num_components(instr->dest), 0); + break; - case nir_intrinsic_load_sample_positions_pan: - bi_collect_v2i32_to(b, dst, - bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false), - bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true)); - break; + case nir_intrinsic_load_sample_positions_pan: + bi_collect_v2i32_to(b, dst, bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false), + bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, 
true)); + break; - case nir_intrinsic_load_sample_mask_in: - /* r61[0:15] contains the coverage bitmap */ - bi_u16_to_u32_to(b, dst, bi_half(bi_preload(b, 61), false)); - break; + case nir_intrinsic_load_sample_mask_in: + /* r61[0:15] contains the coverage bitmap */ + bi_u16_to_u32_to(b, dst, bi_half(bi_preload(b, 61), false)); + break; - case nir_intrinsic_load_sample_id: - bi_load_sample_id_to(b, dst); - break; + case nir_intrinsic_load_sample_id: + bi_load_sample_id_to(b, dst); + break; - case nir_intrinsic_load_front_face: - /* r58 == 0 means primitive is front facing */ - bi_icmp_i32_to(b, dst, bi_preload(b, 58), bi_zero(), BI_CMPF_EQ, - BI_RESULT_TYPE_M1); - break; + case nir_intrinsic_load_front_face: + /* r58 == 0 means primitive is front facing */ + bi_icmp_i32_to(b, dst, bi_preload(b, 58), bi_zero(), BI_CMPF_EQ, + BI_RESULT_TYPE_M1); + break; - case nir_intrinsic_load_point_coord: - bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32, - BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER, - BI_VARYING_NAME_POINT, BI_VECSIZE_V2); - bi_emit_cached_split_i32(b, dst, 2); - break; + case nir_intrinsic_load_point_coord: + bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32, + BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER, + BI_VARYING_NAME_POINT, BI_VECSIZE_V2); + bi_emit_cached_split_i32(b, dst, 2); + break; - /* It appears vertex_id is zero-based with Bifrost geometry flows, but - * not with Valhall's memory-allocation IDVS geometry flow. Ostensibly - * we support the legacy geometry flow even on Valhall, so - * vertex_id_zero_based isn't a machine property for us. Don't set it, - * and lower here if needed. - */ - case nir_intrinsic_load_vertex_id: - if (b->shader->malloc_idvs) { - bi_mov_i32_to(b, dst, bi_vertex_id(b)); - } else { - bi_index first = bi_load_sysval(b, - PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS, - 1, 0); + /* It appears vertex_id is zero-based with Bifrost geometry flows, but + * not with Valhall's memory-allocation IDVS geometry flow. Ostensibly + * we support the legacy geometry flow even on Valhall, so + * vertex_id_zero_based isn't a machine property for us. Don't set it, + * and lower here if needed. 
+ */ + case nir_intrinsic_load_vertex_id: + if (b->shader->malloc_idvs) { + bi_mov_i32_to(b, dst, bi_vertex_id(b)); + } else { + bi_index first = + bi_load_sysval(b, PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS, 1, 0); - bi_iadd_u32_to(b, dst, bi_vertex_id(b), first, false); - } + bi_iadd_u32_to(b, dst, bi_vertex_id(b), first, false); + } - break; + break; - /* We only use in our transform feedback lowering */ - case nir_intrinsic_load_vertex_id_zero_base: - assert(b->shader->nir->info.has_transform_feedback_varyings); - bi_mov_i32_to(b, dst, bi_vertex_id(b)); - break; + /* We only use in our transform feedback lowering */ + case nir_intrinsic_load_vertex_id_zero_base: + assert(b->shader->nir->info.has_transform_feedback_varyings); + bi_mov_i32_to(b, dst, bi_vertex_id(b)); + break; - case nir_intrinsic_load_instance_id: - bi_mov_i32_to(b, dst, bi_instance_id(b)); - break; + case nir_intrinsic_load_instance_id: + bi_mov_i32_to(b, dst, bi_instance_id(b)); + break; - case nir_intrinsic_load_subgroup_invocation: - bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false)); - break; + case nir_intrinsic_load_subgroup_invocation: + bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false)); + break; - case nir_intrinsic_load_local_invocation_id: - bi_collect_v3i32_to(b, dst, - bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 0)), - bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 1)), - bi_u16_to_u32(b, bi_half(bi_preload(b, 56), 0))); - break; + case nir_intrinsic_load_local_invocation_id: + bi_collect_v3i32_to(b, dst, + bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 0)), + bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 1)), + bi_u16_to_u32(b, bi_half(bi_preload(b, 56), 0))); + break; - case nir_intrinsic_load_workgroup_id: - bi_collect_v3i32_to(b, dst, bi_preload(b, 57), bi_preload(b, 58), - bi_preload(b, 59)); - break; + case nir_intrinsic_load_workgroup_id: + bi_collect_v3i32_to(b, dst, bi_preload(b, 57), bi_preload(b, 58), + bi_preload(b, 59)); + break; - case nir_intrinsic_load_global_invocation_id: - case nir_intrinsic_load_global_invocation_id_zero_base: - bi_collect_v3i32_to(b, dst, bi_preload(b, 60), bi_preload(b, 61), - bi_preload(b, 62)); - break; + case nir_intrinsic_load_global_invocation_id: + case nir_intrinsic_load_global_invocation_id_zero_base: + bi_collect_v3i32_to(b, dst, bi_preload(b, 60), bi_preload(b, 61), + bi_preload(b, 62)); + break; - case nir_intrinsic_shader_clock: - bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER); - bi_split_dest(b, instr->dest); - break; + case nir_intrinsic_shader_clock: + bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER); + bi_split_dest(b, instr->dest); + break; - default: - fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name); - assert(0); - } + default: + fprintf(stderr, "Unhandled intrinsic %s\n", + nir_intrinsic_infos[instr->intrinsic].name); + assert(0); + } } static void bi_emit_load_const(bi_builder *b, nir_load_const_instr *instr) { - /* Make sure we've been lowered */ - assert(instr->def.num_components <= (32 / instr->def.bit_size)); + /* Make sure we've been lowered */ + assert(instr->def.num_components <= (32 / instr->def.bit_size)); - /* Accumulate all the channels of the constant, as if we did an - * implicit SEL over them */ - uint32_t acc = 0; + /* Accumulate all the channels of the constant, as if we did an + * implicit SEL over them */ + uint32_t acc = 0; - for (unsigned i = 0; i < instr->def.num_components; ++i) { - unsigned v = nir_const_value_as_uint(instr->value[i], instr->def.bit_size); - acc |= (v << (i * 
instr->def.bit_size)); - } + for (unsigned i = 0; i < instr->def.num_components; ++i) { + unsigned v = + nir_const_value_as_uint(instr->value[i], instr->def.bit_size); + acc |= (v << (i * instr->def.bit_size)); + } - bi_mov_i32_to(b, bi_get_index(instr->def.index), bi_imm_u32(acc)); + bi_mov_i32_to(b, bi_get_index(instr->def.index), bi_imm_u32(acc)); } static bi_index bi_alu_src_index(bi_builder *b, nir_alu_src src, unsigned comps) { - /* we don't lower modifiers until the backend */ - assert(!(src.negate || src.abs)); + /* we don't lower modifiers until the backend */ + assert(!(src.negate || src.abs)); - unsigned bitsize = nir_src_bit_size(src.src); + unsigned bitsize = nir_src_bit_size(src.src); - /* the bi_index carries the 32-bit (word) offset separate from the - * subword swizzle, first handle the offset */ + /* the bi_index carries the 32-bit (word) offset separate from the + * subword swizzle, first handle the offset */ - unsigned offset = 0; + unsigned offset = 0; - assert(bitsize == 8 || bitsize == 16 || bitsize == 32); - unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2; + assert(bitsize == 8 || bitsize == 16 || bitsize == 32); + unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2; - for (unsigned i = 0; i < comps; ++i) { - unsigned new_offset = (src.swizzle[i] >> subword_shift); + for (unsigned i = 0; i < comps; ++i) { + unsigned new_offset = (src.swizzle[i] >> subword_shift); - if (i > 0) - assert(offset == new_offset && "wrong vectorization"); + if (i > 0) + assert(offset == new_offset && "wrong vectorization"); - offset = new_offset; - } + offset = new_offset; + } - bi_index idx = bi_extract(b, bi_src_index(&src.src), offset); + bi_index idx = bi_extract(b, bi_src_index(&src.src), offset); - /* Compose the subword swizzle with existing (identity) swizzle */ - assert(idx.swizzle == BI_SWIZZLE_H01); + /* Compose the subword swizzle with existing (identity) swizzle */ + assert(idx.swizzle == BI_SWIZZLE_H01); - /* Bigger vectors should have been lowered */ - assert(comps <= (1 << subword_shift)); + /* Bigger vectors should have been lowered */ + assert(comps <= (1 << subword_shift)); - if (bitsize == 16) { - unsigned c0 = src.swizzle[0] & 1; - unsigned c1 = (comps > 1) ? src.swizzle[1] & 1 : c0; - idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1); - } else if (bitsize == 8) { - /* 8-bit vectors not yet supported */ - assert(comps == 1 && "8-bit vectors not supported"); - idx.swizzle = BI_SWIZZLE_B0000 + (src.swizzle[0] & 3); - } + if (bitsize == 16) { + unsigned c0 = src.swizzle[0] & 1; + unsigned c1 = (comps > 1) ? 
src.swizzle[1] & 1 : c0; + idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1); + } else if (bitsize == 8) { + /* 8-bit vectors not yet supported */ + assert(comps == 1 && "8-bit vectors not supported"); + idx.swizzle = BI_SWIZZLE_B0000 + (src.swizzle[0] & 3); + } - return idx; + return idx; } static enum bi_round bi_nir_round(nir_op op) { - switch (op) { - case nir_op_fround_even: return BI_ROUND_NONE; - case nir_op_ftrunc: return BI_ROUND_RTZ; - case nir_op_fceil: return BI_ROUND_RTP; - case nir_op_ffloor: return BI_ROUND_RTN; - default: unreachable("invalid nir round op"); - } + switch (op) { + case nir_op_fround_even: + return BI_ROUND_NONE; + case nir_op_ftrunc: + return BI_ROUND_RTZ; + case nir_op_fceil: + return BI_ROUND_RTP; + case nir_op_ffloor: + return BI_ROUND_RTN; + default: + unreachable("invalid nir round op"); + } } /* Convenience for lowered transcendentals */ @@ -2016,7 +1995,7 @@ bi_nir_round(nir_op op) static bi_index bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1) { - return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f)); + return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f)); } /* Approximate with FRCP_APPROX.f32 and apply a single iteration of @@ -2025,24 +2004,24 @@ bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1) static void bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0) { - bi_index x1 = bi_frcp_approx_f32(b, s0); - bi_index m = bi_frexpm_f32(b, s0, false, false); - bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, false); - bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0), - bi_zero(), BI_SPECIAL_N); - bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e, BI_SPECIAL_NONE); + bi_index x1 = bi_frcp_approx_f32(b, s0); + bi_index m = bi_frexpm_f32(b, s0, false, false); + bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, false); + bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0), bi_zero(), + BI_SPECIAL_N); + bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e, BI_SPECIAL_NONE); } static void bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0) { - bi_index x1 = bi_frsq_approx_f32(b, s0); - bi_index m = bi_frexpm_f32(b, s0, false, true); - bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, true); - bi_index t1 = bi_fmul_f32(b, x1, x1); - bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0), - bi_imm_u32(-1), BI_SPECIAL_N); - bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e, BI_SPECIAL_N); + bi_index x1 = bi_frsq_approx_f32(b, s0); + bi_index m = bi_frexpm_f32(b, s0, false, true); + bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, true); + bi_index t1 = bi_fmul_f32(b, x1, x1); + bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0), + bi_imm_u32(-1), BI_SPECIAL_N); + bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e, BI_SPECIAL_N); } /* More complex transcendentals, see @@ -2052,116 +2031,116 @@ bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0) static void bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0) { - bi_index t1 = bi_temp(b->shader); - bi_instr *t1_instr = bi_fadd_f32_to(b, t1, s0, bi_imm_u32(0x49400000)); - t1_instr->clamp = BI_CLAMP_CLAMP_0_INF; + bi_index t1 = bi_temp(b->shader); + bi_instr *t1_instr = bi_fadd_f32_to(b, t1, s0, bi_imm_u32(0x49400000)); + t1_instr->clamp = BI_CLAMP_CLAMP_0_INF; - bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000)); + bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000)); - bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader), s0, bi_neg(t2)); - a2->clamp = BI_CLAMP_CLAMP_M1_1; + bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader), s0, bi_neg(t2)); + a2->clamp = 
BI_CLAMP_CLAMP_M1_1; - bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE); - bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false); - bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4)); - bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635), - bi_imm_u32(0x3e75fffa)); - bi_index p2 = bi_fma_f32(b, p1, a2->dest[0], bi_imm_u32(0x3f317218)); - bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2); - bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader), - p3, a1t, a1t, a1i, BI_SPECIAL_NONE); - x->clamp = BI_CLAMP_CLAMP_0_INF; + bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE); + bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false); + bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4)); + bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635), + bi_imm_u32(0x3e75fffa)); + bi_index p2 = bi_fma_f32(b, p1, a2->dest[0], bi_imm_u32(0x3f317218)); + bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2); + bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader), p3, a1t, a1t, a1i, + BI_SPECIAL_NONE); + x->clamp = BI_CLAMP_CLAMP_0_INF; - bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0); - max->sem = BI_SEM_NAN_PROPAGATE; + bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0); + max->sem = BI_SEM_NAN_PROPAGATE; } static void bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base) { - /* Scale by base, Multiply by 2*24 and convert to integer to get a 8:24 - * fixed-point input */ - bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(), - bi_imm_u32(24), BI_SPECIAL_NONE); - bi_instr *fixed_pt = bi_f32_to_s32_to(b, bi_temp(b->shader), scale); - fixed_pt->round = BI_ROUND_NONE; // XXX + /* Scale by base, Multiply by 2*24 and convert to integer to get a 8:24 + * fixed-point input */ + bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(), + bi_imm_u32(24), BI_SPECIAL_NONE); + bi_instr *fixed_pt = bi_f32_to_s32_to(b, bi_temp(b->shader), scale); + fixed_pt->round = BI_ROUND_NONE; // XXX - /* Compute the result for the fixed-point input, but pass along - * the floating-point scale for correct NaN propagation */ - bi_fexp_f32_to(b, dst, fixed_pt->dest[0], scale); + /* Compute the result for the fixed-point input, but pass along + * the floating-point scale for correct NaN propagation */ + bi_fexp_f32_to(b, dst, fixed_pt->dest[0], scale); } static void bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0) { - /* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */ - bi_index a1 = bi_frexpm_f32(b, s0, true, false); - bi_index ei = bi_frexpe_f32(b, s0, true, false); - bi_index ef = bi_s32_to_f32(b, ei); + /* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */ + bi_index a1 = bi_frexpm_f32(b, s0, true, false); + bi_index ei = bi_frexpe_f32(b, s0, true, false); + bi_index ef = bi_s32_to_f32(b, ei); - /* xt estimates -log(r1), a coarse approximation of log(a1) */ - bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE); - bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE); + /* xt estimates -log(r1), a coarse approximation of log(a1) */ + bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE); + bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE); - /* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) - - * log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1), - * and then log(s0) = x1 + x2 */ - bi_index x1 = bi_fadd_f32(b, ef, xt); + /* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) - + * log(r1), so let x1 = e - log(r1) ~= e + xt 
and x2 = log(a1 * r1), + * and then log(s0) = x1 + x2 */ + bi_index x1 = bi_fadd_f32(b, ef, xt); - /* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by - * polynomial approximation around 1. The series is expressed around - * 1, so set y = (a1 * r1) - 1.0 */ - bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0)); + /* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by + * polynomial approximation around 1. The series is expressed around + * 1, so set y = (a1 * r1) - 1.0 */ + bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0)); - /* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate - * log_e(1 + y) by the Taylor series (lower precision than the blob): - * y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */ - bi_index loge = bi_fmul_f32(b, y, - bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0))); + /* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate + * log_e(1 + y) by the Taylor series (lower precision than the blob): + * y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */ + bi_index loge = + bi_fmul_f32(b, y, bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0))); - bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0))); + bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0))); - /* log(s0) = x1 + x2 */ - bi_fadd_f32_to(b, dst, x1, x2); + /* log(s0) = x1 + x2 */ + bi_fadd_f32_to(b, dst, x1, x2); } static void bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0) { - bi_index frexp = bi_frexpe_f32(b, s0, true, false); - bi_index frexpi = bi_s32_to_f32(b, frexp); - bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0); - bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi); + bi_index frexp = bi_frexpe_f32(b, s0, true, false); + bi_index frexpi = bi_s32_to_f32(b, frexp); + bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0); + bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi); } static void bi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp) { - bi_index log2_base = bi_null(); + bi_index log2_base = bi_null(); - if (base.type == BI_INDEX_CONSTANT) { - log2_base = bi_imm_f32(log2f(uif(base.value))); - } else { - log2_base = bi_temp(b->shader); - bi_lower_flog2_32(b, log2_base, base); - } + if (base.type == BI_INDEX_CONSTANT) { + log2_base = bi_imm_f32(log2f(uif(base.value))); + } else { + log2_base = bi_temp(b->shader); + bi_lower_flog2_32(b, log2_base, base); + } - return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base)); + return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base)); } static void bi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp) { - bi_index log2_base = bi_null(); + bi_index log2_base = bi_null(); - if (base.type == BI_INDEX_CONSTANT) { - log2_base = bi_imm_f32(log2f(uif(base.value))); - } else { - log2_base = bi_temp(b->shader); - bi_flog2_32(b, log2_base, base); - } + if (base.type == BI_INDEX_CONSTANT) { + log2_base = bi_imm_f32(log2f(uif(base.value))); + } else { + log2_base = bi_temp(b->shader); + bi_flog2_32(b, log2_base, base); + } - return bi_fexp_32(b, dst, exp, log2_base); + return bi_fexp_32(b, dst, exp, log2_base); } /* Bifrost has extremely coarse tables for approximating sin/cos, accessible as @@ -2181,34 +2160,32 @@ bi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp) static void bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos) { - /* bottom 6-bits of result times pi/32 approximately s0 mod 2pi */ - bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS); + /* bottom 6-bits of 
result times pi/32 approximately s0 mod 2pi */ + bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS); - /* Approximate domain error (small) */ - bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS)), - MPI_OVER_TWO, s0); + /* Approximate domain error (small) */ + bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS)), + MPI_OVER_TWO, s0); - /* Lookup sin(x), cos(x) */ - bi_index sinx = bi_fsin_table_u6(b, x_u6, false); - bi_index cosx = bi_fcos_table_u6(b, x_u6, false); + /* Lookup sin(x), cos(x) */ + bi_index sinx = bi_fsin_table_u6(b, x_u6, false); + bi_index cosx = bi_fcos_table_u6(b, x_u6, false); - /* e^2 / 2 */ - bi_index e2_over_2 = bi_fma_rscale_f32(b, e, e, bi_negzero(), - bi_imm_u32(-1), BI_SPECIAL_NONE); + /* e^2 / 2 */ + bi_index e2_over_2 = + bi_fma_rscale_f32(b, e, e, bi_negzero(), bi_imm_u32(-1), BI_SPECIAL_NONE); - /* (-e^2)/2 f''(x) */ - bi_index quadratic = bi_fma_f32(b, bi_neg(e2_over_2), - cos ? cosx : sinx, - bi_negzero()); + /* (-e^2)/2 f''(x) */ + bi_index quadratic = + bi_fma_f32(b, bi_neg(e2_over_2), cos ? cosx : sinx, bi_negzero()); - /* e f'(x) - (e^2/2) f''(x) */ - bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e, - cos ? bi_neg(sinx) : cosx, - quadratic); - I->clamp = BI_CLAMP_CLAMP_M1_1; + /* e f'(x) - (e^2/2) f''(x) */ + bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e, + cos ? bi_neg(sinx) : cosx, quadratic); + I->clamp = BI_CLAMP_CLAMP_M1_1; - /* f(x) + e f'(x) - (e^2/2) f''(x) */ - bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx); + /* f(x) + e f'(x) - (e^2/2) f''(x) */ + bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx); } /* @@ -2219,954 +2196,961 @@ bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos) static bi_index bi_clper_xor(bi_builder *b, bi_index s0, bi_index s1) { - if (!(b->shader->quirks & BIFROST_LIMITED_CLPER)) { - return bi_clper_i32(b, s0, s1, - BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_XOR, - BI_SUBGROUP_SUBGROUP4); - } + if (!(b->shader->quirks & BIFROST_LIMITED_CLPER)) { + return bi_clper_i32(b, s0, s1, BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_XOR, + BI_SUBGROUP_SUBGROUP4); + } - bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false); - bi_index lane = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0)); - return bi_clper_old_i32(b, s0, lane); + bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false); + bi_index lane = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0)); + return bi_clper_old_i32(b, s0, lane); } static enum bi_cmpf bi_translate_cmpf(nir_op op) { - switch (op) { - case nir_op_ieq8: - case nir_op_ieq16: - case nir_op_ieq32: - case nir_op_feq16: - case nir_op_feq32: - return BI_CMPF_EQ; + switch (op) { + case nir_op_ieq8: + case nir_op_ieq16: + case nir_op_ieq32: + case nir_op_feq16: + case nir_op_feq32: + return BI_CMPF_EQ; - case nir_op_ine8: - case nir_op_ine16: - case nir_op_ine32: - case nir_op_fneu16: - case nir_op_fneu32: - return BI_CMPF_NE; + case nir_op_ine8: + case nir_op_ine16: + case nir_op_ine32: + case nir_op_fneu16: + case nir_op_fneu32: + return BI_CMPF_NE; - case nir_op_ilt8: - case nir_op_ilt16: - case nir_op_ilt32: - case nir_op_flt16: - case nir_op_flt32: - case nir_op_ult8: - case nir_op_ult16: - case nir_op_ult32: - return BI_CMPF_LT; + case nir_op_ilt8: + case nir_op_ilt16: + case nir_op_ilt32: + case nir_op_flt16: + case nir_op_flt32: + case nir_op_ult8: + case nir_op_ult16: + case nir_op_ult32: + return BI_CMPF_LT; - case nir_op_ige8: - case nir_op_ige16: - case nir_op_ige32: - case nir_op_fge16: - case nir_op_fge32: - case nir_op_uge8: - case 
nir_op_uge16: - case nir_op_uge32: - return BI_CMPF_GE; + case nir_op_ige8: + case nir_op_ige16: + case nir_op_ige32: + case nir_op_fge16: + case nir_op_fge32: + case nir_op_uge8: + case nir_op_uge16: + case nir_op_uge32: + return BI_CMPF_GE; - default: - unreachable("invalid comparison"); - } + default: + unreachable("invalid comparison"); + } } static bool bi_nir_is_replicated(nir_alu_src *src) { - for (unsigned i = 1; i < nir_src_num_components(src->src); ++i) { - if (src->swizzle[0] == src->swizzle[i]) - return false; - } + for (unsigned i = 1; i < nir_src_num_components(src->src); ++i) { + if (src->swizzle[0] == src->swizzle[i]) + return false; + } - return true; + return true; } static void bi_emit_alu(bi_builder *b, nir_alu_instr *instr) { - bi_index dst = bi_dest_index(&instr->dest.dest); - unsigned srcs = nir_op_infos[instr->op].num_inputs; - unsigned sz = nir_dest_bit_size(instr->dest.dest); - unsigned comps = nir_dest_num_components(instr->dest.dest); - unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0; - - /* Indicate scalarness */ - if (sz == 16 && comps == 1) - dst.swizzle = BI_SWIZZLE_H00; - - /* First, match against the various moves in NIR. These are - * special-cased because they can operate on vectors even after - * lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the - * instruction is no "bigger" than SIMD-within-a-register. These moves - * are the exceptions that need to handle swizzles specially. */ - - switch (instr->op) { - case nir_op_vec2: - case nir_op_vec3: - case nir_op_vec4: - case nir_op_vec8: - case nir_op_vec16: { - bi_index unoffset_srcs[16] = { bi_null() }; - unsigned channels[16] = { 0 }; - - for (unsigned i = 0; i < srcs; ++i) { - unoffset_srcs[i] = bi_src_index(&instr->src[i].src); - channels[i] = instr->src[i].swizzle[0]; - } - - bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz); - return; - } - - case nir_op_unpack_32_2x16: { - /* Should have been scalarized */ - assert(comps == 2 && sz == 16); - - bi_index vec = bi_src_index(&instr->src[0].src); - unsigned chan = instr->src[0].swizzle[0]; - - bi_mov_i32_to(b, dst, bi_extract(b, vec, chan)); - return; - } - - case nir_op_unpack_64_2x32_split_x: - { - unsigned chan = (instr->src[0].swizzle[0] * 2) + 0; - bi_mov_i32_to(b, dst, bi_extract(b, bi_src_index(&instr->src[0].src), chan)); - return; - } - - case nir_op_unpack_64_2x32_split_y: - { - unsigned chan = (instr->src[0].swizzle[0] * 2) + 1; - bi_mov_i32_to(b, dst, bi_extract(b, bi_src_index(&instr->src[0].src), chan)); - return; - } - - case nir_op_pack_64_2x32_split: - bi_collect_v2i32_to(b, dst, - bi_extract(b, bi_src_index(&instr->src[0].src), instr->src[0].swizzle[0]), - bi_extract(b, bi_src_index(&instr->src[1].src), instr->src[1].swizzle[0])); - return; - - case nir_op_pack_64_2x32: - bi_collect_v2i32_to(b, dst, - bi_extract(b, bi_src_index(&instr->src[0].src), 0), - bi_extract(b, bi_src_index(&instr->src[0].src), 1)); - return; - - case nir_op_pack_uvec2_to_uint: { - bi_index src = bi_src_index(&instr->src[0].src); - - assert(sz == 32 && src_sz == 32); - bi_mkvec_v2i16_to(b, dst, bi_half(bi_extract(b, src, 0), false), - bi_half(bi_extract(b, src, 1), false)); - return; - } - - case nir_op_pack_uvec4_to_uint: { - bi_index src = bi_src_index(&instr->src[0].src); - - assert(sz == 32 && src_sz == 32); - bi_mkvec_v4i8_to(b, dst, bi_byte(bi_extract(b, src, 0), 0), - bi_byte(bi_extract(b, src, 1), 0), - bi_byte(bi_extract(b, src, 2), 0), - bi_byte(bi_extract(b, src, 3), 0)); - return; - } - - case nir_op_mov: { 
- bi_index idx = bi_src_index(&instr->src[0].src); - bi_index unoffset_srcs[4] = { idx, idx, idx, idx }; - - unsigned channels[4] = { - comps > 0 ? instr->src[0].swizzle[0] : 0, - comps > 1 ? instr->src[0].swizzle[1] : 0, - comps > 2 ? instr->src[0].swizzle[2] : 0, - comps > 3 ? instr->src[0].swizzle[3] : 0, - }; - - bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, src_sz); - return; - } - - case nir_op_pack_32_2x16: { - assert(comps == 1); - - bi_index idx = bi_src_index(&instr->src[0].src); - bi_index unoffset_srcs[4] = { idx, idx, idx, idx }; - - unsigned channels[2] = { - instr->src[0].swizzle[0], - instr->src[0].swizzle[1] - }; - - bi_make_vec_to(b, dst, unoffset_srcs, channels, 2, 16); - return; - } - - case nir_op_f2f16: - case nir_op_f2f16_rtz: - case nir_op_f2f16_rtne: { - assert(src_sz == 32); - bi_index idx = bi_src_index(&instr->src[0].src); - bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]); - bi_index s1 = comps > 1 ? - bi_extract(b, idx, instr->src[0].swizzle[1]) : s0; - - bi_instr *I = bi_v2f32_to_v2f16_to(b, dst, s0, s1); - - /* Override rounding if explicitly requested. Otherwise, the - * default rounding mode is selected by the builder. Depending - * on the float controls required by the shader, the default - * mode may not be nearest-even. - */ - if (instr->op == nir_op_f2f16_rtz) - I->round = BI_ROUND_RTZ; - else if (instr->op == nir_op_f2f16_rtne) - I->round = BI_ROUND_NONE; /* Nearest even */ - - return; - } - - /* Vectorized downcasts */ - case nir_op_u2u16: - case nir_op_i2i16: { - if (!(src_sz == 32 && comps == 2)) - break; - - bi_index idx = bi_src_index(&instr->src[0].src); - bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]); - bi_index s1 = bi_extract(b, idx, instr->src[0].swizzle[1]); - - bi_mkvec_v2i16_to(b, dst, - bi_half(s0, false), bi_half(s1, false)); - return; - } - - /* While we do not have a direct V2U32_TO_V2F16 instruction, lowering to - * MKVEC.v2i16 + V2U16_TO_V2F16 is more efficient on Bifrost than - * scalarizing due to scheduling (equal cost on Valhall). Additionally - * if the source is replicated the MKVEC.v2i16 can be optimized out. - */ - case nir_op_u2f16: - case nir_op_i2f16: { - if (!(src_sz == 32 && comps == 2)) - break; - - nir_alu_src *src = &instr->src[0]; - bi_index idx = bi_src_index(&src->src); - bi_index s0 = bi_extract(b, idx, src->swizzle[0]); - bi_index s1 = bi_extract(b, idx, src->swizzle[1]); - - bi_index t = (src->swizzle[0] == src->swizzle[1]) ? - bi_half(s0, false) : - bi_mkvec_v2i16(b, bi_half(s0, false), - bi_half(s1, false)); - - if (instr->op == nir_op_u2f16) - bi_v2u16_to_v2f16_to(b, dst, t); - else - bi_v2s16_to_v2f16_to(b, dst, t); - - return; - } - - case nir_op_i2i8: - case nir_op_u2u8: - { - /* Acts like an 8-bit swizzle */ - bi_index idx = bi_src_index(&instr->src[0].src); - unsigned factor = src_sz / 8; - unsigned chan[4] = { 0 }; - - for (unsigned i = 0; i < comps; ++i) - chan[i] = instr->src[0].swizzle[i] * factor; - - bi_make_vec_to(b, dst, &idx, chan, comps, 8); - return; - } - - case nir_op_b32csel: - { - if (sz != 16) - break; - - /* We allow vectorizing b32csel(cond, A, B) which can be - * translated as MUX.v2i16, even though cond is a 32-bit vector. - * - * If the source condition vector is replicated, we can use - * MUX.v2i16 directly, letting each component use the - * corresponding half of the 32-bit source. NIR uses 0/~0 - * booleans so that's guaranteed to work (that is, 32-bit NIR - * booleans are 16-bit replicated). 
- * - * If we're not replicated, we use the same trick but must - * insert a MKVEC.v2i16 first to convert down to 16-bit. - */ - bi_index idx = bi_src_index(&instr->src[0].src); - bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]); - bi_index s1 = bi_alu_src_index(b, instr->src[1], comps); - bi_index s2 = bi_alu_src_index(b, instr->src[2], comps); - - if (!bi_nir_is_replicated(&instr->src[0])) { - s0 = bi_mkvec_v2i16(b, bi_half(s0, false), - bi_half(bi_extract(b, idx, instr->src[0].swizzle[1]), false)); - } - - bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); - return; - } - - default: - break; - } - - bi_index s0 = srcs > 0 ? bi_alu_src_index(b, instr->src[0], comps) : bi_null(); - bi_index s1 = srcs > 1 ? bi_alu_src_index(b, instr->src[1], comps) : bi_null(); - bi_index s2 = srcs > 2 ? bi_alu_src_index(b, instr->src[2], comps) : bi_null(); - - switch (instr->op) { - case nir_op_ffma: - bi_fma_to(b, sz, dst, s0, s1, s2); - break; - - case nir_op_fmul: - bi_fma_to(b, sz, dst, s0, s1, bi_negzero()); - break; - - case nir_op_fsub: - s1 = bi_neg(s1); - FALLTHROUGH; - case nir_op_fadd: - bi_fadd_to(b, sz, dst, s0, s1); - break; - - case nir_op_fsat: { - bi_instr *I = bi_fclamp_to(b, sz, dst, s0); - I->clamp = BI_CLAMP_CLAMP_0_1; - break; - } - - case nir_op_fsat_signed_mali: { - bi_instr *I = bi_fclamp_to(b, sz, dst, s0); - I->clamp = BI_CLAMP_CLAMP_M1_1; - break; - } - - case nir_op_fclamp_pos_mali: { - bi_instr *I = bi_fclamp_to(b, sz, dst, s0); - I->clamp = BI_CLAMP_CLAMP_0_INF; - break; - } - - case nir_op_fneg: - bi_fabsneg_to(b, sz, dst, bi_neg(s0)); - break; - - case nir_op_fabs: - bi_fabsneg_to(b, sz, dst, bi_abs(s0)); - break; - - case nir_op_fsin: - bi_lower_fsincos_32(b, dst, s0, false); - break; - - case nir_op_fcos: - bi_lower_fsincos_32(b, dst, s0, true); - break; - - case nir_op_fexp2: - assert(sz == 32); /* should've been lowered */ - - if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) - bi_lower_fexp2_32(b, dst, s0); - else - bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f)); - - break; - - case nir_op_flog2: - assert(sz == 32); /* should've been lowered */ - - if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) - bi_lower_flog2_32(b, dst, s0); - else - bi_flog2_32(b, dst, s0); - - break; - - case nir_op_fpow: - assert(sz == 32); /* should've been lowered */ - - if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) - bi_lower_fpow_32(b, dst, s0, s1); - else - bi_fpow_32(b, dst, s0, s1); - - break; - - case nir_op_frexp_exp: - bi_frexpe_to(b, sz, dst, s0, false, false); - break; - - case nir_op_frexp_sig: - bi_frexpm_to(b, sz, dst, s0, false, false); - break; - - case nir_op_ldexp: - bi_ldexp_to(b, sz, dst, s0, s1); - break; - - case nir_op_b8csel: - bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); - break; - - case nir_op_b16csel: - bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); - break; - - case nir_op_b32csel: - bi_mux_i32_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); - break; - - case nir_op_extract_u8: - case nir_op_extract_i8: { - assert(comps == 1 && "should be scalarized"); - assert((src_sz == 16 || src_sz == 32) && "should be lowered"); - unsigned byte = nir_src_as_uint(instr->src[1].src); - - if (s0.swizzle == BI_SWIZZLE_H11) { - assert(byte < 2); - byte += 2; - } else if (s0.swizzle != BI_SWIZZLE_H01) { - assert(s0.swizzle == BI_SWIZZLE_H00); - } - - assert(byte < 4); - - s0.swizzle = BI_SWIZZLE_H01; - - if (instr->op == nir_op_extract_i8) - bi_s8_to_s32_to(b, dst, bi_byte(s0, byte)); - else - bi_u8_to_u32_to(b, dst, bi_byte(s0, byte)); - 
break; - } - - case nir_op_extract_u16: - case nir_op_extract_i16: { - assert(comps == 1 && "should be scalarized"); - assert(src_sz == 32 && "should be lowered"); - unsigned half = nir_src_as_uint(instr->src[1].src); - assert(half == 0 || half == 1); - - if (instr->op == nir_op_extract_i16) - bi_s16_to_s32_to(b, dst, bi_half(s0, half)); - else - bi_u16_to_u32_to(b, dst, bi_half(s0, half)); - break; - } - - case nir_op_insert_u16: { - assert(comps == 1 && "should be scalarized"); - unsigned half = nir_src_as_uint(instr->src[1].src); - assert(half == 0 || half == 1); - - if (half == 0) - bi_u16_to_u32_to(b, dst, bi_half(s0, 0)); - else - bi_mkvec_v2i16_to(b, dst, bi_imm_u16(0), bi_half(s0, 0)); - break; - } - - case nir_op_ishl: - bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0)); - break; - case nir_op_ushr: - bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), false); - break; - - case nir_op_ishr: - if (b->shader->arch >= 9) - bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), true); - else - bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0)); - break; - - case nir_op_imin: - case nir_op_umin: - bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, - s0, s1, s0, s1, BI_CMPF_LT); - break; - - case nir_op_imax: - case nir_op_umax: - bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, - s0, s1, s0, s1, BI_CMPF_GT); - break; - - case nir_op_fddx_must_abs_mali: - case nir_op_fddy_must_abs_mali: { - bi_index bit = bi_imm_u32(instr->op == nir_op_fddx_must_abs_mali ? 1 : 2); - bi_index adjacent = bi_clper_xor(b, s0, bit); - bi_fadd_to(b, sz, dst, adjacent, bi_neg(s0)); - break; - } - - case nir_op_fddx: - case nir_op_fddy: - case nir_op_fddx_coarse: - case nir_op_fddy_coarse: - case nir_op_fddx_fine: - case nir_op_fddy_fine: { - unsigned axis; - switch (instr->op) { - case nir_op_fddx: - case nir_op_fddx_coarse: - case nir_op_fddx_fine: - axis = 1; - break; - case nir_op_fddy: - case nir_op_fddy_coarse: - case nir_op_fddy_fine: - axis = 2; - break; - default: - unreachable("Invalid derivative op"); - } - - bi_index lane1, lane2; - switch (instr->op) { - case nir_op_fddx: - case nir_op_fddx_fine: - case nir_op_fddy: - case nir_op_fddy_fine: - lane1 = bi_lshift_and_i32(b, - bi_fau(BIR_FAU_LANE_ID, false), - bi_imm_u32(0x3 & ~axis), - bi_imm_u8(0)); - - lane2 = bi_iadd_u32(b, lane1, - bi_imm_u32(axis), - false); - break; - case nir_op_fddx_coarse: - case nir_op_fddy_coarse: - lane1 = bi_imm_u32(0); - lane2 = bi_imm_u32(axis); - break; - default: - unreachable("Invalid derivative op"); - } - - bi_index left, right; - - if (b->shader->quirks & BIFROST_LIMITED_CLPER) { - left = bi_clper_old_i32(b, s0, lane1); - right = bi_clper_old_i32(b, s0, lane2); - } else { - left = bi_clper_i32(b, s0, lane1, - BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, - BI_SUBGROUP_SUBGROUP4); - - right = bi_clper_i32(b, s0, lane2, - BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, - BI_SUBGROUP_SUBGROUP4); - } - - bi_fadd_to(b, sz, dst, right, bi_neg(left)); - break; - } - - case nir_op_f2f32: - bi_f16_to_f32_to(b, dst, s0); - break; - - case nir_op_fquantize2f16: - { - bi_instr *f16 = bi_v2f32_to_v2f16_to(b, bi_temp(b->shader), s0, s0); - bi_instr *f32 = bi_f16_to_f32_to(b, dst, bi_half(f16->dest[0], false)); - - f16->ftz = f32->ftz = true; - break; - } - - case nir_op_f2i32: - if (src_sz == 32) - bi_f32_to_s32_to(b, dst, s0); - else - bi_f16_to_s32_to(b, dst, s0); - break; - - /* Note 32-bit sources => no vectorization, so 32-bit works */ - case nir_op_f2u16: - if (src_sz == 32) - 
bi_f32_to_u32_to(b, dst, s0); - else - bi_v2f16_to_v2u16_to(b, dst, s0); - break; - - case nir_op_f2i16: - if (src_sz == 32) - bi_f32_to_s32_to(b, dst, s0); - else - bi_v2f16_to_v2s16_to(b, dst, s0); - break; - - case nir_op_f2u32: - if (src_sz == 32) - bi_f32_to_u32_to(b, dst, s0); - else - bi_f16_to_u32_to(b, dst, s0); - break; - - case nir_op_u2f16: - if (src_sz == 32) - bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false)); - else if (src_sz == 16) - bi_v2u16_to_v2f16_to(b, dst, s0); - else if (src_sz == 8) - bi_v2u8_to_v2f16_to(b, dst, s0); - break; - - case nir_op_u2f32: - if (src_sz == 32) - bi_u32_to_f32_to(b, dst, s0); - else if (src_sz == 16) - bi_u16_to_f32_to(b, dst, s0); - else - bi_u8_to_f32_to(b, dst, s0); - break; - - case nir_op_i2f16: - if (src_sz == 32) - bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false)); - else if (src_sz == 16) - bi_v2s16_to_v2f16_to(b, dst, s0); - else if (src_sz == 8) - bi_v2s8_to_v2f16_to(b, dst, s0); - break; - - case nir_op_i2f32: - assert(src_sz == 32 || src_sz == 16 || src_sz == 8); - - if (src_sz == 32) - bi_s32_to_f32_to(b, dst, s0); - else if (src_sz == 16) - bi_s16_to_f32_to(b, dst, s0); - else if (src_sz == 8) - bi_s8_to_f32_to(b, dst, s0); - break; - - case nir_op_i2i32: - assert(src_sz == 32 || src_sz == 16 || src_sz == 8); - - if (src_sz == 32) - bi_mov_i32_to(b, dst, s0); - else if (src_sz == 16) - bi_s16_to_s32_to(b, dst, s0); - else if (src_sz == 8) - bi_s8_to_s32_to(b, dst, s0); - break; - - case nir_op_u2u32: - assert(src_sz == 32 || src_sz == 16 || src_sz == 8); - - if (src_sz == 32) - bi_mov_i32_to(b, dst, s0); - else if (src_sz == 16) - bi_u16_to_u32_to(b, dst, s0); - else if (src_sz == 8) - bi_u8_to_u32_to(b, dst, s0); - - break; - - case nir_op_i2i16: - assert(src_sz == 8 || src_sz == 32); - - if (src_sz == 8) - bi_v2s8_to_v2s16_to(b, dst, s0); - else - bi_mov_i32_to(b, dst, s0); - break; - - case nir_op_u2u16: - assert(src_sz == 8 || src_sz == 32); - - if (src_sz == 8) - bi_v2u8_to_v2u16_to(b, dst, s0); - else - bi_mov_i32_to(b, dst, s0); - break; - - case nir_op_b2i8: - case nir_op_b2i16: - case nir_op_b2i32: - bi_mux_to(b, sz, dst, bi_imm_u8(0), bi_imm_uintN(1, sz), s0, BI_MUX_INT_ZERO); - break; - - case nir_op_f2b16: - bi_mux_v2i16_to(b, dst, bi_imm_u16(0), bi_imm_u16(~0), s0, BI_MUX_FP_ZERO); - break; - case nir_op_f2b32: - bi_mux_i32_to(b, dst, bi_imm_u32(0), bi_imm_u32(~0), s0, BI_MUX_FP_ZERO); - break; - - case nir_op_ieq8: - case nir_op_ine8: - case nir_op_ilt8: - case nir_op_ige8: - case nir_op_ieq16: - case nir_op_ine16: - case nir_op_ilt16: - case nir_op_ige16: - case nir_op_ieq32: - case nir_op_ine32: - case nir_op_ilt32: - case nir_op_ige32: - bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1); - break; - - case nir_op_ult8: - case nir_op_uge8: - case nir_op_ult16: - case nir_op_uge16: - case nir_op_ult32: - case nir_op_uge32: - bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1); - break; - - case nir_op_feq32: - case nir_op_feq16: - case nir_op_flt32: - case nir_op_flt16: - case nir_op_fge32: - case nir_op_fge16: - case nir_op_fneu32: - case nir_op_fneu16: - bi_fcmp_to(b, sz, dst, s0, s1, bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1); - break; - - case nir_op_fround_even: - case nir_op_fceil: - case nir_op_ffloor: - case nir_op_ftrunc: - bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op)); - break; - - case nir_op_fmin: - bi_fmin_to(b, sz, dst, s0, s1); - break; - - case nir_op_fmax: - bi_fmax_to(b, sz, dst, s0, s1); - break; - 
- case nir_op_iadd: - bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false); - break; - - case nir_op_iadd_sat: - bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true); - break; - - case nir_op_uadd_sat: - bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true); - break; - - case nir_op_ihadd: - bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN); - break; - - case nir_op_irhadd: - bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP); - break; - - case nir_op_uhadd: - bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTN); - break; - - case nir_op_urhadd: - bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTP); - break; - - case nir_op_ineg: - bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false); - break; - - case nir_op_isub: - bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false); - break; - - case nir_op_isub_sat: - bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true); - break; - - case nir_op_usub_sat: - bi_isub_to(b, nir_type_uint, sz, dst, s0, s1, true); - break; - - case nir_op_imul: - bi_imul_to(b, sz, dst, s0, s1); - break; - - case nir_op_iabs: - bi_iabs_to(b, sz, dst, s0); - break; - - case nir_op_iand: - bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0)); - break; - - case nir_op_ior: - bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0)); - break; - - case nir_op_ixor: - bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0)); - break; - - case nir_op_inot: - bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0)); - break; - - case nir_op_frsq: - if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) - bi_lower_frsq_32(b, dst, s0); - else - bi_frsq_to(b, sz, dst, s0); - break; - - case nir_op_frcp: - if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) - bi_lower_frcp_32(b, dst, s0); - else - bi_frcp_to(b, sz, dst, s0); - break; - - case nir_op_uclz: - bi_clz_to(b, sz, dst, s0, false); - break; - - case nir_op_bit_count: - assert(sz == 32 && src_sz == 32 && "should've been lowered"); - bi_popcount_i32_to(b, dst, s0); - break; - - case nir_op_bitfield_reverse: - assert(sz == 32 && src_sz == 32 && "should've been lowered"); - bi_bitrev_i32_to(b, dst, s0); - break; - - case nir_op_ufind_msb: { - bi_index clz = bi_clz(b, src_sz, s0, false); - - if (sz == 8) - clz = bi_byte(clz, 0); - else if (sz == 16) - clz = bi_half(clz, false); - - bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false); - break; - } - - default: - fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name); - unreachable("Unknown ALU op"); - } + bi_index dst = bi_dest_index(&instr->dest.dest); + unsigned srcs = nir_op_infos[instr->op].num_inputs; + unsigned sz = nir_dest_bit_size(instr->dest.dest); + unsigned comps = nir_dest_num_components(instr->dest.dest); + unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0; + + /* Indicate scalarness */ + if (sz == 16 && comps == 1) + dst.swizzle = BI_SWIZZLE_H00; + + /* First, match against the various moves in NIR. These are + * special-cased because they can operate on vectors even after + * lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the + * instruction is no "bigger" than SIMD-within-a-register. These moves + * are the exceptions that need to handle swizzles specially. 
*/ + + switch (instr->op) { + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: + case nir_op_vec8: + case nir_op_vec16: { + bi_index unoffset_srcs[16] = {bi_null()}; + unsigned channels[16] = {0}; + + for (unsigned i = 0; i < srcs; ++i) { + unoffset_srcs[i] = bi_src_index(&instr->src[i].src); + channels[i] = instr->src[i].swizzle[0]; + } + + bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz); + return; + } + + case nir_op_unpack_32_2x16: { + /* Should have been scalarized */ + assert(comps == 2 && sz == 16); + + bi_index vec = bi_src_index(&instr->src[0].src); + unsigned chan = instr->src[0].swizzle[0]; + + bi_mov_i32_to(b, dst, bi_extract(b, vec, chan)); + return; + } + + case nir_op_unpack_64_2x32_split_x: { + unsigned chan = (instr->src[0].swizzle[0] * 2) + 0; + bi_mov_i32_to(b, dst, + bi_extract(b, bi_src_index(&instr->src[0].src), chan)); + return; + } + + case nir_op_unpack_64_2x32_split_y: { + unsigned chan = (instr->src[0].swizzle[0] * 2) + 1; + bi_mov_i32_to(b, dst, + bi_extract(b, bi_src_index(&instr->src[0].src), chan)); + return; + } + + case nir_op_pack_64_2x32_split: + bi_collect_v2i32_to(b, dst, + bi_extract(b, bi_src_index(&instr->src[0].src), + instr->src[0].swizzle[0]), + bi_extract(b, bi_src_index(&instr->src[1].src), + instr->src[1].swizzle[0])); + return; + + case nir_op_pack_64_2x32: + bi_collect_v2i32_to(b, dst, + bi_extract(b, bi_src_index(&instr->src[0].src), 0), + bi_extract(b, bi_src_index(&instr->src[0].src), 1)); + return; + + case nir_op_pack_uvec2_to_uint: { + bi_index src = bi_src_index(&instr->src[0].src); + + assert(sz == 32 && src_sz == 32); + bi_mkvec_v2i16_to(b, dst, bi_half(bi_extract(b, src, 0), false), + bi_half(bi_extract(b, src, 1), false)); + return; + } + + case nir_op_pack_uvec4_to_uint: { + bi_index src = bi_src_index(&instr->src[0].src); + + assert(sz == 32 && src_sz == 32); + bi_mkvec_v4i8_to(b, dst, bi_byte(bi_extract(b, src, 0), 0), + bi_byte(bi_extract(b, src, 1), 0), + bi_byte(bi_extract(b, src, 2), 0), + bi_byte(bi_extract(b, src, 3), 0)); + return; + } + + case nir_op_mov: { + bi_index idx = bi_src_index(&instr->src[0].src); + bi_index unoffset_srcs[4] = {idx, idx, idx, idx}; + + unsigned channels[4] = { + comps > 0 ? instr->src[0].swizzle[0] : 0, + comps > 1 ? instr->src[0].swizzle[1] : 0, + comps > 2 ? instr->src[0].swizzle[2] : 0, + comps > 3 ? instr->src[0].swizzle[3] : 0, + }; + + bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, src_sz); + return; + } + + case nir_op_pack_32_2x16: { + assert(comps == 1); + + bi_index idx = bi_src_index(&instr->src[0].src); + bi_index unoffset_srcs[4] = {idx, idx, idx, idx}; + + unsigned channels[2] = {instr->src[0].swizzle[0], + instr->src[0].swizzle[1]}; + + bi_make_vec_to(b, dst, unoffset_srcs, channels, 2, 16); + return; + } + + case nir_op_f2f16: + case nir_op_f2f16_rtz: + case nir_op_f2f16_rtne: { + assert(src_sz == 32); + bi_index idx = bi_src_index(&instr->src[0].src); + bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]); + bi_index s1 = + comps > 1 ? bi_extract(b, idx, instr->src[0].swizzle[1]) : s0; + + bi_instr *I = bi_v2f32_to_v2f16_to(b, dst, s0, s1); + + /* Override rounding if explicitly requested. Otherwise, the + * default rounding mode is selected by the builder. Depending + * on the float controls required by the shader, the default + * mode may not be nearest-even. 
+ */ + if (instr->op == nir_op_f2f16_rtz) + I->round = BI_ROUND_RTZ; + else if (instr->op == nir_op_f2f16_rtne) + I->round = BI_ROUND_NONE; /* Nearest even */ + + return; + } + + /* Vectorized downcasts */ + case nir_op_u2u16: + case nir_op_i2i16: { + if (!(src_sz == 32 && comps == 2)) + break; + + bi_index idx = bi_src_index(&instr->src[0].src); + bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]); + bi_index s1 = bi_extract(b, idx, instr->src[0].swizzle[1]); + + bi_mkvec_v2i16_to(b, dst, bi_half(s0, false), bi_half(s1, false)); + return; + } + + /* While we do not have a direct V2U32_TO_V2F16 instruction, lowering to + * MKVEC.v2i16 + V2U16_TO_V2F16 is more efficient on Bifrost than + * scalarizing due to scheduling (equal cost on Valhall). Additionally + * if the source is replicated the MKVEC.v2i16 can be optimized out. + */ + case nir_op_u2f16: + case nir_op_i2f16: { + if (!(src_sz == 32 && comps == 2)) + break; + + nir_alu_src *src = &instr->src[0]; + bi_index idx = bi_src_index(&src->src); + bi_index s0 = bi_extract(b, idx, src->swizzle[0]); + bi_index s1 = bi_extract(b, idx, src->swizzle[1]); + + bi_index t = + (src->swizzle[0] == src->swizzle[1]) + ? bi_half(s0, false) + : bi_mkvec_v2i16(b, bi_half(s0, false), bi_half(s1, false)); + + if (instr->op == nir_op_u2f16) + bi_v2u16_to_v2f16_to(b, dst, t); + else + bi_v2s16_to_v2f16_to(b, dst, t); + + return; + } + + case nir_op_i2i8: + case nir_op_u2u8: { + /* Acts like an 8-bit swizzle */ + bi_index idx = bi_src_index(&instr->src[0].src); + unsigned factor = src_sz / 8; + unsigned chan[4] = {0}; + + for (unsigned i = 0; i < comps; ++i) + chan[i] = instr->src[0].swizzle[i] * factor; + + bi_make_vec_to(b, dst, &idx, chan, comps, 8); + return; + } + + case nir_op_b32csel: { + if (sz != 16) + break; + + /* We allow vectorizing b32csel(cond, A, B) which can be + * translated as MUX.v2i16, even though cond is a 32-bit vector. + * + * If the source condition vector is replicated, we can use + * MUX.v2i16 directly, letting each component use the + * corresponding half of the 32-bit source. NIR uses 0/~0 + * booleans so that's guaranteed to work (that is, 32-bit NIR + * booleans are 16-bit replicated). + * + * If we're not replicated, we use the same trick but must + * insert a MKVEC.v2i16 first to convert down to 16-bit. + */ + bi_index idx = bi_src_index(&instr->src[0].src); + bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]); + bi_index s1 = bi_alu_src_index(b, instr->src[1], comps); + bi_index s2 = bi_alu_src_index(b, instr->src[2], comps); + + if (!bi_nir_is_replicated(&instr->src[0])) { + s0 = bi_mkvec_v2i16( + b, bi_half(s0, false), + bi_half(bi_extract(b, idx, instr->src[0].swizzle[1]), false)); + } + + bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); + return; + } + + default: + break; + } + + bi_index s0 = + srcs > 0 ? bi_alu_src_index(b, instr->src[0], comps) : bi_null(); + bi_index s1 = + srcs > 1 ? bi_alu_src_index(b, instr->src[1], comps) : bi_null(); + bi_index s2 = + srcs > 2 ? 
bi_alu_src_index(b, instr->src[2], comps) : bi_null(); + + switch (instr->op) { + case nir_op_ffma: + bi_fma_to(b, sz, dst, s0, s1, s2); + break; + + case nir_op_fmul: + bi_fma_to(b, sz, dst, s0, s1, bi_negzero()); + break; + + case nir_op_fsub: + s1 = bi_neg(s1); + FALLTHROUGH; + case nir_op_fadd: + bi_fadd_to(b, sz, dst, s0, s1); + break; + + case nir_op_fsat: { + bi_instr *I = bi_fclamp_to(b, sz, dst, s0); + I->clamp = BI_CLAMP_CLAMP_0_1; + break; + } + + case nir_op_fsat_signed_mali: { + bi_instr *I = bi_fclamp_to(b, sz, dst, s0); + I->clamp = BI_CLAMP_CLAMP_M1_1; + break; + } + + case nir_op_fclamp_pos_mali: { + bi_instr *I = bi_fclamp_to(b, sz, dst, s0); + I->clamp = BI_CLAMP_CLAMP_0_INF; + break; + } + + case nir_op_fneg: + bi_fabsneg_to(b, sz, dst, bi_neg(s0)); + break; + + case nir_op_fabs: + bi_fabsneg_to(b, sz, dst, bi_abs(s0)); + break; + + case nir_op_fsin: + bi_lower_fsincos_32(b, dst, s0, false); + break; + + case nir_op_fcos: + bi_lower_fsincos_32(b, dst, s0, true); + break; + + case nir_op_fexp2: + assert(sz == 32); /* should've been lowered */ + + if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) + bi_lower_fexp2_32(b, dst, s0); + else + bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f)); + + break; + + case nir_op_flog2: + assert(sz == 32); /* should've been lowered */ + + if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) + bi_lower_flog2_32(b, dst, s0); + else + bi_flog2_32(b, dst, s0); + + break; + + case nir_op_fpow: + assert(sz == 32); /* should've been lowered */ + + if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) + bi_lower_fpow_32(b, dst, s0, s1); + else + bi_fpow_32(b, dst, s0, s1); + + break; + + case nir_op_frexp_exp: + bi_frexpe_to(b, sz, dst, s0, false, false); + break; + + case nir_op_frexp_sig: + bi_frexpm_to(b, sz, dst, s0, false, false); + break; + + case nir_op_ldexp: + bi_ldexp_to(b, sz, dst, s0, s1); + break; + + case nir_op_b8csel: + bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); + break; + + case nir_op_b16csel: + bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); + break; + + case nir_op_b32csel: + bi_mux_i32_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); + break; + + case nir_op_extract_u8: + case nir_op_extract_i8: { + assert(comps == 1 && "should be scalarized"); + assert((src_sz == 16 || src_sz == 32) && "should be lowered"); + unsigned byte = nir_src_as_uint(instr->src[1].src); + + if (s0.swizzle == BI_SWIZZLE_H11) { + assert(byte < 2); + byte += 2; + } else if (s0.swizzle != BI_SWIZZLE_H01) { + assert(s0.swizzle == BI_SWIZZLE_H00); + } + + assert(byte < 4); + + s0.swizzle = BI_SWIZZLE_H01; + + if (instr->op == nir_op_extract_i8) + bi_s8_to_s32_to(b, dst, bi_byte(s0, byte)); + else + bi_u8_to_u32_to(b, dst, bi_byte(s0, byte)); + break; + } + + case nir_op_extract_u16: + case nir_op_extract_i16: { + assert(comps == 1 && "should be scalarized"); + assert(src_sz == 32 && "should be lowered"); + unsigned half = nir_src_as_uint(instr->src[1].src); + assert(half == 0 || half == 1); + + if (instr->op == nir_op_extract_i16) + bi_s16_to_s32_to(b, dst, bi_half(s0, half)); + else + bi_u16_to_u32_to(b, dst, bi_half(s0, half)); + break; + } + + case nir_op_insert_u16: { + assert(comps == 1 && "should be scalarized"); + unsigned half = nir_src_as_uint(instr->src[1].src); + assert(half == 0 || half == 1); + + if (half == 0) + bi_u16_to_u32_to(b, dst, bi_half(s0, 0)); + else + bi_mkvec_v2i16_to(b, dst, bi_imm_u16(0), bi_half(s0, 0)); + break; + } + + case nir_op_ishl: + bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0)); + break; + case 
nir_op_ushr: + bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), false); + break; + + case nir_op_ishr: + if (b->shader->arch >= 9) + bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), true); + else + bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0)); + break; + + case nir_op_imin: + case nir_op_umin: + bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, s0, s1, s0, + s1, BI_CMPF_LT); + break; + + case nir_op_imax: + case nir_op_umax: + bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, s0, s1, s0, + s1, BI_CMPF_GT); + break; + + case nir_op_fddx_must_abs_mali: + case nir_op_fddy_must_abs_mali: { + bi_index bit = bi_imm_u32(instr->op == nir_op_fddx_must_abs_mali ? 1 : 2); + bi_index adjacent = bi_clper_xor(b, s0, bit); + bi_fadd_to(b, sz, dst, adjacent, bi_neg(s0)); + break; + } + + case nir_op_fddx: + case nir_op_fddy: + case nir_op_fddx_coarse: + case nir_op_fddy_coarse: + case nir_op_fddx_fine: + case nir_op_fddy_fine: { + unsigned axis; + switch (instr->op) { + case nir_op_fddx: + case nir_op_fddx_coarse: + case nir_op_fddx_fine: + axis = 1; + break; + case nir_op_fddy: + case nir_op_fddy_coarse: + case nir_op_fddy_fine: + axis = 2; + break; + default: + unreachable("Invalid derivative op"); + } + + bi_index lane1, lane2; + switch (instr->op) { + case nir_op_fddx: + case nir_op_fddx_fine: + case nir_op_fddy: + case nir_op_fddy_fine: + lane1 = bi_lshift_and_i32(b, bi_fau(BIR_FAU_LANE_ID, false), + bi_imm_u32(0x3 & ~axis), bi_imm_u8(0)); + + lane2 = bi_iadd_u32(b, lane1, bi_imm_u32(axis), false); + break; + case nir_op_fddx_coarse: + case nir_op_fddy_coarse: + lane1 = bi_imm_u32(0); + lane2 = bi_imm_u32(axis); + break; + default: + unreachable("Invalid derivative op"); + } + + bi_index left, right; + + if (b->shader->quirks & BIFROST_LIMITED_CLPER) { + left = bi_clper_old_i32(b, s0, lane1); + right = bi_clper_old_i32(b, s0, lane2); + } else { + left = bi_clper_i32(b, s0, lane1, BI_INACTIVE_RESULT_ZERO, + BI_LANE_OP_NONE, BI_SUBGROUP_SUBGROUP4); + + right = bi_clper_i32(b, s0, lane2, BI_INACTIVE_RESULT_ZERO, + BI_LANE_OP_NONE, BI_SUBGROUP_SUBGROUP4); + } + + bi_fadd_to(b, sz, dst, right, bi_neg(left)); + break; + } + + case nir_op_f2f32: + bi_f16_to_f32_to(b, dst, s0); + break; + + case nir_op_fquantize2f16: { + bi_instr *f16 = bi_v2f32_to_v2f16_to(b, bi_temp(b->shader), s0, s0); + bi_instr *f32 = bi_f16_to_f32_to(b, dst, bi_half(f16->dest[0], false)); + + f16->ftz = f32->ftz = true; + break; + } + + case nir_op_f2i32: + if (src_sz == 32) + bi_f32_to_s32_to(b, dst, s0); + else + bi_f16_to_s32_to(b, dst, s0); + break; + + /* Note 32-bit sources => no vectorization, so 32-bit works */ + case nir_op_f2u16: + if (src_sz == 32) + bi_f32_to_u32_to(b, dst, s0); + else + bi_v2f16_to_v2u16_to(b, dst, s0); + break; + + case nir_op_f2i16: + if (src_sz == 32) + bi_f32_to_s32_to(b, dst, s0); + else + bi_v2f16_to_v2s16_to(b, dst, s0); + break; + + case nir_op_f2u32: + if (src_sz == 32) + bi_f32_to_u32_to(b, dst, s0); + else + bi_f16_to_u32_to(b, dst, s0); + break; + + case nir_op_u2f16: + if (src_sz == 32) + bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false)); + else if (src_sz == 16) + bi_v2u16_to_v2f16_to(b, dst, s0); + else if (src_sz == 8) + bi_v2u8_to_v2f16_to(b, dst, s0); + break; + + case nir_op_u2f32: + if (src_sz == 32) + bi_u32_to_f32_to(b, dst, s0); + else if (src_sz == 16) + bi_u16_to_f32_to(b, dst, s0); + else + bi_u8_to_f32_to(b, dst, s0); + break; + + case nir_op_i2f16: + if (src_sz == 32) + bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false)); + 
else if (src_sz == 16) + bi_v2s16_to_v2f16_to(b, dst, s0); + else if (src_sz == 8) + bi_v2s8_to_v2f16_to(b, dst, s0); + break; + + case nir_op_i2f32: + assert(src_sz == 32 || src_sz == 16 || src_sz == 8); + + if (src_sz == 32) + bi_s32_to_f32_to(b, dst, s0); + else if (src_sz == 16) + bi_s16_to_f32_to(b, dst, s0); + else if (src_sz == 8) + bi_s8_to_f32_to(b, dst, s0); + break; + + case nir_op_i2i32: + assert(src_sz == 32 || src_sz == 16 || src_sz == 8); + + if (src_sz == 32) + bi_mov_i32_to(b, dst, s0); + else if (src_sz == 16) + bi_s16_to_s32_to(b, dst, s0); + else if (src_sz == 8) + bi_s8_to_s32_to(b, dst, s0); + break; + + case nir_op_u2u32: + assert(src_sz == 32 || src_sz == 16 || src_sz == 8); + + if (src_sz == 32) + bi_mov_i32_to(b, dst, s0); + else if (src_sz == 16) + bi_u16_to_u32_to(b, dst, s0); + else if (src_sz == 8) + bi_u8_to_u32_to(b, dst, s0); + + break; + + case nir_op_i2i16: + assert(src_sz == 8 || src_sz == 32); + + if (src_sz == 8) + bi_v2s8_to_v2s16_to(b, dst, s0); + else + bi_mov_i32_to(b, dst, s0); + break; + + case nir_op_u2u16: + assert(src_sz == 8 || src_sz == 32); + + if (src_sz == 8) + bi_v2u8_to_v2u16_to(b, dst, s0); + else + bi_mov_i32_to(b, dst, s0); + break; + + case nir_op_b2i8: + case nir_op_b2i16: + case nir_op_b2i32: + bi_mux_to(b, sz, dst, bi_imm_u8(0), bi_imm_uintN(1, sz), s0, + BI_MUX_INT_ZERO); + break; + + case nir_op_f2b16: + bi_mux_v2i16_to(b, dst, bi_imm_u16(0), bi_imm_u16(~0), s0, + BI_MUX_FP_ZERO); + break; + case nir_op_f2b32: + bi_mux_i32_to(b, dst, bi_imm_u32(0), bi_imm_u32(~0), s0, BI_MUX_FP_ZERO); + break; + + case nir_op_ieq8: + case nir_op_ine8: + case nir_op_ilt8: + case nir_op_ige8: + case nir_op_ieq16: + case nir_op_ine16: + case nir_op_ilt16: + case nir_op_ige16: + case nir_op_ieq32: + case nir_op_ine32: + case nir_op_ilt32: + case nir_op_ige32: + bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, bi_translate_cmpf(instr->op), + BI_RESULT_TYPE_M1); + break; + + case nir_op_ult8: + case nir_op_uge8: + case nir_op_ult16: + case nir_op_uge16: + case nir_op_ult32: + case nir_op_uge32: + bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, + bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1); + break; + + case nir_op_feq32: + case nir_op_feq16: + case nir_op_flt32: + case nir_op_flt16: + case nir_op_fge32: + case nir_op_fge16: + case nir_op_fneu32: + case nir_op_fneu16: + bi_fcmp_to(b, sz, dst, s0, s1, bi_translate_cmpf(instr->op), + BI_RESULT_TYPE_M1); + break; + + case nir_op_fround_even: + case nir_op_fceil: + case nir_op_ffloor: + case nir_op_ftrunc: + bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op)); + break; + + case nir_op_fmin: + bi_fmin_to(b, sz, dst, s0, s1); + break; + + case nir_op_fmax: + bi_fmax_to(b, sz, dst, s0, s1); + break; + + case nir_op_iadd: + bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false); + break; + + case nir_op_iadd_sat: + bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true); + break; + + case nir_op_uadd_sat: + bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true); + break; + + case nir_op_ihadd: + bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN); + break; + + case nir_op_irhadd: + bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP); + break; + + case nir_op_uhadd: + bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTN); + break; + + case nir_op_urhadd: + bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTP); + break; + + case nir_op_ineg: + bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false); + break; + + case nir_op_isub: + bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false); + break; 
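
The fddx/fddy cases earlier in this switch derive screen-space derivatives by shuffling the operand across a 2x2 quad with CLPER and subtracting the "left" lane's value from the "right" lane's. The lane pairing is plain bit arithmetic on the lane ID, which is easy to miss inside the diff. The stand-alone sketch below is an editorial illustration only, not part of this patch: the helper name quad_derivative_lanes and the 0..3 quad lane numbering are assumptions made for the example, but the masking mirrors the lane1/lane2 computation in the code above (fine derivatives clear the axis bit, coarse derivatives always use lanes 0 and axis).

/* Editorial sketch (not part of this patch): model of the quad lane
 * pairing used by the fddx/fddy lowering above. */
#include <stdio.h>

/* axis == 1 selects the horizontal (X) neighbour, axis == 2 the vertical
 * (Y) neighbour, matching the bit handed to CLPER in the driver code. */
static void
quad_derivative_lanes(unsigned lane_id, unsigned axis, int coarse,
                      unsigned *left, unsigned *right)
{
   if (coarse) {
      /* Coarse derivatives: every lane in the quad uses the same pair. */
      *left = 0;
      *right = axis;
   } else {
      /* Fine derivatives: clear the axis bit to find the "low" lane of the
       * pair, then add the axis bit back to get its neighbour. */
      *left = lane_id & (0x3 & ~axis);
      *right = *left + axis;
   }
}

int
main(void)
{
   for (unsigned lane = 0; lane < 4; ++lane) {
      unsigned l, r;
      quad_derivative_lanes(lane, 1, 0, &l, &r);
      printf("lane %u: ddx_fine = v[%u] - v[%u]\n", lane, r, l);
   }
   return 0;
}

For lane IDs 0..3 this prints the pairs (0,1), (0,1), (2,3), (2,3) for X, i.e. each half of the quad shares a horizontal difference, which is exactly what the bi_fadd_to(b, sz, dst, right, bi_neg(left)) in the derivative case computes.
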
+ + case nir_op_isub_sat: + bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true); + break; + + case nir_op_usub_sat: + bi_isub_to(b, nir_type_uint, sz, dst, s0, s1, true); + break; + + case nir_op_imul: + bi_imul_to(b, sz, dst, s0, s1); + break; + + case nir_op_iabs: + bi_iabs_to(b, sz, dst, s0); + break; + + case nir_op_iand: + bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0)); + break; + + case nir_op_ior: + bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0)); + break; + + case nir_op_ixor: + bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0)); + break; + + case nir_op_inot: + bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0)); + break; + + case nir_op_frsq: + if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) + bi_lower_frsq_32(b, dst, s0); + else + bi_frsq_to(b, sz, dst, s0); + break; + + case nir_op_frcp: + if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) + bi_lower_frcp_32(b, dst, s0); + else + bi_frcp_to(b, sz, dst, s0); + break; + + case nir_op_uclz: + bi_clz_to(b, sz, dst, s0, false); + break; + + case nir_op_bit_count: + assert(sz == 32 && src_sz == 32 && "should've been lowered"); + bi_popcount_i32_to(b, dst, s0); + break; + + case nir_op_bitfield_reverse: + assert(sz == 32 && src_sz == 32 && "should've been lowered"); + bi_bitrev_i32_to(b, dst, s0); + break; + + case nir_op_ufind_msb: { + bi_index clz = bi_clz(b, src_sz, s0, false); + + if (sz == 8) + clz = bi_byte(clz, 0); + else if (sz == 16) + clz = bi_half(clz, false); + + bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false); + break; + } + + default: + fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name); + unreachable("Unknown ALU op"); + } } -/* Returns dimension with 0 special casing cubemaps. Shamelessly copied from Midgard */ +/* Returns dimension with 0 special casing cubemaps. 
Shamelessly copied from + * Midgard */ static unsigned bifrost_tex_format(enum glsl_sampler_dim dim) { - switch (dim) { - case GLSL_SAMPLER_DIM_1D: - case GLSL_SAMPLER_DIM_BUF: - return 1; + switch (dim) { + case GLSL_SAMPLER_DIM_1D: + case GLSL_SAMPLER_DIM_BUF: + return 1; - case GLSL_SAMPLER_DIM_2D: - case GLSL_SAMPLER_DIM_MS: - case GLSL_SAMPLER_DIM_EXTERNAL: - case GLSL_SAMPLER_DIM_RECT: - return 2; + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_MS: + case GLSL_SAMPLER_DIM_EXTERNAL: + case GLSL_SAMPLER_DIM_RECT: + return 2; - case GLSL_SAMPLER_DIM_3D: - return 3; + case GLSL_SAMPLER_DIM_3D: + return 3; - case GLSL_SAMPLER_DIM_CUBE: - return 0; + case GLSL_SAMPLER_DIM_CUBE: + return 0; - default: - DBG("Unknown sampler dim type\n"); - assert(0); - return 0; - } + default: + DBG("Unknown sampler dim type\n"); + assert(0); + return 0; + } } static enum bi_dimension valhall_tex_dimension(enum glsl_sampler_dim dim) { - switch (dim) { - case GLSL_SAMPLER_DIM_1D: - case GLSL_SAMPLER_DIM_BUF: - return BI_DIMENSION_1D; + switch (dim) { + case GLSL_SAMPLER_DIM_1D: + case GLSL_SAMPLER_DIM_BUF: + return BI_DIMENSION_1D; - case GLSL_SAMPLER_DIM_2D: - case GLSL_SAMPLER_DIM_MS: - case GLSL_SAMPLER_DIM_EXTERNAL: - case GLSL_SAMPLER_DIM_RECT: - return BI_DIMENSION_2D; + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_MS: + case GLSL_SAMPLER_DIM_EXTERNAL: + case GLSL_SAMPLER_DIM_RECT: + return BI_DIMENSION_2D; - case GLSL_SAMPLER_DIM_3D: - return BI_DIMENSION_3D; + case GLSL_SAMPLER_DIM_3D: + return BI_DIMENSION_3D; - case GLSL_SAMPLER_DIM_CUBE: - return BI_DIMENSION_CUBE; + case GLSL_SAMPLER_DIM_CUBE: + return BI_DIMENSION_CUBE; - default: - unreachable("Unknown sampler dim type"); - } + default: + unreachable("Unknown sampler dim type"); + } } static enum bifrost_texture_format_full bi_texture_format(nir_alu_type T, enum bi_clamp clamp) { - switch (T) { - case nir_type_float16: return BIFROST_TEXTURE_FORMAT_F16 + clamp; - case nir_type_float32: return BIFROST_TEXTURE_FORMAT_F32 + clamp; - case nir_type_uint16: return BIFROST_TEXTURE_FORMAT_U16; - case nir_type_int16: return BIFROST_TEXTURE_FORMAT_S16; - case nir_type_uint32: return BIFROST_TEXTURE_FORMAT_U32; - case nir_type_int32: return BIFROST_TEXTURE_FORMAT_S32; - default: unreachable("Invalid type for texturing"); - } + switch (T) { + case nir_type_float16: + return BIFROST_TEXTURE_FORMAT_F16 + clamp; + case nir_type_float32: + return BIFROST_TEXTURE_FORMAT_F32 + clamp; + case nir_type_uint16: + return BIFROST_TEXTURE_FORMAT_U16; + case nir_type_int16: + return BIFROST_TEXTURE_FORMAT_S16; + case nir_type_uint32: + return BIFROST_TEXTURE_FORMAT_U32; + case nir_type_int32: + return BIFROST_TEXTURE_FORMAT_S32; + default: + unreachable("Invalid type for texturing"); + } } -/* Array indices are specified as 32-bit uints, need to convert. In .z component from NIR */ +/* Array indices are specified as 32-bit uints, need to convert. 
In .z component + * from NIR */ static bi_index bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T) { - /* For (u)int we can just passthrough */ - nir_alu_type base = nir_alu_type_get_base_type(T); - if (base == nir_type_int || base == nir_type_uint) - return idx; + /* For (u)int we can just passthrough */ + nir_alu_type base = nir_alu_type_get_base_type(T); + if (base == nir_type_int || base == nir_type_uint) + return idx; - /* Otherwise we convert */ - assert(T == nir_type_float32); + /* Otherwise we convert */ + assert(T == nir_type_float32); - /* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and - * Texel Selection") defines the layer to be taken from clamp(RNE(r), - * 0, dt - 1). So we use round RTE, clamping is handled at the data - * structure level */ + /* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and + * Texel Selection") defines the layer to be taken from clamp(RNE(r), + * 0, dt - 1). So we use round RTE, clamping is handled at the data + * structure level */ - bi_instr *I = bi_f32_to_u32_to(b, bi_temp(b->shader), idx); - I->round = BI_ROUND_NONE; - return I->dest[0]; + bi_instr *I = bi_f32_to_u32_to(b, bi_temp(b->shader), idx); + I->round = BI_ROUND_NONE; + return I->dest[0]; } /* TEXC's explicit and bias LOD modes requires the LOD to be transformed to a @@ -3179,30 +3163,30 @@ bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T) static bi_index bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16) { - /* Precompute for constant LODs to avoid general constant folding */ - if (lod.type == BI_INDEX_CONSTANT) { - uint32_t raw = lod.value; - float x = fp16 ? _mesa_half_to_float(raw) : uif(raw); - int32_t s32 = CLAMP(x, -16.0f, 16.0f) * 256.0f; - return bi_imm_u32(s32 & 0xFFFF); - } + /* Precompute for constant LODs to avoid general constant folding */ + if (lod.type == BI_INDEX_CONSTANT) { + uint32_t raw = lod.value; + float x = fp16 ? _mesa_half_to_float(raw) : uif(raw); + int32_t s32 = CLAMP(x, -16.0f, 16.0f) * 256.0f; + return bi_imm_u32(s32 & 0xFFFF); + } - /* Sort of arbitrary. Must be less than 128.0, greater than or equal to - * the max LOD (16 since we cap at 2^16 texture dimensions), and - * preferably small to minimize precision loss */ - const float max_lod = 16.0; + /* Sort of arbitrary. Must be less than 128.0, greater than or equal to + * the max LOD (16 since we cap at 2^16 texture dimensions), and + * preferably small to minimize precision loss */ + const float max_lod = 16.0; - bi_instr *fsat = bi_fma_f32_to(b, bi_temp(b->shader), - fp16 ? bi_half(lod, false) : lod, - bi_imm_f32(1.0f / max_lod), bi_negzero()); + bi_instr *fsat = + bi_fma_f32_to(b, bi_temp(b->shader), fp16 ? 
bi_half(lod, false) : lod, + bi_imm_f32(1.0f / max_lod), bi_negzero()); - fsat->clamp = BI_CLAMP_CLAMP_M1_1; + fsat->clamp = BI_CLAMP_CLAMP_M1_1; - bi_index fmul = bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f), - bi_negzero()); + bi_index fmul = + bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f), bi_negzero()); - return bi_mkvec_v2i16(b, - bi_half(bi_f32_to_s32(b, fmul), false), bi_imm_u16(0)); + return bi_mkvec_v2i16(b, bi_half(bi_f32_to_s32(b, fmul), false), + bi_imm_u16(0)); } /* FETCH takes a 32-bit staging register containing the LOD as an integer in @@ -3213,7 +3197,7 @@ bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16) static bi_index bi_emit_texc_lod_cube(bi_builder *b, bi_index lod) { - return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8)); + return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8)); } /* The hardware specifies texel offsets and multisample indices together as a @@ -3225,31 +3209,28 @@ bi_emit_texc_lod_cube(bi_builder *b, bi_index lod) static bi_index bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr) { - bi_index dest = bi_zero(); + bi_index dest = bi_zero(); - int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset); - if (offs_idx >= 0 && - (!nir_src_is_const(instr->src[offs_idx].src) || - nir_src_as_uint(instr->src[offs_idx].src) != 0)) { - unsigned nr = nir_src_num_components(instr->src[offs_idx].src); - bi_index idx = bi_src_index(&instr->src[offs_idx].src); - dest = bi_mkvec_v4i8(b, - (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0), - (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0), - (nr > 2) ? bi_byte(bi_extract(b, idx, 2), 0) : bi_imm_u8(0), - bi_imm_u8(0)); - } + int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset); + if (offs_idx >= 0 && (!nir_src_is_const(instr->src[offs_idx].src) || + nir_src_as_uint(instr->src[offs_idx].src) != 0)) { + unsigned nr = nir_src_num_components(instr->src[offs_idx].src); + bi_index idx = bi_src_index(&instr->src[offs_idx].src); + dest = bi_mkvec_v4i8( + b, (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0), + (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0), + (nr > 2) ? 
bi_byte(bi_extract(b, idx, 2), 0) : bi_imm_u8(0), + bi_imm_u8(0)); + } - int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index); - if (ms_idx >= 0 && - (!nir_src_is_const(instr->src[ms_idx].src) || - nir_src_as_uint(instr->src[ms_idx].src) != 0)) { - dest = bi_lshift_or_i32(b, - bi_src_index(&instr->src[ms_idx].src), dest, - bi_imm_u8(24)); - } + int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index); + if (ms_idx >= 0 && (!nir_src_is_const(instr->src[ms_idx].src) || + nir_src_as_uint(instr->src[ms_idx].src) != 0)) { + dest = bi_lshift_or_i32(b, bi_src_index(&instr->src[ms_idx].src), dest, + bi_imm_u8(24)); + } - return dest; + return dest; } /* @@ -3261,107 +3242,102 @@ bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr) static bi_index bi_emit_valhall_offsets(bi_builder *b, nir_tex_instr *instr) { - bi_index dest = bi_zero(); + bi_index dest = bi_zero(); - int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset); - int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index); - int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod); + int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset); + int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index); + int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod); - /* Components 0-2: offsets */ - if (offs_idx >= 0 && - (!nir_src_is_const(instr->src[offs_idx].src) || - nir_src_as_uint(instr->src[offs_idx].src) != 0)) { - unsigned nr = nir_src_num_components(instr->src[offs_idx].src); - bi_index idx = bi_src_index(&instr->src[offs_idx].src); + /* Components 0-2: offsets */ + if (offs_idx >= 0 && (!nir_src_is_const(instr->src[offs_idx].src) || + nir_src_as_uint(instr->src[offs_idx].src) != 0)) { + unsigned nr = nir_src_num_components(instr->src[offs_idx].src); + bi_index idx = bi_src_index(&instr->src[offs_idx].src); - /* No multisample index with 3D */ - assert((nr <= 2) || (ms_idx < 0)); + /* No multisample index with 3D */ + assert((nr <= 2) || (ms_idx < 0)); - /* Zero extend the Z byte so we can use it with MKVEC.v2i8 */ - bi_index z = (nr > 2) ? - bi_mkvec_v2i8(b, bi_byte(bi_extract(b, idx, 2), 0), - bi_imm_u8(0), bi_zero()) : - bi_zero(); + /* Zero extend the Z byte so we can use it with MKVEC.v2i8 */ + bi_index z = (nr > 2) + ? bi_mkvec_v2i8(b, bi_byte(bi_extract(b, idx, 2), 0), + bi_imm_u8(0), bi_zero()) + : bi_zero(); - dest = bi_mkvec_v2i8(b, - (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0), - (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0), - z); - } + dest = bi_mkvec_v2i8( + b, (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0), + (nr > 1) ? 
bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0), z); + } - /* Component 2: multisample index */ - if (ms_idx >= 0 && - (!nir_src_is_const(instr->src[ms_idx].src) || - nir_src_as_uint(instr->src[ms_idx].src) != 0)) { - dest = bi_mkvec_v2i16(b, dest, - bi_src_index(&instr->src[ms_idx].src)); - } + /* Component 2: multisample index */ + if (ms_idx >= 0 && (!nir_src_is_const(instr->src[ms_idx].src) || + nir_src_as_uint(instr->src[ms_idx].src) != 0)) { + dest = bi_mkvec_v2i16(b, dest, bi_src_index(&instr->src[ms_idx].src)); + } - /* Component 3: 8-bit LOD */ - if (lod_idx >= 0 && - (!nir_src_is_const(instr->src[lod_idx].src) || - nir_src_as_uint(instr->src[lod_idx].src) != 0) && - nir_tex_instr_src_type(instr, lod_idx) != nir_type_float) { - dest = bi_lshift_or_i32(b, - bi_src_index(&instr->src[lod_idx].src), dest, - bi_imm_u8(24)); - } + /* Component 3: 8-bit LOD */ + if (lod_idx >= 0 && + (!nir_src_is_const(instr->src[lod_idx].src) || + nir_src_as_uint(instr->src[lod_idx].src) != 0) && + nir_tex_instr_src_type(instr, lod_idx) != nir_type_float) { + dest = bi_lshift_or_i32(b, bi_src_index(&instr->src[lod_idx].src), dest, + bi_imm_u8(24)); + } - return dest; + return dest; } static void -bi_emit_cube_coord(bi_builder *b, bi_index coord, - bi_index *face, bi_index *s, bi_index *t) +bi_emit_cube_coord(bi_builder *b, bi_index coord, bi_index *face, bi_index *s, + bi_index *t) { - /* Compute max { |x|, |y|, |z| } */ - bi_index maxxyz = bi_temp(b->shader); - *face = bi_temp(b->shader); + /* Compute max { |x|, |y|, |z| } */ + bi_index maxxyz = bi_temp(b->shader); + *face = bi_temp(b->shader); - bi_index cx = bi_extract(b, coord, 0), - cy = bi_extract(b, coord, 1), - cz = bi_extract(b, coord, 2); + bi_index cx = bi_extract(b, coord, 0), cy = bi_extract(b, coord, 1), + cz = bi_extract(b, coord, 2); - /* Use a pseudo op on Bifrost due to tuple restrictions */ - if (b->shader->arch <= 8) { - bi_cubeface_to(b, maxxyz, *face, cx, cy, cz); - } else { - bi_cubeface1_to(b, maxxyz, cx, cy, cz); - bi_cubeface2_v9_to(b, *face, cx, cy, cz); - } + /* Use a pseudo op on Bifrost due to tuple restrictions */ + if (b->shader->arch <= 8) { + bi_cubeface_to(b, maxxyz, *face, cx, cy, cz); + } else { + bi_cubeface1_to(b, maxxyz, cx, cy, cz); + bi_cubeface2_v9_to(b, *face, cx, cy, cz); + } - /* Select coordinates */ - bi_index ssel = bi_cube_ssel(b, bi_extract(b, coord, 2), bi_extract(b, coord, 0), *face); - bi_index tsel = bi_cube_tsel(b, bi_extract(b, coord, 1), bi_extract(b, coord, 2), - *face); + /* Select coordinates */ + bi_index ssel = + bi_cube_ssel(b, bi_extract(b, coord, 2), bi_extract(b, coord, 0), *face); + bi_index tsel = + bi_cube_tsel(b, bi_extract(b, coord, 1), bi_extract(b, coord, 2), *face); - /* The OpenGL ES specification requires us to transform an input vector - * (x, y, z) to the coordinate, given the selected S/T: - * - * (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1)) - * - * We implement (s shown, t similar) in a form friendlier to FMA - * instructions, and clamp coordinates at the end for correct - * NaN/infinity handling: - * - * fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5) - * - * Take the reciprocal of max{x, y, z} - */ - bi_index rcp = bi_frcp_f32(b, maxxyz); + /* The OpenGL ES specification requires us to transform an input vector + * (x, y, z) to the coordinate, given the selected S/T: + * + * (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1)) + * + * We implement (s shown, t similar) in a form friendlier to FMA + * instructions, and clamp coordinates at the end for 
correct + * NaN/infinity handling: + * + * fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5) + * + * Take the reciprocal of max{x, y, z} + */ + bi_index rcp = bi_frcp_f32(b, maxxyz); - /* Calculate 0.5 * (1.0 / max{x, y, z}) */ - bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero()); + /* Calculate 0.5 * (1.0 / max{x, y, z}) */ + bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero()); - /* Transform the coordinates */ - *s = bi_temp(b->shader); - *t = bi_temp(b->shader); + /* Transform the coordinates */ + *s = bi_temp(b->shader); + *t = bi_temp(b->shader); - bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f)); - bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f)); + bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f)); + bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f)); - S->clamp = BI_CLAMP_CLAMP_0_1; - T->clamp = BI_CLAMP_CLAMP_0_1; + S->clamp = BI_CLAMP_CLAMP_0_1; + T->clamp = BI_CLAMP_CLAMP_0_1; } /* Emits a cube map descriptor, returning lower 32-bits and putting upper @@ -3383,10 +3359,10 @@ bi_emit_cube_coord(bi_builder *b, bi_index coord, static bi_index bi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t) { - bi_index face, s; - bi_emit_cube_coord(b, coord, &face, &s, t); - bi_index mask = bi_imm_u32(BITFIELD_MASK(29)); - return bi_mux_i32(b, s, face, mask, BI_MUX_BIT); + bi_index face, s; + bi_emit_cube_coord(b, coord, &face, &s, t); + bi_index mask = bi_imm_u32(BITFIELD_MASK(29)); + return bi_mux_i32(b, s, face, mask, BI_MUX_BIT); } /* Map to the main texture op used. Some of these (txd in particular) will @@ -3397,27 +3373,27 @@ bi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t) static enum bifrost_tex_op bi_tex_op(nir_texop op) { - switch (op) { - case nir_texop_tex: - case nir_texop_txb: - case nir_texop_txl: - case nir_texop_txd: - case nir_texop_tex_prefetch: - return BIFROST_TEX_OP_TEX; - case nir_texop_txf: - case nir_texop_txf_ms: - case nir_texop_txf_ms_fb: - case nir_texop_tg4: - return BIFROST_TEX_OP_FETCH; - case nir_texop_txs: - case nir_texop_lod: - case nir_texop_query_levels: - case nir_texop_texture_samples: - case nir_texop_samples_identical: - unreachable("should've been lowered"); - default: - unreachable("unsupported tex op"); - } + switch (op) { + case nir_texop_tex: + case nir_texop_txb: + case nir_texop_txl: + case nir_texop_txd: + case nir_texop_tex_prefetch: + return BIFROST_TEX_OP_TEX; + case nir_texop_txf: + case nir_texop_txf_ms: + case nir_texop_txf_ms_fb: + case nir_texop_tg4: + return BIFROST_TEX_OP_FETCH; + case nir_texop_txs: + case nir_texop_lod: + case nir_texop_query_levels: + case nir_texop_texture_samples: + case nir_texop_samples_identical: + unreachable("should've been lowered"); + default: + unreachable("unsupported tex op"); + } } /* Data registers required by texturing in the order they appear. 
All are @@ -3426,422 +3402,415 @@ bi_tex_op(nir_texop op) * ARRAY/SHADOW are exlusive, so TEXC in practice reads at most 8 registers */ enum bifrost_tex_dreg { - BIFROST_TEX_DREG_Z_COORD = 0, - BIFROST_TEX_DREG_Y_DELTAS = 1, - BIFROST_TEX_DREG_LOD = 2, - BIFROST_TEX_DREG_GRDESC_HI = 3, - BIFROST_TEX_DREG_SHADOW = 4, - BIFROST_TEX_DREG_ARRAY = 5, - BIFROST_TEX_DREG_OFFSETMS = 6, - BIFROST_TEX_DREG_SAMPLER = 7, - BIFROST_TEX_DREG_TEXTURE = 8, - BIFROST_TEX_DREG_COUNT, + BIFROST_TEX_DREG_Z_COORD = 0, + BIFROST_TEX_DREG_Y_DELTAS = 1, + BIFROST_TEX_DREG_LOD = 2, + BIFROST_TEX_DREG_GRDESC_HI = 3, + BIFROST_TEX_DREG_SHADOW = 4, + BIFROST_TEX_DREG_ARRAY = 5, + BIFROST_TEX_DREG_OFFSETMS = 6, + BIFROST_TEX_DREG_SAMPLER = 7, + BIFROST_TEX_DREG_TEXTURE = 8, + BIFROST_TEX_DREG_COUNT, }; static void bi_emit_texc(bi_builder *b, nir_tex_instr *instr) { - struct bifrost_texture_operation desc = { - .op = bi_tex_op(instr->op), - .offset_or_bias_disable = false, /* TODO */ - .shadow_or_clamp_disable = instr->is_shadow, - .array = instr->is_array, - .dimension = bifrost_tex_format(instr->sampler_dim), - .format = bi_texture_format(instr->dest_type | nir_dest_bit_size(instr->dest), BI_CLAMP_NONE), /* TODO */ - .mask = 0xF, - }; + struct bifrost_texture_operation desc = { + .op = bi_tex_op(instr->op), + .offset_or_bias_disable = false, /* TODO */ + .shadow_or_clamp_disable = instr->is_shadow, + .array = instr->is_array, + .dimension = bifrost_tex_format(instr->sampler_dim), + .format = + bi_texture_format(instr->dest_type | nir_dest_bit_size(instr->dest), + BI_CLAMP_NONE), /* TODO */ + .mask = 0xF, + }; - switch (desc.op) { - case BIFROST_TEX_OP_TEX: - desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE; - break; - case BIFROST_TEX_OP_FETCH: - desc.lod_or_fetch = (enum bifrost_lod_mode) - (instr->op == nir_texop_tg4 ? - BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component : - BIFROST_TEXTURE_FETCH_TEXEL); - break; - default: - unreachable("texture op unsupported"); - } + switch (desc.op) { + case BIFROST_TEX_OP_TEX: + desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE; + break; + case BIFROST_TEX_OP_FETCH: + desc.lod_or_fetch = (enum bifrost_lod_mode)( + instr->op == nir_texop_tg4 + ? 
BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component + : BIFROST_TEXTURE_FETCH_TEXEL); + break; + default: + unreachable("texture op unsupported"); + } - /* 32-bit indices to be allocated as consecutive staging registers */ - bi_index dregs[BIFROST_TEX_DREG_COUNT] = { }; - bi_index cx = bi_null(), cy = bi_null(); + /* 32-bit indices to be allocated as consecutive staging registers */ + bi_index dregs[BIFROST_TEX_DREG_COUNT] = {}; + bi_index cx = bi_null(), cy = bi_null(); - for (unsigned i = 0; i < instr->num_srcs; ++i) { - bi_index index = bi_src_index(&instr->src[i].src); - unsigned sz = nir_src_bit_size(instr->src[i].src); - unsigned components = nir_src_num_components(instr->src[i].src); - ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i); - nir_alu_type T = base | sz; + for (unsigned i = 0; i < instr->num_srcs; ++i) { + bi_index index = bi_src_index(&instr->src[i].src); + unsigned sz = nir_src_bit_size(instr->src[i].src); + unsigned components = nir_src_num_components(instr->src[i].src); + ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i); + nir_alu_type T = base | sz; - switch (instr->src[i].src_type) { - case nir_tex_src_coord: - if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { - cx = bi_emit_texc_cube_coord(b, index, &cy); - } else { - /* Copy XY (for 2D+) or XX (for 1D) */ - cx = bi_extract(b, index, 0); - cy = bi_extract(b, index, MIN2(1, components - 1)); + switch (instr->src[i].src_type) { + case nir_tex_src_coord: + if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { + cx = bi_emit_texc_cube_coord(b, index, &cy); + } else { + /* Copy XY (for 2D+) or XX (for 1D) */ + cx = bi_extract(b, index, 0); + cy = bi_extract(b, index, MIN2(1, components - 1)); - assert(components >= 1 && components <= 3); + assert(components >= 1 && components <= 3); - if (components == 3 && !desc.array) { - /* 3D */ - dregs[BIFROST_TEX_DREG_Z_COORD] = - bi_extract(b, index, 2); - } - } + if (components == 3 && !desc.array) { + /* 3D */ + dregs[BIFROST_TEX_DREG_Z_COORD] = bi_extract(b, index, 2); + } + } - if (desc.array) { - dregs[BIFROST_TEX_DREG_ARRAY] = - bi_emit_texc_array_index(b, - bi_extract(b, index, components - 1), T); - } + if (desc.array) { + dregs[BIFROST_TEX_DREG_ARRAY] = bi_emit_texc_array_index( + b, bi_extract(b, index, components - 1), T); + } - break; + break; - case nir_tex_src_lod: - if (desc.op == BIFROST_TEX_OP_TEX && - nir_src_is_const(instr->src[i].src) && - nir_src_as_uint(instr->src[i].src) == 0) { - desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO; - } else if (desc.op == BIFROST_TEX_OP_TEX) { - assert(base == nir_type_float); + case nir_tex_src_lod: + if (desc.op == BIFROST_TEX_OP_TEX && + nir_src_is_const(instr->src[i].src) && + nir_src_as_uint(instr->src[i].src) == 0) { + desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO; + } else if (desc.op == BIFROST_TEX_OP_TEX) { + assert(base == nir_type_float); - assert(sz == 16 || sz == 32); - dregs[BIFROST_TEX_DREG_LOD] = - bi_emit_texc_lod_88(b, index, sz == 16); - desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT; - } else { - assert(desc.op == BIFROST_TEX_OP_FETCH); - assert(base == nir_type_uint || base == nir_type_int); - assert(sz == 16 || sz == 32); + assert(sz == 16 || sz == 32); + dregs[BIFROST_TEX_DREG_LOD] = + bi_emit_texc_lod_88(b, index, sz == 16); + desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT; + } else { + assert(desc.op == BIFROST_TEX_OP_FETCH); + assert(base == nir_type_uint || base == nir_type_int); + assert(sz == 16 || sz == 32); - dregs[BIFROST_TEX_DREG_LOD] = - bi_emit_texc_lod_cube(b, index); - } + 
dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_cube(b, index); + } - break; + break; - case nir_tex_src_bias: - /* Upper 16-bits interpreted as a clamp, leave zero */ - assert(desc.op == BIFROST_TEX_OP_TEX); - assert(base == nir_type_float); - assert(sz == 16 || sz == 32); - dregs[BIFROST_TEX_DREG_LOD] = - bi_emit_texc_lod_88(b, index, sz == 16); - desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS; - break; + case nir_tex_src_bias: + /* Upper 16-bits interpreted as a clamp, leave zero */ + assert(desc.op == BIFROST_TEX_OP_TEX); + assert(base == nir_type_float); + assert(sz == 16 || sz == 32); + dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_88(b, index, sz == 16); + desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS; + break; - case nir_tex_src_ms_index: - case nir_tex_src_offset: - if (desc.offset_or_bias_disable) - break; + case nir_tex_src_ms_index: + case nir_tex_src_offset: + if (desc.offset_or_bias_disable) + break; - dregs[BIFROST_TEX_DREG_OFFSETMS] = - bi_emit_texc_offset_ms_index(b, instr); - if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero())) - desc.offset_or_bias_disable = true; - break; + dregs[BIFROST_TEX_DREG_OFFSETMS] = + bi_emit_texc_offset_ms_index(b, instr); + if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero())) + desc.offset_or_bias_disable = true; + break; - case nir_tex_src_comparator: - dregs[BIFROST_TEX_DREG_SHADOW] = index; - break; + case nir_tex_src_comparator: + dregs[BIFROST_TEX_DREG_SHADOW] = index; + break; - case nir_tex_src_texture_offset: - if (instr->texture_index) - index = bi_iadd_u32(b, index, bi_imm_u32(instr->texture_index), false); + case nir_tex_src_texture_offset: + if (instr->texture_index) + index = + bi_iadd_u32(b, index, bi_imm_u32(instr->texture_index), false); - dregs[BIFROST_TEX_DREG_TEXTURE] = index; + dregs[BIFROST_TEX_DREG_TEXTURE] = index; - break; + break; - case nir_tex_src_sampler_offset: - if (instr->sampler_index) - index = bi_iadd_u32(b, index, bi_imm_u32(instr->sampler_index), false); + case nir_tex_src_sampler_offset: + if (instr->sampler_index) + index = + bi_iadd_u32(b, index, bi_imm_u32(instr->sampler_index), false); - dregs[BIFROST_TEX_DREG_SAMPLER] = index; - break; + dregs[BIFROST_TEX_DREG_SAMPLER] = index; + break; - default: - unreachable("Unhandled src type in texc emit"); - } - } + default: + unreachable("Unhandled src type in texc emit"); + } + } - if (desc.op == BIFROST_TEX_OP_FETCH && bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) { - dregs[BIFROST_TEX_DREG_LOD] = - bi_emit_texc_lod_cube(b, bi_zero()); - } + if (desc.op == BIFROST_TEX_OP_FETCH && + bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) { + dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_cube(b, bi_zero()); + } - /* Choose an index mode */ + /* Choose an index mode */ - bool direct_tex = bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]); - bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]); - bool direct = direct_tex && direct_samp; + bool direct_tex = bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]); + bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]); + bool direct = direct_tex && direct_samp; - desc.immediate_indices = direct && (instr->sampler_index < 16); + desc.immediate_indices = direct && (instr->sampler_index < 16); - if (desc.immediate_indices) { - desc.sampler_index_or_mode = instr->sampler_index; - desc.index = instr->texture_index; - } else { - unsigned mode = 0; + if (desc.immediate_indices) { + desc.sampler_index_or_mode = instr->sampler_index; + desc.index = instr->texture_index; + } else { + unsigned mode = 0; - if (direct && 
instr->sampler_index == instr->texture_index) { - mode = BIFROST_INDEX_IMMEDIATE_SHARED; - desc.index = instr->texture_index; - } else if (direct) { - mode = BIFROST_INDEX_IMMEDIATE_SAMPLER; - desc.index = instr->sampler_index; - dregs[BIFROST_TEX_DREG_TEXTURE] = bi_mov_i32(b, - bi_imm_u32(instr->texture_index)); - } else if (direct_tex) { - assert(!direct_samp); - mode = BIFROST_INDEX_IMMEDIATE_TEXTURE; - desc.index = instr->texture_index; - } else if (direct_samp) { - assert(!direct_tex); - mode = BIFROST_INDEX_IMMEDIATE_SAMPLER; - desc.index = instr->sampler_index; - } else { - mode = BIFROST_INDEX_REGISTER; - } + if (direct && instr->sampler_index == instr->texture_index) { + mode = BIFROST_INDEX_IMMEDIATE_SHARED; + desc.index = instr->texture_index; + } else if (direct) { + mode = BIFROST_INDEX_IMMEDIATE_SAMPLER; + desc.index = instr->sampler_index; + dregs[BIFROST_TEX_DREG_TEXTURE] = + bi_mov_i32(b, bi_imm_u32(instr->texture_index)); + } else if (direct_tex) { + assert(!direct_samp); + mode = BIFROST_INDEX_IMMEDIATE_TEXTURE; + desc.index = instr->texture_index; + } else if (direct_samp) { + assert(!direct_tex); + mode = BIFROST_INDEX_IMMEDIATE_SAMPLER; + desc.index = instr->sampler_index; + } else { + mode = BIFROST_INDEX_REGISTER; + } - mode |= (BIFROST_TEXTURE_OPERATION_SINGLE << 2); - desc.sampler_index_or_mode = mode; - } + mode |= (BIFROST_TEXTURE_OPERATION_SINGLE << 2); + desc.sampler_index_or_mode = mode; + } - /* Allocate staging registers contiguously by compacting the array. */ - unsigned sr_count = 0; + /* Allocate staging registers contiguously by compacting the array. */ + unsigned sr_count = 0; - for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) { - if (!bi_is_null(dregs[i])) - dregs[sr_count++] = dregs[i]; - } + for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) { + if (!bi_is_null(dregs[i])) + dregs[sr_count++] = dregs[i]; + } - unsigned res_size = nir_dest_bit_size(instr->dest) == 16 ? 2 : 4; + unsigned res_size = nir_dest_bit_size(instr->dest) == 16 ? 2 : 4; - bi_index sr = sr_count ? bi_temp(b->shader) : bi_null(); - bi_index dst = bi_temp(b->shader); + bi_index sr = sr_count ? 
bi_temp(b->shader) : bi_null(); + bi_index dst = bi_temp(b->shader); - if (sr_count) - bi_emit_collect_to(b, sr, dregs, sr_count); + if (sr_count) + bi_emit_collect_to(b, sr, dregs, sr_count); - uint32_t desc_u = 0; - memcpy(&desc_u, &desc, sizeof(desc_u)); - bi_instr *I = bi_texc_to(b, dst, sr, cx, cy, bi_imm_u32(desc_u), - !nir_tex_instr_has_implicit_derivative(instr), - sr_count, 0); - I->register_format = bi_reg_fmt_for_nir(instr->dest_type); + uint32_t desc_u = 0; + memcpy(&desc_u, &desc, sizeof(desc_u)); + bi_instr *I = + bi_texc_to(b, dst, sr, cx, cy, bi_imm_u32(desc_u), + !nir_tex_instr_has_implicit_derivative(instr), sr_count, 0); + I->register_format = bi_reg_fmt_for_nir(instr->dest_type); - bi_index w[4] = { bi_null(), bi_null(), bi_null(), bi_null() }; - bi_emit_split_i32(b, w, dst, res_size); - bi_emit_collect_to(b, bi_dest_index(&instr->dest), w, - DIV_ROUND_UP(nir_dest_num_components(instr->dest) * res_size, 4)); + bi_index w[4] = {bi_null(), bi_null(), bi_null(), bi_null()}; + bi_emit_split_i32(b, w, dst, res_size); + bi_emit_collect_to( + b, bi_dest_index(&instr->dest), w, + DIV_ROUND_UP(nir_dest_num_components(instr->dest) * res_size, 4)); } /* Staging registers required by texturing in the order they appear (Valhall) */ enum valhall_tex_sreg { - VALHALL_TEX_SREG_X_COORD = 0, - VALHALL_TEX_SREG_Y_COORD = 1, - VALHALL_TEX_SREG_Z_COORD = 2, - VALHALL_TEX_SREG_Y_DELTAS = 3, - VALHALL_TEX_SREG_ARRAY = 4, - VALHALL_TEX_SREG_SHADOW = 5, - VALHALL_TEX_SREG_OFFSETMS = 6, - VALHALL_TEX_SREG_LOD = 7, - VALHALL_TEX_SREG_GRDESC = 8, - VALHALL_TEX_SREG_COUNT, + VALHALL_TEX_SREG_X_COORD = 0, + VALHALL_TEX_SREG_Y_COORD = 1, + VALHALL_TEX_SREG_Z_COORD = 2, + VALHALL_TEX_SREG_Y_DELTAS = 3, + VALHALL_TEX_SREG_ARRAY = 4, + VALHALL_TEX_SREG_SHADOW = 5, + VALHALL_TEX_SREG_OFFSETMS = 6, + VALHALL_TEX_SREG_LOD = 7, + VALHALL_TEX_SREG_GRDESC = 8, + VALHALL_TEX_SREG_COUNT, }; static void bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *instr) { - bool explicit_offset = false; - enum bi_va_lod_mode lod_mode = BI_VA_LOD_MODE_COMPUTED_LOD; + bool explicit_offset = false; + enum bi_va_lod_mode lod_mode = BI_VA_LOD_MODE_COMPUTED_LOD; - bool has_lod_mode = - (instr->op == nir_texop_tex) || - (instr->op == nir_texop_txl) || - (instr->op == nir_texop_txb); + bool has_lod_mode = (instr->op == nir_texop_tex) || + (instr->op == nir_texop_txl) || + (instr->op == nir_texop_txb); - /* 32-bit indices to be allocated as consecutive staging registers */ - bi_index sregs[VALHALL_TEX_SREG_COUNT] = { }; + /* 32-bit indices to be allocated as consecutive staging registers */ + bi_index sregs[VALHALL_TEX_SREG_COUNT] = {}; + bool has_sampler = nir_tex_instr_need_sampler(instr); + bi_index sampler = bi_imm_u32(has_sampler ? instr->sampler_index : 0); + bi_index texture = bi_imm_u32(instr->texture_index); + uint32_t tables = (PAN_TABLE_SAMPLER << 11) | (PAN_TABLE_TEXTURE << 27); - bool has_sampler = nir_tex_instr_need_sampler(instr); - bi_index sampler = bi_imm_u32(has_sampler ? 
instr->sampler_index : 0); - bi_index texture = bi_imm_u32(instr->texture_index); - uint32_t tables = (PAN_TABLE_SAMPLER << 11) | (PAN_TABLE_TEXTURE << 27); + for (unsigned i = 0; i < instr->num_srcs; ++i) { + bi_index index = bi_src_index(&instr->src[i].src); + unsigned sz = nir_src_bit_size(instr->src[i].src); + unsigned components = nir_src_num_components(instr->src[i].src); - for (unsigned i = 0; i < instr->num_srcs; ++i) { - bi_index index = bi_src_index(&instr->src[i].src); - unsigned sz = nir_src_bit_size(instr->src[i].src); - unsigned components = nir_src_num_components(instr->src[i].src); + switch (instr->src[i].src_type) { + case nir_tex_src_coord: + if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { + sregs[VALHALL_TEX_SREG_X_COORD] = bi_emit_texc_cube_coord( + b, index, &sregs[VALHALL_TEX_SREG_Y_COORD]); + } else { + assert(components >= 1 && components <= 3); - switch (instr->src[i].src_type) { - case nir_tex_src_coord: - if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { - sregs[VALHALL_TEX_SREG_X_COORD] = - bi_emit_texc_cube_coord(b, index, - &sregs[VALHALL_TEX_SREG_Y_COORD]); - } else { - assert(components >= 1 && components <= 3); + /* Copy XY (for 2D+) or XX (for 1D) */ + sregs[VALHALL_TEX_SREG_X_COORD] = index; - /* Copy XY (for 2D+) or XX (for 1D) */ - sregs[VALHALL_TEX_SREG_X_COORD] = index; + if (components >= 2) + sregs[VALHALL_TEX_SREG_Y_COORD] = bi_extract(b, index, 1); - if (components >= 2) - sregs[VALHALL_TEX_SREG_Y_COORD] = bi_extract(b, index, 1); + if (components == 3 && !instr->is_array) { + sregs[VALHALL_TEX_SREG_Z_COORD] = bi_extract(b, index, 2); + } + } - if (components == 3 && !instr->is_array) { - sregs[VALHALL_TEX_SREG_Z_COORD] = - bi_extract(b, index, 2); - } - } + if (instr->is_array) { + sregs[VALHALL_TEX_SREG_ARRAY] = + bi_extract(b, index, components - 1); + } - if (instr->is_array) { - sregs[VALHALL_TEX_SREG_ARRAY] = - bi_extract(b, index, components - 1); - } + break; - break; + case nir_tex_src_lod: + if (nir_src_is_const(instr->src[i].src) && + nir_src_as_uint(instr->src[i].src) == 0) { + lod_mode = BI_VA_LOD_MODE_ZERO_LOD; + } else if (has_lod_mode) { + lod_mode = BI_VA_LOD_MODE_EXPLICIT; - case nir_tex_src_lod: - if (nir_src_is_const(instr->src[i].src) && - nir_src_as_uint(instr->src[i].src) == 0) { - lod_mode = BI_VA_LOD_MODE_ZERO_LOD; - } else if (has_lod_mode) { - lod_mode = BI_VA_LOD_MODE_EXPLICIT; + assert(sz == 16 || sz == 32); + sregs[VALHALL_TEX_SREG_LOD] = + bi_emit_texc_lod_88(b, index, sz == 16); + } + break; - assert(sz == 16 || sz == 32); - sregs[VALHALL_TEX_SREG_LOD] = - bi_emit_texc_lod_88(b, index, sz == 16); - } - break; + case nir_tex_src_bias: + /* Upper 16-bits interpreted as a clamp, leave zero */ + assert(sz == 16 || sz == 32); + sregs[VALHALL_TEX_SREG_LOD] = bi_emit_texc_lod_88(b, index, sz == 16); - case nir_tex_src_bias: - /* Upper 16-bits interpreted as a clamp, leave zero */ - assert(sz == 16 || sz == 32); - sregs[VALHALL_TEX_SREG_LOD] = - bi_emit_texc_lod_88(b, index, sz == 16); + lod_mode = BI_VA_LOD_MODE_COMPUTED_BIAS; + break; + case nir_tex_src_ms_index: + case nir_tex_src_offset: + /* Handled below */ + break; - lod_mode = BI_VA_LOD_MODE_COMPUTED_BIAS; - break; - case nir_tex_src_ms_index: - case nir_tex_src_offset: - /* Handled below */ - break; + case nir_tex_src_comparator: + sregs[VALHALL_TEX_SREG_SHADOW] = index; + break; - case nir_tex_src_comparator: - sregs[VALHALL_TEX_SREG_SHADOW] = index; - break; + case nir_tex_src_texture_offset: + assert(instr->texture_index == 0); + texture = index; + break; - 
case nir_tex_src_texture_offset: - assert(instr->texture_index == 0); - texture = index; - break; + case nir_tex_src_sampler_offset: + assert(instr->sampler_index == 0); + sampler = index; + break; - case nir_tex_src_sampler_offset: - assert(instr->sampler_index == 0); - sampler = index; - break; + default: + unreachable("Unhandled src type in tex emit"); + } + } - default: - unreachable("Unhandled src type in tex emit"); - } - } + /* Generate packed offset + ms index + LOD register. These default to + * zero so we only need to encode if these features are actually in use. + */ + bi_index offsets = bi_emit_valhall_offsets(b, instr); - /* Generate packed offset + ms index + LOD register. These default to - * zero so we only need to encode if these features are actually in use. - */ - bi_index offsets = bi_emit_valhall_offsets(b, instr); + if (!bi_is_equiv(offsets, bi_zero())) { + sregs[VALHALL_TEX_SREG_OFFSETMS] = offsets; + explicit_offset = true; + } - if (!bi_is_equiv(offsets, bi_zero())) { - sregs[VALHALL_TEX_SREG_OFFSETMS] = offsets; - explicit_offset = true; - } + /* Allocate staging registers contiguously by compacting the array. */ + unsigned sr_count = 0; - /* Allocate staging registers contiguously by compacting the array. */ - unsigned sr_count = 0; + for (unsigned i = 0; i < ARRAY_SIZE(sregs); ++i) { + if (!bi_is_null(sregs[i])) + sregs[sr_count++] = sregs[i]; + } - for (unsigned i = 0; i < ARRAY_SIZE(sregs); ++i) { - if (!bi_is_null(sregs[i])) - sregs[sr_count++] = sregs[i]; - } + bi_index idx = sr_count ? bi_temp(b->shader) : bi_null(); - bi_index idx = sr_count ? bi_temp(b->shader) : bi_null(); + if (sr_count) + bi_make_vec_to(b, idx, sregs, NULL, sr_count, 32); - if (sr_count) - bi_make_vec_to(b, idx, sregs, NULL, sr_count, 32); + bi_index image_src = bi_imm_u32(tables); + image_src = bi_lshift_or_i32(b, sampler, image_src, bi_imm_u8(0)); + image_src = bi_lshift_or_i32(b, texture, image_src, bi_imm_u8(16)); - bi_index image_src = bi_imm_u32(tables); - image_src = bi_lshift_or_i32(b, sampler, image_src, bi_imm_u8(0)); - image_src = bi_lshift_or_i32(b, texture, image_src, bi_imm_u8(16)); + /* Only write the components that we actually read */ + unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa); + unsigned comps_per_reg = nir_dest_bit_size(instr->dest) == 16 ? 2 : 1; + unsigned res_size = DIV_ROUND_UP(util_bitcount(mask), comps_per_reg); + enum bi_register_format regfmt = bi_reg_fmt_for_nir(instr->dest_type); + enum bi_dimension dim = valhall_tex_dimension(instr->sampler_dim); + bi_index dest = bi_temp(b->shader); - /* Only write the components that we actually read */ - unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa); - unsigned comps_per_reg = nir_dest_bit_size(instr->dest) == 16 ? 
2 : 1; - unsigned res_size = DIV_ROUND_UP(util_bitcount(mask), comps_per_reg); + switch (instr->op) { + case nir_texop_tex: + case nir_texop_txl: + case nir_texop_txb: + bi_tex_single_to(b, dest, idx, image_src, bi_zero(), instr->is_array, dim, + regfmt, instr->is_shadow, explicit_offset, lod_mode, + mask, sr_count); + break; + case nir_texop_txf: + case nir_texop_txf_ms: + bi_tex_fetch_to(b, dest, idx, image_src, bi_zero(), instr->is_array, dim, + regfmt, explicit_offset, mask, sr_count); + break; + case nir_texop_tg4: + bi_tex_gather_to(b, dest, idx, image_src, bi_zero(), instr->is_array, dim, + instr->component, false, regfmt, instr->is_shadow, + explicit_offset, mask, sr_count); + break; + default: + unreachable("Unhandled Valhall texture op"); + } - enum bi_register_format regfmt = bi_reg_fmt_for_nir(instr->dest_type); - enum bi_dimension dim = valhall_tex_dimension(instr->sampler_dim); - bi_index dest = bi_temp(b->shader); + /* The hardware will write only what we read, and it will into + * contiguous registers without gaps (different from Bifrost). NIR + * expects the gaps, so fill in the holes (they'll be copypropped and + * DCE'd away later). + */ + bi_index unpacked[4] = {bi_null(), bi_null(), bi_null(), bi_null()}; - switch (instr->op) { - case nir_texop_tex: - case nir_texop_txl: - case nir_texop_txb: - bi_tex_single_to(b, dest, idx, image_src, bi_zero(), - instr->is_array, dim, regfmt, instr->is_shadow, - explicit_offset, lod_mode, mask, sr_count); - break; - case nir_texop_txf: - case nir_texop_txf_ms: - bi_tex_fetch_to(b, dest, idx, image_src, bi_zero(), - instr->is_array, dim, regfmt, explicit_offset, - mask, sr_count); - break; - case nir_texop_tg4: - bi_tex_gather_to(b, dest, idx, image_src, bi_zero(), - instr->is_array, dim, instr->component, false, - regfmt, instr->is_shadow, explicit_offset, - mask, sr_count); - break; - default: - unreachable("Unhandled Valhall texture op"); - } + bi_emit_cached_split_i32(b, dest, res_size); - /* The hardware will write only what we read, and it will into - * contiguous registers without gaps (different from Bifrost). NIR - * expects the gaps, so fill in the holes (they'll be copypropped and - * DCE'd away later). 
- */ - bi_index unpacked[4] = { bi_null(), bi_null(), bi_null(), bi_null() }; + /* Index into the packed component array */ + unsigned j = 0; + unsigned comps[4] = {0}; + unsigned nr_components = nir_dest_num_components(instr->dest); - bi_emit_cached_split_i32(b, dest, res_size); + for (unsigned i = 0; i < nr_components; ++i) { + if (mask & BITFIELD_BIT(i)) { + unpacked[i] = dest; + comps[i] = j++; + } else { + unpacked[i] = bi_zero(); + } + } - /* Index into the packed component array */ - unsigned j = 0; - unsigned comps[4] = { 0 }; - unsigned nr_components = nir_dest_num_components(instr->dest); - - for (unsigned i = 0; i < nr_components; ++i) { - if (mask & BITFIELD_BIT(i)) { - unpacked[i] = dest; - comps[i] = j++; - } else { - unpacked[i] = bi_zero(); - } - } - - bi_make_vec_to(b, bi_dest_index(&instr->dest), unpacked, - comps, nir_dest_num_components(instr->dest), - nir_dest_bit_size(instr->dest)); + bi_make_vec_to(b, bi_dest_index(&instr->dest), unpacked, comps, + nir_dest_num_components(instr->dest), + nir_dest_bit_size(instr->dest)); } /* Simple textures ops correspond to NIR tex or txl with LOD = 0 on 2D/cube @@ -3851,114 +3820,112 @@ bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *instr) static void bi_emit_texs(bi_builder *b, nir_tex_instr *instr) { - int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord); - assert(coord_idx >= 0); - bi_index coords = bi_src_index(&instr->src[coord_idx].src); + int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord); + assert(coord_idx >= 0); + bi_index coords = bi_src_index(&instr->src[coord_idx].src); - if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { - bi_index face, s, t; - bi_emit_cube_coord(b, coords, &face, &s, &t); + if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { + bi_index face, s, t; + bi_emit_cube_coord(b, coords, &face, &s, &t); - bi_texs_cube_to(b, nir_dest_bit_size(instr->dest), - bi_dest_index(&instr->dest), - s, t, face, - instr->sampler_index, instr->texture_index); - } else { - bi_texs_2d_to(b, nir_dest_bit_size(instr->dest), - bi_dest_index(&instr->dest), - bi_extract(b, coords, 0), - bi_extract(b, coords, 1), - instr->op != nir_texop_tex, /* zero LOD */ - instr->sampler_index, instr->texture_index); - } + bi_texs_cube_to(b, nir_dest_bit_size(instr->dest), + bi_dest_index(&instr->dest), s, t, face, + instr->sampler_index, instr->texture_index); + } else { + bi_texs_2d_to(b, nir_dest_bit_size(instr->dest), + bi_dest_index(&instr->dest), bi_extract(b, coords, 0), + bi_extract(b, coords, 1), + instr->op != nir_texop_tex, /* zero LOD */ + instr->sampler_index, instr->texture_index); + } - bi_split_dest(b, instr->dest); + bi_split_dest(b, instr->dest); } static bool bi_is_simple_tex(nir_tex_instr *instr) { - if (instr->op != nir_texop_tex && instr->op != nir_texop_txl) - return false; + if (instr->op != nir_texop_tex && instr->op != nir_texop_txl) + return false; - if (instr->dest_type != nir_type_float32 && - instr->dest_type != nir_type_float16) - return false; + if (instr->dest_type != nir_type_float32 && + instr->dest_type != nir_type_float16) + return false; - if (instr->is_shadow || instr->is_array) - return false; + if (instr->is_shadow || instr->is_array) + return false; - switch (instr->sampler_dim) { - case GLSL_SAMPLER_DIM_2D: - case GLSL_SAMPLER_DIM_EXTERNAL: - case GLSL_SAMPLER_DIM_RECT: - break; + switch (instr->sampler_dim) { + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_EXTERNAL: + case GLSL_SAMPLER_DIM_RECT: + break; - case GLSL_SAMPLER_DIM_CUBE: - /* LOD can't be specified with 
TEXS_CUBE */ - if (instr->op == nir_texop_txl) - return false; - break; + case GLSL_SAMPLER_DIM_CUBE: + /* LOD can't be specified with TEXS_CUBE */ + if (instr->op == nir_texop_txl) + return false; + break; - default: - return false; - } + default: + return false; + } - for (unsigned i = 0; i < instr->num_srcs; ++i) { - if (instr->src[i].src_type != nir_tex_src_lod && - instr->src[i].src_type != nir_tex_src_coord) - return false; - } + for (unsigned i = 0; i < instr->num_srcs; ++i) { + if (instr->src[i].src_type != nir_tex_src_lod && + instr->src[i].src_type != nir_tex_src_coord) + return false; + } - /* Indices need to fit in provided bits */ - unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 2 : 3; - if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits)) - return false; + /* Indices need to fit in provided bits */ + unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 2 : 3; + if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits)) + return false; - int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod); - if (lod_idx < 0) - return true; + int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod); + if (lod_idx < 0) + return true; - nir_src lod = instr->src[lod_idx].src; - return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0; + nir_src lod = instr->src[lod_idx].src; + return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0; } static void bi_emit_tex(bi_builder *b, nir_tex_instr *instr) { - switch (instr->op) { - case nir_texop_txs: - bi_load_sysval_to(b, bi_dest_index(&instr->dest), - panfrost_sysval_for_instr(&instr->instr, NULL), - nir_dest_num_components(instr->dest), 0); - return; - case nir_texop_tex: - case nir_texop_txl: - case nir_texop_txb: - case nir_texop_txf: - case nir_texop_txf_ms: - case nir_texop_tg4: - break; - default: - unreachable("Invalid texture operation"); - } + switch (instr->op) { + case nir_texop_txs: + bi_load_sysval_to(b, bi_dest_index(&instr->dest), + panfrost_sysval_for_instr(&instr->instr, NULL), + nir_dest_num_components(instr->dest), 0); + return; + case nir_texop_tex: + case nir_texop_txl: + case nir_texop_txb: + case nir_texop_txf: + case nir_texop_txf_ms: + case nir_texop_tg4: + break; + default: + unreachable("Invalid texture operation"); + } - if (b->shader->arch >= 9) - bi_emit_tex_valhall(b, instr); - else if (bi_is_simple_tex(instr)) - bi_emit_texs(b, instr); - else - bi_emit_texc(b, instr); + if (b->shader->arch >= 9) + bi_emit_tex_valhall(b, instr); + else if (bi_is_simple_tex(instr)) + bi_emit_texs(b, instr); + else + bi_emit_texc(b, instr); } static void bi_emit_phi(bi_builder *b, nir_phi_instr *instr) { - unsigned nr_srcs = exec_list_length(&instr->srcs); - bi_instr *I = bi_phi_to(b, bi_dest_index(&instr->dest), nr_srcs); + unsigned nr_srcs = exec_list_length(&instr->srcs); + bi_instr *I = bi_phi_to(b, bi_dest_index(&instr->dest), nr_srcs); - /* Deferred */ - I->phi = instr; + /* Deferred */ + I->phi = instr; } /* Look up the AGX block corresponding to a given NIR block. 
Used when @@ -3967,266 +3934,267 @@ bi_emit_phi(bi_builder *b, nir_phi_instr *instr) static bi_block * bi_from_nir_block(bi_context *ctx, nir_block *block) { - return ctx->indexed_nir_blocks[block->index]; + return ctx->indexed_nir_blocks[block->index]; } static void bi_emit_phi_deferred(bi_context *ctx, bi_block *block, bi_instr *I) { - nir_phi_instr *phi = I->phi; + nir_phi_instr *phi = I->phi; - /* Guaranteed by lower_phis_to_scalar */ - assert(phi->dest.ssa.num_components == 1); + /* Guaranteed by lower_phis_to_scalar */ + assert(phi->dest.ssa.num_components == 1); - nir_foreach_phi_src(src, phi) { - bi_block *pred = bi_from_nir_block(ctx, src->pred); - unsigned i = bi_predecessor_index(block, pred); - assert(i < I->nr_srcs); + nir_foreach_phi_src(src, phi) { + bi_block *pred = bi_from_nir_block(ctx, src->pred); + unsigned i = bi_predecessor_index(block, pred); + assert(i < I->nr_srcs); - I->src[i] = bi_src_index(&src->src); - } + I->src[i] = bi_src_index(&src->src); + } - I->phi = NULL; + I->phi = NULL; } static void bi_emit_phis_deferred(bi_context *ctx) { - bi_foreach_block(ctx, block) { - bi_foreach_instr_in_block(block, I) { - if (I->op == BI_OPCODE_PHI) - bi_emit_phi_deferred(ctx, block, I); - } - } + bi_foreach_block(ctx, block) { + bi_foreach_instr_in_block(block, I) { + if (I->op == BI_OPCODE_PHI) + bi_emit_phi_deferred(ctx, block, I); + } + } } static void bi_emit_instr(bi_builder *b, struct nir_instr *instr) { - switch (instr->type) { - case nir_instr_type_load_const: - bi_emit_load_const(b, nir_instr_as_load_const(instr)); - break; + switch (instr->type) { + case nir_instr_type_load_const: + bi_emit_load_const(b, nir_instr_as_load_const(instr)); + break; - case nir_instr_type_intrinsic: - bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr)); - break; + case nir_instr_type_intrinsic: + bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr)); + break; - case nir_instr_type_alu: - bi_emit_alu(b, nir_instr_as_alu(instr)); - break; + case nir_instr_type_alu: + bi_emit_alu(b, nir_instr_as_alu(instr)); + break; - case nir_instr_type_tex: - bi_emit_tex(b, nir_instr_as_tex(instr)); - break; + case nir_instr_type_tex: + bi_emit_tex(b, nir_instr_as_tex(instr)); + break; - case nir_instr_type_jump: - bi_emit_jump(b, nir_instr_as_jump(instr)); - break; + case nir_instr_type_jump: + bi_emit_jump(b, nir_instr_as_jump(instr)); + break; - case nir_instr_type_phi: - bi_emit_phi(b, nir_instr_as_phi(instr)); - break; + case nir_instr_type_phi: + bi_emit_phi(b, nir_instr_as_phi(instr)); + break; - default: - unreachable("should've been lowered"); - } + default: + unreachable("should've been lowered"); + } } static bi_block * create_empty_block(bi_context *ctx) { - bi_block *blk = rzalloc(ctx, bi_block); + bi_block *blk = rzalloc(ctx, bi_block); - util_dynarray_init(&blk->predecessors, blk); + util_dynarray_init(&blk->predecessors, blk); - return blk; + return blk; } static bi_block * emit_block(bi_context *ctx, nir_block *block) { - if (ctx->after_block) { - ctx->current_block = ctx->after_block; - ctx->after_block = NULL; - } else { - ctx->current_block = create_empty_block(ctx); - } + if (ctx->after_block) { + ctx->current_block = ctx->after_block; + ctx->after_block = NULL; + } else { + ctx->current_block = create_empty_block(ctx); + } - list_addtail(&ctx->current_block->link, &ctx->blocks); - list_inithead(&ctx->current_block->instructions); + list_addtail(&ctx->current_block->link, &ctx->blocks); + list_inithead(&ctx->current_block->instructions); - bi_builder _b = bi_init_builder(ctx, 
bi_after_block(ctx->current_block)); + bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block)); - ctx->indexed_nir_blocks[block->index] = ctx->current_block; + ctx->indexed_nir_blocks[block->index] = ctx->current_block; - nir_foreach_instr(instr, block) { - bi_emit_instr(&_b, instr); - } + nir_foreach_instr(instr, block) { + bi_emit_instr(&_b, instr); + } - return ctx->current_block; + return ctx->current_block; } static void emit_if(bi_context *ctx, nir_if *nif) { - bi_block *before_block = ctx->current_block; + bi_block *before_block = ctx->current_block; - /* Speculatively emit the branch, but we can't fill it in until later */ - bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block)); - bi_instr *then_branch = bi_branchz_i16(&_b, - bi_half(bi_src_index(&nif->condition), false), - bi_zero(), BI_CMPF_EQ); + /* Speculatively emit the branch, but we can't fill it in until later */ + bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block)); + bi_instr *then_branch = + bi_branchz_i16(&_b, bi_half(bi_src_index(&nif->condition), false), + bi_zero(), BI_CMPF_EQ); - /* Emit the two subblocks. */ - bi_block *then_block = emit_cf_list(ctx, &nif->then_list); - bi_block *end_then_block = ctx->current_block; + /* Emit the two subblocks. */ + bi_block *then_block = emit_cf_list(ctx, &nif->then_list); + bi_block *end_then_block = ctx->current_block; - /* Emit second block */ + /* Emit second block */ - bi_block *else_block = emit_cf_list(ctx, &nif->else_list); - bi_block *end_else_block = ctx->current_block; - ctx->after_block = create_empty_block(ctx); + bi_block *else_block = emit_cf_list(ctx, &nif->else_list); + bi_block *end_else_block = ctx->current_block; + ctx->after_block = create_empty_block(ctx); - /* Now that we have the subblocks emitted, fix up the branches */ + /* Now that we have the subblocks emitted, fix up the branches */ - assert(then_block); - assert(else_block); + assert(then_block); + assert(else_block); - then_branch->branch_target = else_block; + then_branch->branch_target = else_block; - /* Emit a jump from the end of the then block to the end of the else */ - _b.cursor = bi_after_block(end_then_block); - bi_instr *then_exit = bi_jump(&_b, bi_zero()); - then_exit->branch_target = ctx->after_block; + /* Emit a jump from the end of the then block to the end of the else */ + _b.cursor = bi_after_block(end_then_block); + bi_instr *then_exit = bi_jump(&_b, bi_zero()); + then_exit->branch_target = ctx->after_block; - bi_block_add_successor(end_then_block, then_exit->branch_target); - bi_block_add_successor(end_else_block, ctx->after_block); /* fallthrough */ + bi_block_add_successor(end_then_block, then_exit->branch_target); + bi_block_add_successor(end_else_block, ctx->after_block); /* fallthrough */ - bi_block_add_successor(before_block, then_branch->branch_target); /* then_branch */ - bi_block_add_successor(before_block, then_block); /* fallthrough */ + bi_block_add_successor(before_block, + then_branch->branch_target); /* then_branch */ + bi_block_add_successor(before_block, then_block); /* fallthrough */ } static void emit_loop(bi_context *ctx, nir_loop *nloop) { - /* Remember where we are */ - bi_block *start_block = ctx->current_block; + /* Remember where we are */ + bi_block *start_block = ctx->current_block; - bi_block *saved_break = ctx->break_block; - bi_block *saved_continue = ctx->continue_block; + bi_block *saved_break = ctx->break_block; + bi_block *saved_continue = ctx->continue_block; - ctx->continue_block = 
create_empty_block(ctx); - ctx->break_block = create_empty_block(ctx); - ctx->after_block = ctx->continue_block; + ctx->continue_block = create_empty_block(ctx); + ctx->break_block = create_empty_block(ctx); + ctx->after_block = ctx->continue_block; - /* Emit the body itself */ - emit_cf_list(ctx, &nloop->body); + /* Emit the body itself */ + emit_cf_list(ctx, &nloop->body); - /* Branch back to loop back */ - bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block)); - bi_instr *I = bi_jump(&_b, bi_zero()); - I->branch_target = ctx->continue_block; - bi_block_add_successor(start_block, ctx->continue_block); - bi_block_add_successor(ctx->current_block, ctx->continue_block); + /* Branch back to loop back */ + bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block)); + bi_instr *I = bi_jump(&_b, bi_zero()); + I->branch_target = ctx->continue_block; + bi_block_add_successor(start_block, ctx->continue_block); + bi_block_add_successor(ctx->current_block, ctx->continue_block); - ctx->after_block = ctx->break_block; + ctx->after_block = ctx->break_block; - /* Pop off */ - ctx->break_block = saved_break; - ctx->continue_block = saved_continue; - ++ctx->loop_count; + /* Pop off */ + ctx->break_block = saved_break; + ctx->continue_block = saved_continue; + ++ctx->loop_count; } static bi_block * emit_cf_list(bi_context *ctx, struct exec_list *list) { - bi_block *start_block = NULL; + bi_block *start_block = NULL; - foreach_list_typed(nir_cf_node, node, node, list) { - switch (node->type) { - case nir_cf_node_block: { - bi_block *block = emit_block(ctx, nir_cf_node_as_block(node)); + foreach_list_typed(nir_cf_node, node, node, list) { + switch (node->type) { + case nir_cf_node_block: { + bi_block *block = emit_block(ctx, nir_cf_node_as_block(node)); - if (!start_block) - start_block = block; + if (!start_block) + start_block = block; - break; - } + break; + } - case nir_cf_node_if: - emit_if(ctx, nir_cf_node_as_if(node)); - break; + case nir_cf_node_if: + emit_if(ctx, nir_cf_node_as_if(node)); + break; - case nir_cf_node_loop: - emit_loop(ctx, nir_cf_node_as_loop(node)); - break; + case nir_cf_node_loop: + emit_loop(ctx, nir_cf_node_as_loop(node)); + break; - default: - unreachable("Unknown control flow"); - } - } + default: + unreachable("Unknown control flow"); + } + } - return start_block; + return start_block; } /* shader-db stuff */ struct bi_stats { - unsigned nr_clauses, nr_tuples, nr_ins; - unsigned nr_arith, nr_texture, nr_varying, nr_ldst; + unsigned nr_clauses, nr_tuples, nr_ins; + unsigned nr_arith, nr_texture, nr_varying, nr_ldst; }; static void bi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats) { - /* Count instructions */ - stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 1 : 0); + /* Count instructions */ + stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 
1 : 0); - /* Non-message passing tuples are always arithmetic */ - if (tuple->add != clause->message) { - stats->nr_arith++; - return; - } + /* Non-message passing tuples are always arithmetic */ + if (tuple->add != clause->message) { + stats->nr_arith++; + return; + } - /* Message + FMA we'll count as arithmetic _and_ message */ - if (tuple->fma) - stats->nr_arith++; + /* Message + FMA we'll count as arithmetic _and_ message */ + if (tuple->fma) + stats->nr_arith++; - switch (clause->message_type) { - case BIFROST_MESSAGE_VARYING: - /* Check components interpolated */ - stats->nr_varying += (clause->message->vecsize + 1) * - (bi_is_regfmt_16(clause->message->register_format) ? 1 : 2); - break; + switch (clause->message_type) { + case BIFROST_MESSAGE_VARYING: + /* Check components interpolated */ + stats->nr_varying += + (clause->message->vecsize + 1) * + (bi_is_regfmt_16(clause->message->register_format) ? 1 : 2); + break; - case BIFROST_MESSAGE_VARTEX: - /* 2 coordinates, fp32 each */ - stats->nr_varying += (2 * 2); - FALLTHROUGH; - case BIFROST_MESSAGE_TEX: - stats->nr_texture++; - break; + case BIFROST_MESSAGE_VARTEX: + /* 2 coordinates, fp32 each */ + stats->nr_varying += (2 * 2); + FALLTHROUGH; + case BIFROST_MESSAGE_TEX: + stats->nr_texture++; + break; - case BIFROST_MESSAGE_ATTRIBUTE: - case BIFROST_MESSAGE_LOAD: - case BIFROST_MESSAGE_STORE: - case BIFROST_MESSAGE_ATOMIC: - stats->nr_ldst++; - break; - - case BIFROST_MESSAGE_NONE: - case BIFROST_MESSAGE_BARRIER: - case BIFROST_MESSAGE_BLEND: - case BIFROST_MESSAGE_TILE: - case BIFROST_MESSAGE_Z_STENCIL: - case BIFROST_MESSAGE_ATEST: - case BIFROST_MESSAGE_JOB: - case BIFROST_MESSAGE_64BIT: - /* Nothing to do */ - break; - }; + case BIFROST_MESSAGE_ATTRIBUTE: + case BIFROST_MESSAGE_LOAD: + case BIFROST_MESSAGE_STORE: + case BIFROST_MESSAGE_ATOMIC: + stats->nr_ldst++; + break; + case BIFROST_MESSAGE_NONE: + case BIFROST_MESSAGE_BARRIER: + case BIFROST_MESSAGE_BLEND: + case BIFROST_MESSAGE_TILE: + case BIFROST_MESSAGE_Z_STENCIL: + case BIFROST_MESSAGE_ATEST: + case BIFROST_MESSAGE_JOB: + case BIFROST_MESSAGE_64BIT: + /* Nothing to do */ + break; + }; } /* @@ -4238,151 +4206,150 @@ bi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats) static unsigned bi_count_preload_cost(bi_context *ctx) { - /* Units: 1/16 of a normalized cycle, assuming that we may interpolate - * 16 fp16 varying components per cycle or fetch two texels per cycle. - */ - unsigned cost = 0; + /* Units: 1/16 of a normalized cycle, assuming that we may interpolate + * 16 fp16 varying components per cycle or fetch two texels per cycle. + */ + unsigned cost = 0; - for (unsigned i = 0; i < ARRAY_SIZE(ctx->info.bifrost->messages); ++i) { - struct bifrost_message_preload msg = ctx->info.bifrost->messages[i]; + for (unsigned i = 0; i < ARRAY_SIZE(ctx->info.bifrost->messages); ++i) { + struct bifrost_message_preload msg = ctx->info.bifrost->messages[i]; - if (msg.enabled && msg.texture) { - /* 2 coordinate, 2 half-words each, plus texture */ - cost += 12; - } else if (msg.enabled) { - cost += (msg.num_components * (msg.fp16 ? 1 : 2)); - } - } + if (msg.enabled && msg.texture) { + /* 2 coordinate, 2 half-words each, plus texture */ + cost += 12; + } else if (msg.enabled) { + cost += (msg.num_components * (msg.fp16 ? 
1 : 2)); + } + } - return cost; + return cost; } static const char * bi_shader_stage_name(bi_context *ctx) { - if (ctx->idvs == BI_IDVS_VARYING) - return "MESA_SHADER_VARYING"; - else if (ctx->idvs == BI_IDVS_POSITION) - return "MESA_SHADER_POSITION"; - else if (ctx->inputs->is_blend) - return "MESA_SHADER_BLEND"; - else - return gl_shader_stage_name(ctx->stage); + if (ctx->idvs == BI_IDVS_VARYING) + return "MESA_SHADER_VARYING"; + else if (ctx->idvs == BI_IDVS_POSITION) + return "MESA_SHADER_POSITION"; + else if (ctx->inputs->is_blend) + return "MESA_SHADER_BLEND"; + else + return gl_shader_stage_name(ctx->stage); } static char * bi_print_stats(bi_context *ctx, unsigned size) { - struct bi_stats stats = { 0 }; + struct bi_stats stats = {0}; - /* Count instructions, clauses, and tuples. Also attempt to construct - * normalized execution engine cycle counts, using the following ratio: - * - * 24 arith tuples/cycle - * 2 texture messages/cycle - * 16 x 16-bit varying channels interpolated/cycle - * 1 load store message/cycle - * - * These numbers seem to match Arm Mobile Studio's heuristic. The real - * cycle counts are surely more complicated. - */ + /* Count instructions, clauses, and tuples. Also attempt to construct + * normalized execution engine cycle counts, using the following ratio: + * + * 24 arith tuples/cycle + * 2 texture messages/cycle + * 16 x 16-bit varying channels interpolated/cycle + * 1 load store message/cycle + * + * These numbers seem to match Arm Mobile Studio's heuristic. The real + * cycle counts are surely more complicated. + */ - bi_foreach_block(ctx, block) { - bi_foreach_clause_in_block(block, clause) { - stats.nr_clauses++; - stats.nr_tuples += clause->tuple_count; + bi_foreach_block(ctx, block) { + bi_foreach_clause_in_block(block, clause) { + stats.nr_clauses++; + stats.nr_tuples += clause->tuple_count; - for (unsigned i = 0; i < clause->tuple_count; ++i) - bi_count_tuple_stats(clause, &clause->tuples[i], &stats); - } - } + for (unsigned i = 0; i < clause->tuple_count; ++i) + bi_count_tuple_stats(clause, &clause->tuples[i], &stats); + } + } - float cycles_arith = ((float) stats.nr_arith) / 24.0; - float cycles_texture = ((float) stats.nr_texture) / 2.0; - float cycles_varying = ((float) stats.nr_varying) / 16.0; - float cycles_ldst = ((float) stats.nr_ldst) / 1.0; + float cycles_arith = ((float)stats.nr_arith) / 24.0; + float cycles_texture = ((float)stats.nr_texture) / 2.0; + float cycles_varying = ((float)stats.nr_varying) / 16.0; + float cycles_ldst = ((float)stats.nr_ldst) / 1.0; - float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst); - float cycles_bound = MAX2(cycles_arith, cycles_message); + float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst); + float cycles_bound = MAX2(cycles_arith, cycles_message); - /* Thread count and register pressure are traded off only on v7 */ - bool full_threads = (ctx->arch == 7 && ctx->info.work_reg_count <= 32); - unsigned nr_threads = full_threads ? 2 : 1; + /* Thread count and register pressure are traded off only on v7 */ + bool full_threads = (ctx->arch == 7 && ctx->info.work_reg_count <= 32); + unsigned nr_threads = full_threads ? 
2 : 1; - /* Dump stats */ - char *str = ralloc_asprintf(NULL, "%s shader: " - "%u inst, %u tuples, %u clauses, " - "%f cycles, %f arith, %f texture, %f vary, %f ldst, " - "%u quadwords, %u threads", - bi_shader_stage_name(ctx), - stats.nr_ins, stats.nr_tuples, stats.nr_clauses, - cycles_bound, cycles_arith, cycles_texture, - cycles_varying, cycles_ldst, - size / 16, nr_threads); + /* Dump stats */ + char *str = ralloc_asprintf( + NULL, + "%s shader: " + "%u inst, %u tuples, %u clauses, " + "%f cycles, %f arith, %f texture, %f vary, %f ldst, " + "%u quadwords, %u threads", + bi_shader_stage_name(ctx), stats.nr_ins, stats.nr_tuples, + stats.nr_clauses, cycles_bound, cycles_arith, cycles_texture, + cycles_varying, cycles_ldst, size / 16, nr_threads); - if (ctx->arch == 7) { - ralloc_asprintf_append(&str, ", %u preloads", bi_count_preload_cost(ctx)); - } + if (ctx->arch == 7) { + ralloc_asprintf_append(&str, ", %u preloads", bi_count_preload_cost(ctx)); + } - ralloc_asprintf_append(&str, ", %u loops, %u:%u spills:fills", - ctx->loop_count, ctx->spills, ctx->fills); + ralloc_asprintf_append(&str, ", %u loops, %u:%u spills:fills", + ctx->loop_count, ctx->spills, ctx->fills); - return str; + return str; } static char * va_print_stats(bi_context *ctx, unsigned size) { - unsigned nr_ins = 0; - struct va_stats stats = { 0 }; + unsigned nr_ins = 0; + struct va_stats stats = {0}; - /* Count instructions */ - bi_foreach_instr_global(ctx, I) { - nr_ins++; - va_count_instr_stats(I, &stats); - } + /* Count instructions */ + bi_foreach_instr_global(ctx, I) { + nr_ins++; + va_count_instr_stats(I, &stats); + } - /* Mali G78 peak performance: - * - * 64 FMA instructions per cycle - * 64 CVT instructions per cycle - * 16 SFU instructions per cycle - * 8 x 32-bit varying channels interpolated per cycle - * 4 texture instructions per cycle - * 1 load/store operation per cycle - */ + /* Mali G78 peak performance: + * + * 64 FMA instructions per cycle + * 64 CVT instructions per cycle + * 16 SFU instructions per cycle + * 8 x 32-bit varying channels interpolated per cycle + * 4 texture instructions per cycle + * 1 load/store operation per cycle + */ - float cycles_fma = ((float) stats.fma) / 64.0; - float cycles_cvt = ((float) stats.cvt) / 64.0; - float cycles_sfu = ((float) stats.sfu) / 16.0; - float cycles_v = ((float) stats.v) / 16.0; - float cycles_t = ((float) stats.t) / 4.0; - float cycles_ls = ((float) stats.ls) / 1.0; + float cycles_fma = ((float)stats.fma) / 64.0; + float cycles_cvt = ((float)stats.cvt) / 64.0; + float cycles_sfu = ((float)stats.sfu) / 16.0; + float cycles_v = ((float)stats.v) / 16.0; + float cycles_t = ((float)stats.t) / 4.0; + float cycles_ls = ((float)stats.ls) / 1.0; - /* Calculate the bound */ - float cycles = MAX2( - MAX3(cycles_fma, cycles_cvt, cycles_sfu), - MAX3(cycles_v, cycles_t, cycles_ls)); + /* Calculate the bound */ + float cycles = MAX2(MAX3(cycles_fma, cycles_cvt, cycles_sfu), + MAX3(cycles_v, cycles_t, cycles_ls)); + /* Thread count and register pressure are traded off */ + unsigned nr_threads = (ctx->info.work_reg_count <= 32) ? 2 : 1; - /* Thread count and register pressure are traded off */ - unsigned nr_threads = (ctx->info.work_reg_count <= 32) ? 
2 : 1; - - /* Dump stats */ - return ralloc_asprintf(NULL, "%s shader: " - "%u inst, %f cycles, %f fma, %f cvt, %f sfu, %f v, " - "%f t, %f ls, %u quadwords, %u threads, %u loops, " - "%u:%u spills:fills", - bi_shader_stage_name(ctx), - nr_ins, cycles, cycles_fma, cycles_cvt, cycles_sfu, - cycles_v, cycles_t, cycles_ls, size / 16, nr_threads, - ctx->loop_count, ctx->spills, ctx->fills); + /* Dump stats */ + return ralloc_asprintf(NULL, + "%s shader: " + "%u inst, %f cycles, %f fma, %f cvt, %f sfu, %f v, " + "%f t, %f ls, %u quadwords, %u threads, %u loops, " + "%u:%u spills:fills", + bi_shader_stage_name(ctx), nr_ins, cycles, cycles_fma, + cycles_cvt, cycles_sfu, cycles_v, cycles_t, cycles_ls, + size / 16, nr_threads, ctx->loop_count, ctx->spills, + ctx->fills); } static int glsl_type_size(const struct glsl_type *type, bool bindless) { - return glsl_count_attribute_slots(type, false); + return glsl_count_attribute_slots(type, false); } /* Split stores to memory. We don't split stores to vertex outputs, since @@ -4392,17 +4359,17 @@ glsl_type_size(const struct glsl_type *type, bool bindless) static bool should_split_wrmask(const nir_instr *instr, UNUSED const void *data) { - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - switch (intr->intrinsic) { - case nir_intrinsic_store_ssbo: - case nir_intrinsic_store_shared: - case nir_intrinsic_store_global: - case nir_intrinsic_store_scratch: - return true; - default: - return false; - } + switch (intr->intrinsic) { + case nir_intrinsic_store_ssbo: + case nir_intrinsic_store_shared: + case nir_intrinsic_store_global: + case nir_intrinsic_store_scratch: + return true; + default: + return false; + } } /* @@ -4413,23 +4380,23 @@ should_split_wrmask(const nir_instr *instr, UNUSED const void *data) static unsigned bi_lower_bit_size(const nir_instr *instr, UNUSED void *data) { - if (instr->type != nir_instr_type_alu) - return 0; + if (instr->type != nir_instr_type_alu) + return 0; - nir_alu_instr *alu = nir_instr_as_alu(instr); + nir_alu_instr *alu = nir_instr_as_alu(instr); - switch (alu->op) { - case nir_op_fexp2: - case nir_op_flog2: - case nir_op_fpow: - case nir_op_fsin: - case nir_op_fcos: - case nir_op_bit_count: - case nir_op_bitfield_reverse: - return (nir_src_bit_size(alu->src[0].src) == 32) ? 0 : 32; - default: - return 0; - } + switch (alu->op) { + case nir_op_fexp2: + case nir_op_flog2: + case nir_op_fpow: + case nir_op_fsin: + case nir_op_fcos: + case nir_op_bit_count: + case nir_op_bitfield_reverse: + return (nir_src_bit_size(alu->src[0].src) == 32) ? 
0 : 32; + default: + return 0; + } } /* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4, @@ -4440,64 +4407,64 @@ bi_lower_bit_size(const nir_instr *instr, UNUSED void *data) static uint8_t bi_vectorize_filter(const nir_instr *instr, const void *data) { - /* Defaults work for everything else */ - if (instr->type != nir_instr_type_alu) - return 0; + /* Defaults work for everything else */ + if (instr->type != nir_instr_type_alu) + return 0; - const nir_alu_instr *alu = nir_instr_as_alu(instr); + const nir_alu_instr *alu = nir_instr_as_alu(instr); - switch (alu->op) { - case nir_op_frcp: - case nir_op_frsq: - case nir_op_ishl: - case nir_op_ishr: - case nir_op_ushr: - case nir_op_f2i16: - case nir_op_f2u16: - case nir_op_extract_u8: - case nir_op_extract_i8: - case nir_op_extract_u16: - case nir_op_extract_i16: - case nir_op_insert_u16: - return 1; - default: - break; - } + switch (alu->op) { + case nir_op_frcp: + case nir_op_frsq: + case nir_op_ishl: + case nir_op_ishr: + case nir_op_ushr: + case nir_op_f2i16: + case nir_op_f2u16: + case nir_op_extract_u8: + case nir_op_extract_i8: + case nir_op_extract_u16: + case nir_op_extract_i16: + case nir_op_insert_u16: + return 1; + default: + break; + } - /* Vectorized instructions cannot write more than 32-bit */ - int dst_bit_size = nir_dest_bit_size(alu->dest.dest); - if (dst_bit_size == 16) - return 2; - else - return 1; + /* Vectorized instructions cannot write more than 32-bit */ + int dst_bit_size = nir_dest_bit_size(alu->dest.dest); + if (dst_bit_size == 16) + return 2; + else + return 1; } static bool bi_scalarize_filter(const nir_instr *instr, const void *data) { - if (instr->type != nir_instr_type_alu) - return false; + if (instr->type != nir_instr_type_alu) + return false; - const nir_alu_instr *alu = nir_instr_as_alu(instr); + const nir_alu_instr *alu = nir_instr_as_alu(instr); - switch (alu->op) { - case nir_op_pack_uvec2_to_uint: - case nir_op_pack_uvec4_to_uint: - return false; - default: - return true; - } + switch (alu->op) { + case nir_op_pack_uvec2_to_uint: + case nir_op_pack_uvec4_to_uint: + return false; + default: + return true; + } } /* Ensure we write exactly 4 components */ static nir_ssa_def * -bifrost_nir_valid_channel(nir_builder *b, nir_ssa_def *in, - unsigned channel, unsigned first, unsigned mask) +bifrost_nir_valid_channel(nir_builder *b, nir_ssa_def *in, unsigned channel, + unsigned first, unsigned mask) { - if (!(mask & BITFIELD_BIT(channel))) - channel = first; + if (!(mask & BITFIELD_BIT(channel))) + channel = first; - return nir_channel(b, in, channel); + return nir_channel(b, in, channel); } /* Lower fragment store_output instructions to always write 4 components, @@ -4506,202 +4473,197 @@ bifrost_nir_valid_channel(nir_builder *b, nir_ssa_def *in, * compiler. The DDK inserts these moves, so we will as well. 
*/ static bool -bifrost_nir_lower_blend_components(struct nir_builder *b, - nir_instr *instr, void *data) +bifrost_nir_lower_blend_components(struct nir_builder *b, nir_instr *instr, + void *data) { - if (instr->type != nir_instr_type_intrinsic) - return false; + if (instr->type != nir_instr_type_intrinsic) + return false; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_store_output) - return false; + if (intr->intrinsic != nir_intrinsic_store_output) + return false; - nir_ssa_def *in = intr->src[0].ssa; - unsigned first = nir_intrinsic_component(intr); - unsigned mask = nir_intrinsic_write_mask(intr); + nir_ssa_def *in = intr->src[0].ssa; + unsigned first = nir_intrinsic_component(intr); + unsigned mask = nir_intrinsic_write_mask(intr); - assert(first == 0 && "shouldn't get nonzero components"); + assert(first == 0 && "shouldn't get nonzero components"); - /* Nothing to do */ - if (mask == BITFIELD_MASK(4)) - return false; + /* Nothing to do */ + if (mask == BITFIELD_MASK(4)) + return false; - b->cursor = nir_before_instr(&intr->instr); + b->cursor = nir_before_instr(&intr->instr); - /* Replicate the first valid component instead */ - nir_ssa_def *replicated = - nir_vec4(b, bifrost_nir_valid_channel(b, in, 0, first, mask), - bifrost_nir_valid_channel(b, in, 1, first, mask), - bifrost_nir_valid_channel(b, in, 2, first, mask), - bifrost_nir_valid_channel(b, in, 3, first, mask)); + /* Replicate the first valid component instead */ + nir_ssa_def *replicated = + nir_vec4(b, bifrost_nir_valid_channel(b, in, 0, first, mask), + bifrost_nir_valid_channel(b, in, 1, first, mask), + bifrost_nir_valid_channel(b, in, 2, first, mask), + bifrost_nir_valid_channel(b, in, 3, first, mask)); - /* Rewrite to use our replicated version */ - nir_instr_rewrite_src_ssa(instr, &intr->src[0], replicated); - nir_intrinsic_set_component(intr, 0); - nir_intrinsic_set_write_mask(intr, 0xF); - intr->num_components = 4; + /* Rewrite to use our replicated version */ + nir_instr_rewrite_src_ssa(instr, &intr->src[0], replicated); + nir_intrinsic_set_component(intr, 0); + nir_intrinsic_set_write_mask(intr, 0xF); + intr->num_components = 4; - return true; + return true; } static void bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend) { - bool progress; - unsigned lower_flrp = 16 | 32 | 64; + bool progress; + unsigned lower_flrp = 16 | 32 | 64; - NIR_PASS(progress, nir, nir_lower_regs_to_ssa); + NIR_PASS(progress, nir, nir_lower_regs_to_ssa); - nir_lower_tex_options lower_tex_options = { - .lower_txs_lod = true, - .lower_txp = ~0, - .lower_tg4_broadcom_swizzle = true, - .lower_txd = true, - .lower_invalid_implicit_lod = true, - }; + nir_lower_tex_options lower_tex_options = { + .lower_txs_lod = true, + .lower_txp = ~0, + .lower_tg4_broadcom_swizzle = true, + .lower_txd = true, + .lower_invalid_implicit_lod = true, + }; - NIR_PASS(progress, nir, pan_nir_lower_64bit_intrin); - NIR_PASS(progress, nir, pan_lower_helper_invocation); + NIR_PASS(progress, nir, pan_nir_lower_64bit_intrin); + NIR_PASS(progress, nir, pan_lower_helper_invocation); - NIR_PASS(progress, nir, nir_lower_int64); + NIR_PASS(progress, nir, nir_lower_int64); - nir_lower_idiv_options idiv_options = { - .allow_fp16 = true, - }; - NIR_PASS(progress, nir, nir_opt_idiv_const, 8); - NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options); + nir_lower_idiv_options idiv_options = { + .allow_fp16 = true, + }; + NIR_PASS(progress, nir, 
nir_opt_idiv_const, 8); + NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options); - NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options); - NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL); - NIR_PASS(progress, nir, nir_lower_load_const_to_scalar); - NIR_PASS(progress, nir, nir_lower_phis_to_scalar, true); + NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options); + NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL); + NIR_PASS(progress, nir, nir_lower_load_const_to_scalar); + NIR_PASS(progress, nir, nir_lower_phis_to_scalar, true); - do { - progress = false; + do { + progress = false; - NIR_PASS(progress, nir, nir_lower_var_copies); - NIR_PASS(progress, nir, nir_lower_vars_to_ssa); - NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL); + NIR_PASS(progress, nir, nir_lower_var_copies); + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); + NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL); - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_remove_phis); - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_opt_dead_cf); - NIR_PASS(progress, nir, nir_opt_cse); - NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); - NIR_PASS(progress, nir, nir_opt_algebraic); - NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_remove_phis); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_dead_cf); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_opt_constant_folding); - NIR_PASS(progress, nir, nir_lower_alu); + NIR_PASS(progress, nir, nir_lower_alu); - if (lower_flrp != 0) { - bool lower_flrp_progress = false; - NIR_PASS(lower_flrp_progress, - nir, - nir_lower_flrp, - lower_flrp, - false /* always_precise */); - if (lower_flrp_progress) { - NIR_PASS(progress, nir, - nir_opt_constant_folding); - progress = true; - } + if (lower_flrp != 0) { + bool lower_flrp_progress = false; + NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, lower_flrp, + false /* always_precise */); + if (lower_flrp_progress) { + NIR_PASS(progress, nir, nir_opt_constant_folding); + progress = true; + } - /* Nothing should rematerialize any flrps, so we only - * need to do this lowering once. - */ - lower_flrp = 0; - } + /* Nothing should rematerialize any flrps, so we only + * need to do this lowering once. + */ + lower_flrp = 0; + } - NIR_PASS(progress, nir, nir_opt_undef); - NIR_PASS(progress, nir, nir_lower_undef_to_zero); + NIR_PASS(progress, nir, nir_opt_undef); + NIR_PASS(progress, nir, nir_lower_undef_to_zero); - NIR_PASS(progress, nir, nir_opt_shrink_vectors); - NIR_PASS(progress, nir, nir_opt_loop_unroll); - } while (progress); + NIR_PASS(progress, nir, nir_opt_shrink_vectors); + NIR_PASS(progress, nir, nir_opt_loop_unroll); + } while (progress); - /* TODO: Why is 64-bit getting rematerialized? - * KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */ - NIR_PASS(progress, nir, nir_lower_int64); + /* TODO: Why is 64-bit getting rematerialized? 
+ * KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */ + NIR_PASS(progress, nir, nir_lower_int64); - /* We need to cleanup after each iteration of late algebraic - * optimizations, since otherwise NIR can produce weird edge cases - * (like fneg of a constant) which we don't handle */ - bool late_algebraic = true; - while (late_algebraic) { - late_algebraic = false; - NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late); - NIR_PASS(progress, nir, nir_opt_constant_folding); - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_opt_cse); - } + /* We need to cleanup after each iteration of late algebraic + * optimizations, since otherwise NIR can produce weird edge cases + * (like fneg of a constant) which we don't handle */ + bool late_algebraic = true; + while (late_algebraic) { + late_algebraic = false; + NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_cse); + } - /* This opt currently helps on Bifrost but not Valhall */ - if (gpu_id < 0x9000) - NIR_PASS(progress, nir, bifrost_nir_opt_boolean_bitwise); + /* This opt currently helps on Bifrost but not Valhall */ + if (gpu_id < 0x9000) + NIR_PASS(progress, nir, bifrost_nir_opt_boolean_bitwise); - NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL); - NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL); - NIR_PASS(progress, nir, nir_lower_bool_to_bitsize); + NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL); + NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL); + NIR_PASS(progress, nir, nir_lower_bool_to_bitsize); - /* Prepass to simplify instruction selection */ - late_algebraic = false; - NIR_PASS(late_algebraic, nir, bifrost_nir_lower_algebraic_late); + /* Prepass to simplify instruction selection */ + late_algebraic = false; + NIR_PASS(late_algebraic, nir, bifrost_nir_lower_algebraic_late); - while (late_algebraic) { - late_algebraic = false; - NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late); - NIR_PASS(progress, nir, nir_opt_constant_folding); - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_opt_cse); - } + while (late_algebraic) { + late_algebraic = false; + NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_cse); + } - NIR_PASS(progress, nir, nir_lower_load_const_to_scalar); - NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_lower_load_const_to_scalar); + NIR_PASS(progress, nir, nir_opt_dce); - if (nir->info.stage == MESA_SHADER_FRAGMENT) { - NIR_PASS_V(nir, nir_shader_instructions_pass, - bifrost_nir_lower_blend_components, - nir_metadata_block_index | nir_metadata_dominance, - NULL); - } + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS_V(nir, nir_shader_instructions_pass, + bifrost_nir_lower_blend_components, + nir_metadata_block_index | nir_metadata_dominance, NULL); + } - /* Backend scheduler is purely local, so do some global optimizations - * to reduce register pressure. 
*/ - nir_move_options move_all = - nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | - nir_move_comparisons | nir_move_copies | nir_move_load_ssbo; + /* Backend scheduler is purely local, so do some global optimizations + * to reduce register pressure. */ + nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo | + nir_move_load_input | nir_move_comparisons | + nir_move_copies | nir_move_load_ssbo; - NIR_PASS_V(nir, nir_opt_sink, move_all); - NIR_PASS_V(nir, nir_opt_move, move_all); + NIR_PASS_V(nir, nir_opt_sink, move_all); + NIR_PASS_V(nir, nir_opt_move, move_all); - /* We might lower attribute, varying, and image indirects. Use the - * gathered info to skip the extra analysis in the happy path. */ - bool any_indirects = - nir->info.inputs_read_indirectly || - nir->info.outputs_accessed_indirectly || - nir->info.patch_inputs_read_indirectly || - nir->info.patch_outputs_accessed_indirectly || - nir->info.images_used[0]; + /* We might lower attribute, varying, and image indirects. Use the + * gathered info to skip the extra analysis in the happy path. */ + bool any_indirects = nir->info.inputs_read_indirectly || + nir->info.outputs_accessed_indirectly || + nir->info.patch_inputs_read_indirectly || + nir->info.patch_outputs_accessed_indirectly || + nir->info.images_used[0]; - if (any_indirects) { - nir_convert_to_lcssa(nir, true, true); - NIR_PASS_V(nir, nir_divergence_analysis); - NIR_PASS_V(nir, bi_lower_divergent_indirects, - pan_subgroup_size(gpu_id >> 12)); - } + if (any_indirects) { + nir_convert_to_lcssa(nir, true, true); + NIR_PASS_V(nir, nir_divergence_analysis); + NIR_PASS_V(nir, bi_lower_divergent_indirects, + pan_subgroup_size(gpu_id >> 12)); + } } static void bi_opt_post_ra(bi_context *ctx) { - bi_foreach_instr_global_safe(ctx, ins) { - if (ins->op == BI_OPCODE_MOV_I32 && bi_is_equiv(ins->dest[0], ins->src[0])) - bi_remove_instruction(ins); - } + bi_foreach_instr_global_safe(ctx, ins) { + if (ins->op == BI_OPCODE_MOV_I32 && + bi_is_equiv(ins->dest[0], ins->src[0])) + bi_remove_instruction(ins); + } } /* Dead code elimination for branches at the end of a block - only one branch @@ -4714,52 +4676,53 @@ bi_opt_post_ra(bi_context *ctx) static void bi_lower_branch(bi_context *ctx, bi_block *block) { - bool cull_terminal = (ctx->arch <= 8); - bool branched = false; + bool cull_terminal = (ctx->arch <= 8); + bool branched = false; - bi_foreach_instr_in_block_safe(block, ins) { - if (!ins->branch_target) continue; + bi_foreach_instr_in_block_safe(block, ins) { + if (!ins->branch_target) + continue; - if (branched) { - bi_remove_instruction(ins); - continue; - } + if (branched) { + bi_remove_instruction(ins); + continue; + } - branched = true; + branched = true; - if (!bi_is_terminal_block(ins->branch_target)) - continue; + if (!bi_is_terminal_block(ins->branch_target)) + continue; - if (cull_terminal) - ins->branch_target = NULL; - else if (ins->branch_target) - ins->branch_target->needs_nop = true; - } + if (cull_terminal) + ins->branch_target = NULL; + else if (ins->branch_target) + ins->branch_target->needs_nop = true; + } } static void bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary, unsigned offset) { - unsigned final_clause = bi_pack(ctx, binary); + unsigned final_clause = bi_pack(ctx, binary); - /* If we need to wait for ATEST or BLEND in the first clause, pass the - * corresponding bits through to the renderer state descriptor */ - bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link); - bi_clause *first_clause = 
bi_next_clause(ctx, first_block, NULL); + /* If we need to wait for ATEST or BLEND in the first clause, pass the + * corresponding bits through to the renderer state descriptor */ + bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link); + bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL); - unsigned first_deps = first_clause ? first_clause->dependencies : 0; - ctx->info.bifrost->wait_6 = (first_deps & (1 << 6)); - ctx->info.bifrost->wait_7 = (first_deps & (1 << 7)); + unsigned first_deps = first_clause ? first_clause->dependencies : 0; + ctx->info.bifrost->wait_6 = (first_deps & (1 << 6)); + ctx->info.bifrost->wait_7 = (first_deps & (1 << 7)); - /* Pad the shader with enough zero bytes to trick the prefetcher, - * unless we're compiling an empty shader (in which case we don't pad - * so the size remains 0) */ - unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause; + /* Pad the shader with enough zero bytes to trick the prefetcher, + * unless we're compiling an empty shader (in which case we don't pad + * so the size remains 0) */ + unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause; - if (binary->size - offset) { - memset(util_dynarray_grow(binary, uint8_t, prefetch_size), - 0, prefetch_size); - } + if (binary->size - offset) { + memset(util_dynarray_grow(binary, uint8_t, prefetch_size), 0, + prefetch_size); + } } /* @@ -4780,133 +4743,132 @@ bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary, unsigned offset) static bool bi_gather_texcoords(nir_builder *b, nir_instr *instr, void *data) { - uint64_t *mask = data; + uint64_t *mask = data; - if (instr->type != nir_instr_type_tex) - return false; + if (instr->type != nir_instr_type_tex) + return false; - nir_tex_instr *tex = nir_instr_as_tex(instr); + nir_tex_instr *tex = nir_instr_as_tex(instr); - int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord); - if (coord_idx < 0) - return false; + int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord); + if (coord_idx < 0) + return false; - nir_src src = tex->src[coord_idx].src; - nir_ssa_scalar x = nir_ssa_scalar_resolved(src.ssa, 0); - nir_ssa_scalar y = nir_ssa_scalar_resolved(src.ssa, 1); + nir_src src = tex->src[coord_idx].src; + nir_ssa_scalar x = nir_ssa_scalar_resolved(src.ssa, 0); + nir_ssa_scalar y = nir_ssa_scalar_resolved(src.ssa, 1); - if (x.def != y.def) - return false; + if (x.def != y.def) + return false; - nir_instr *parent = x.def->parent_instr; + nir_instr *parent = x.def->parent_instr; - if (parent->type != nir_instr_type_intrinsic) - return false; + if (parent->type != nir_instr_type_intrinsic) + return false; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent); + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent); - if (intr->intrinsic != nir_intrinsic_load_interpolated_input) - return false; + if (intr->intrinsic != nir_intrinsic_load_interpolated_input) + return false; - nir_io_semantics sem = nir_intrinsic_io_semantics(intr); - *mask |= BITFIELD64_BIT(sem.location); - return false; + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + *mask |= BITFIELD64_BIT(sem.location); + return false; } static uint64_t bi_fp32_varying_mask(nir_shader *nir) { - uint64_t mask = 0; + uint64_t mask = 0; - assert(nir->info.stage == MESA_SHADER_FRAGMENT); + assert(nir->info.stage == MESA_SHADER_FRAGMENT); - nir_foreach_shader_in_variable(var, nir) { - if (var->data.interpolation == INTERP_MODE_FLAT) - mask |= BITFIELD64_BIT(var->data.location); - } + nir_foreach_shader_in_variable(var, 
nir) { + if (var->data.interpolation == INTERP_MODE_FLAT) + mask |= BITFIELD64_BIT(var->data.location); + } - nir_shader_instructions_pass(nir, bi_gather_texcoords, nir_metadata_all, &mask); + nir_shader_instructions_pass(nir, bi_gather_texcoords, nir_metadata_all, + &mask); - return mask; + return mask; } static void bi_finalize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend) { - /* Lower gl_Position pre-optimisation, but after lowering vars to ssa - * (so we don't accidentally duplicate the epilogue since mesa/st has - * messed with our I/O quite a bit already) */ + /* Lower gl_Position pre-optimisation, but after lowering vars to ssa + * (so we don't accidentally duplicate the epilogue since mesa/st has + * messed with our I/O quite a bit already) */ - NIR_PASS_V(nir, nir_lower_vars_to_ssa); + NIR_PASS_V(nir, nir_lower_vars_to_ssa); - if (nir->info.stage == MESA_SHADER_VERTEX) { - NIR_PASS_V(nir, nir_lower_viewport_transform); - NIR_PASS_V(nir, nir_lower_point_size, 1.0, 0.0); + if (nir->info.stage == MESA_SHADER_VERTEX) { + NIR_PASS_V(nir, nir_lower_viewport_transform); + NIR_PASS_V(nir, nir_lower_point_size, 1.0, 0.0); - nir_variable *psiz = nir_find_variable_with_location(nir, - nir_var_shader_out, - VARYING_SLOT_PSIZ); - if (psiz != NULL) - psiz->data.precision = GLSL_PRECISION_MEDIUM; - } + nir_variable *psiz = nir_find_variable_with_location( + nir, nir_var_shader_out, VARYING_SLOT_PSIZ); + if (psiz != NULL) + psiz->data.precision = GLSL_PRECISION_MEDIUM; + } - /* Get rid of any global vars before we lower to scratch. */ - NIR_PASS_V(nir, nir_lower_global_vars_to_local); + /* Get rid of any global vars before we lower to scratch. */ + NIR_PASS_V(nir, nir_lower_global_vars_to_local); - /* Valhall introduces packed thread local storage, which improves cache - * locality of TLS access. However, access to packed TLS cannot - * straddle 16-byte boundaries. As such, when packed TLS is in use - * (currently unconditional for Valhall), we force vec4 alignment for - * scratch access. - */ - bool packed_tls = (gpu_id >= 0x9000); + /* Valhall introduces packed thread local storage, which improves cache + * locality of TLS access. However, access to packed TLS cannot + * straddle 16-byte boundaries. As such, when packed TLS is in use + * (currently unconditional for Valhall), we force vec4 alignment for + * scratch access. + */ + bool packed_tls = (gpu_id >= 0x9000); - /* Lower large arrays to scratch and small arrays to bcsel */ - NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256, - packed_tls ? - glsl_get_vec4_size_align_bytes : - glsl_get_natural_size_align_bytes); - NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0); + /* Lower large arrays to scratch and small arrays to bcsel */ + NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256, + packed_tls ? 
glsl_get_vec4_size_align_bytes + : glsl_get_natural_size_align_bytes); + NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0); - NIR_PASS_V(nir, nir_split_var_copies); - NIR_PASS_V(nir, nir_lower_var_copies); - NIR_PASS_V(nir, nir_lower_vars_to_ssa); - NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, - glsl_type_size, 0); + NIR_PASS_V(nir, nir_split_var_copies); + NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS_V(nir, nir_lower_vars_to_ssa); + NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + glsl_type_size, 0); - /* nir_lower[_explicit]_io is lazy and emits mul+add chains even for - * offsets it could figure out are constant. Do some constant folding - * before bifrost_nir_lower_store_component below. - */ - NIR_PASS_V(nir, nir_opt_constant_folding); + /* nir_lower[_explicit]_io is lazy and emits mul+add chains even for + * offsets it could figure out are constant. Do some constant folding + * before bifrost_nir_lower_store_component below. + */ + NIR_PASS_V(nir, nir_opt_constant_folding); - if (nir->info.stage == MESA_SHADER_FRAGMENT) { - NIR_PASS_V(nir, nir_lower_mediump_io, - nir_var_shader_in | nir_var_shader_out, - ~bi_fp32_varying_mask(nir), false); - } else if (nir->info.stage == MESA_SHADER_VERTEX) { - if (gpu_id >= 0x9000) { - NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out, - BITFIELD64_BIT(VARYING_SLOT_PSIZ), false); - } + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS_V(nir, nir_lower_mediump_io, + nir_var_shader_in | nir_var_shader_out, + ~bi_fp32_varying_mask(nir), false); + } else if (nir->info.stage == MESA_SHADER_VERTEX) { + if (gpu_id >= 0x9000) { + NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out, + BITFIELD64_BIT(VARYING_SLOT_PSIZ), false); + } - NIR_PASS_V(nir, pan_nir_lower_store_component); - } + NIR_PASS_V(nir, pan_nir_lower_store_component); + } - NIR_PASS_V(nir, nir_lower_ssbo); - NIR_PASS_V(nir, pan_nir_lower_zs_store); - NIR_PASS_V(nir, pan_lower_sample_pos); - NIR_PASS_V(nir, nir_lower_bit_size, bi_lower_bit_size, NULL); - NIR_PASS_V(nir, nir_lower_64bit_phis); + NIR_PASS_V(nir, nir_lower_ssbo); + NIR_PASS_V(nir, pan_nir_lower_zs_store); + NIR_PASS_V(nir, pan_lower_sample_pos); + NIR_PASS_V(nir, nir_lower_bit_size, bi_lower_bit_size, NULL); + NIR_PASS_V(nir, nir_lower_64bit_phis); - if (nir->xfb_info != NULL && nir->info.has_transform_feedback_varyings) { - NIR_PASS_V(nir, nir_io_add_const_offset_to_base, - nir_var_shader_in | nir_var_shader_out); - NIR_PASS_V(nir, nir_io_add_intrinsic_xfb_info); - NIR_PASS_V(nir, pan_lower_xfb); - } + if (nir->xfb_info != NULL && nir->info.has_transform_feedback_varyings) { + NIR_PASS_V(nir, nir_io_add_const_offset_to_base, + nir_var_shader_in | nir_var_shader_out); + NIR_PASS_V(nir, nir_io_add_intrinsic_xfb_info); + NIR_PASS_V(nir, pan_lower_xfb); + } - bi_optimize_nir(nir, gpu_id, is_blend); + bi_optimize_nir(nir, gpu_id, is_blend); } static bi_context * @@ -4914,271 +4876,266 @@ bi_compile_variant_nir(nir_shader *nir, const struct panfrost_compile_inputs *inputs, struct util_dynarray *binary, struct hash_table_u64 *sysval_to_id, - struct bi_shader_info info, - enum bi_idvs_mode idvs) + struct bi_shader_info info, enum bi_idvs_mode idvs) { - bi_context *ctx = rzalloc(NULL, bi_context); + bi_context *ctx = rzalloc(NULL, bi_context); - /* There may be another program in the dynarray, start at the end */ - unsigned offset = binary->size; + /* There may be another program in the dynarray, start at the end */ + unsigned offset = 
binary->size; - ctx->sysval_to_id = sysval_to_id; - ctx->inputs = inputs; - ctx->nir = nir; - ctx->stage = nir->info.stage; - ctx->quirks = bifrost_get_quirks(inputs->gpu_id); - ctx->arch = inputs->gpu_id >> 12; - ctx->info = info; - ctx->idvs = idvs; - ctx->malloc_idvs = (ctx->arch >= 9) && !inputs->no_idvs; + ctx->sysval_to_id = sysval_to_id; + ctx->inputs = inputs; + ctx->nir = nir; + ctx->stage = nir->info.stage; + ctx->quirks = bifrost_get_quirks(inputs->gpu_id); + ctx->arch = inputs->gpu_id >> 12; + ctx->info = info; + ctx->idvs = idvs; + ctx->malloc_idvs = (ctx->arch >= 9) && !inputs->no_idvs; - if (idvs != BI_IDVS_NONE) { - /* Specializing shaders for IDVS is destructive, so we need to - * clone. However, the last (second) IDVS shader does not need - * to be preserved so we can skip cloning that one. - */ - if (offset == 0) - ctx->nir = nir = nir_shader_clone(ctx, nir); + if (idvs != BI_IDVS_NONE) { + /* Specializing shaders for IDVS is destructive, so we need to + * clone. However, the last (second) IDVS shader does not need + * to be preserved so we can skip cloning that one. + */ + if (offset == 0) + ctx->nir = nir = nir_shader_clone(ctx, nir); - NIR_PASS_V(nir, nir_shader_instructions_pass, - bifrost_nir_specialize_idvs, - nir_metadata_block_index | nir_metadata_dominance, - &idvs); + NIR_PASS_V(nir, nir_shader_instructions_pass, bifrost_nir_specialize_idvs, + nir_metadata_block_index | nir_metadata_dominance, &idvs); - /* After specializing, clean up the mess */ - bool progress = true; + /* After specializing, clean up the mess */ + bool progress = true; - while (progress) { - progress = false; + while (progress) { + progress = false; - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_opt_dead_cf); - } - } + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_dead_cf); + } + } - /* If nothing is pushed, all UBOs need to be uploaded */ - ctx->ubo_mask = ~0; + /* If nothing is pushed, all UBOs need to be uploaded */ + ctx->ubo_mask = ~0; - list_inithead(&ctx->blocks); + list_inithead(&ctx->blocks); - bool skip_internal = nir->info.internal; - skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL); + bool skip_internal = nir->info.internal; + skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL); - if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) { - nir_print_shader(nir, stdout); - } + if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) { + nir_print_shader(nir, stdout); + } - ctx->allocated_vec = _mesa_hash_table_u64_create(ctx); + ctx->allocated_vec = _mesa_hash_table_u64_create(ctx); - nir_foreach_function(func, nir) { - if (!func->impl) - continue; + nir_foreach_function(func, nir) { + if (!func->impl) + continue; - nir_index_blocks(func->impl); + nir_index_blocks(func->impl); - ctx->indexed_nir_blocks = - rzalloc_array(ctx, bi_block *, func->impl->num_blocks); + ctx->indexed_nir_blocks = + rzalloc_array(ctx, bi_block *, func->impl->num_blocks); - ctx->ssa_alloc += func->impl->ssa_alloc; - ctx->reg_alloc += func->impl->reg_alloc; + ctx->ssa_alloc += func->impl->ssa_alloc; + ctx->reg_alloc += func->impl->reg_alloc; - emit_cf_list(ctx, &func->impl->body); - bi_emit_phis_deferred(ctx); - break; /* TODO: Multi-function shaders */ - } + emit_cf_list(ctx, &func->impl->body); + bi_emit_phis_deferred(ctx); + break; /* TODO: Multi-function shaders */ + } - /* Index blocks now that we're done emitting */ - bi_foreach_block(ctx, block) { - block->index = ctx->num_blocks++; - } + /* Index blocks now that we're done emitting */ 
+ bi_foreach_block(ctx, block) { + block->index = ctx->num_blocks++; + } - bi_validate(ctx, "NIR -> BIR"); + bi_validate(ctx, "NIR -> BIR"); - /* If the shader doesn't write any colour or depth outputs, it may - * still need an ATEST at the very end! */ - bool need_dummy_atest = - (ctx->stage == MESA_SHADER_FRAGMENT) && - !ctx->emitted_atest && - !bi_skip_atest(ctx, false); + /* If the shader doesn't write any colour or depth outputs, it may + * still need an ATEST at the very end! */ + bool need_dummy_atest = (ctx->stage == MESA_SHADER_FRAGMENT) && + !ctx->emitted_atest && !bi_skip_atest(ctx, false); - if (need_dummy_atest) { - bi_block *end = list_last_entry(&ctx->blocks, bi_block, link); - bi_builder b = bi_init_builder(ctx, bi_after_block(end)); - bi_emit_atest(&b, bi_zero()); - } + if (need_dummy_atest) { + bi_block *end = list_last_entry(&ctx->blocks, bi_block, link); + bi_builder b = bi_init_builder(ctx, bi_after_block(end)); + bi_emit_atest(&b, bi_zero()); + } - bool optimize = !(bifrost_debug & BIFROST_DBG_NOOPT); + bool optimize = !(bifrost_debug & BIFROST_DBG_NOOPT); - /* Runs before constant folding */ - bi_lower_swizzle(ctx); - bi_validate(ctx, "Early lowering"); + /* Runs before constant folding */ + bi_lower_swizzle(ctx); + bi_validate(ctx, "Early lowering"); - /* Runs before copy prop */ - if (optimize && !ctx->inputs->no_ubo_to_push) { - bi_opt_push_ubo(ctx); - } + /* Runs before copy prop */ + if (optimize && !ctx->inputs->no_ubo_to_push) { + bi_opt_push_ubo(ctx); + } - if (likely(optimize)) { - bi_opt_copy_prop(ctx); + if (likely(optimize)) { + bi_opt_copy_prop(ctx); - while (bi_opt_constant_fold(ctx)) - bi_opt_copy_prop(ctx); + while (bi_opt_constant_fold(ctx)) + bi_opt_copy_prop(ctx); - bi_opt_mod_prop_forward(ctx); - bi_opt_mod_prop_backward(ctx); + bi_opt_mod_prop_forward(ctx); + bi_opt_mod_prop_backward(ctx); - /* Push LD_VAR_IMM/VAR_TEX instructions. Must run after - * mod_prop_backward to fuse VAR_TEX */ - if (ctx->arch == 7 && ctx->stage == MESA_SHADER_FRAGMENT && - !(bifrost_debug & BIFROST_DBG_NOPRELOAD)) { - bi_opt_dead_code_eliminate(ctx); - bi_opt_message_preload(ctx); - bi_opt_copy_prop(ctx); - } + /* Push LD_VAR_IMM/VAR_TEX instructions. 
Must run after + * mod_prop_backward to fuse VAR_TEX */ + if (ctx->arch == 7 && ctx->stage == MESA_SHADER_FRAGMENT && + !(bifrost_debug & BIFROST_DBG_NOPRELOAD)) { + bi_opt_dead_code_eliminate(ctx); + bi_opt_message_preload(ctx); + bi_opt_copy_prop(ctx); + } - bi_opt_dead_code_eliminate(ctx); - bi_opt_cse(ctx); - bi_opt_dead_code_eliminate(ctx); - if (!ctx->inputs->no_ubo_to_push) - bi_opt_reorder_push(ctx); - bi_validate(ctx, "Optimization passes"); - } + bi_opt_dead_code_eliminate(ctx); + bi_opt_cse(ctx); + bi_opt_dead_code_eliminate(ctx); + if (!ctx->inputs->no_ubo_to_push) + bi_opt_reorder_push(ctx); + bi_validate(ctx, "Optimization passes"); + } - bi_lower_opt_instructions(ctx); + bi_lower_opt_instructions(ctx); - if (ctx->arch >= 9) { - va_optimize(ctx); - va_lower_isel(ctx); + if (ctx->arch >= 9) { + va_optimize(ctx); + va_lower_isel(ctx); - bi_foreach_instr_global_safe(ctx, I) { - /* Phis become single moves so shouldn't be affected */ - if (I->op == BI_OPCODE_PHI) - continue; + bi_foreach_instr_global_safe(ctx, I) { + /* Phis become single moves so shouldn't be affected */ + if (I->op == BI_OPCODE_PHI) + continue; - va_lower_constants(ctx, I); + va_lower_constants(ctx, I); - bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); - va_repair_fau(&b, I); - } + bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); + va_repair_fau(&b, I); + } - /* We need to clean up after constant lowering */ - if (likely(optimize)) { - bi_opt_cse(ctx); - bi_opt_dead_code_eliminate(ctx); - } + /* We need to clean up after constant lowering */ + if (likely(optimize)) { + bi_opt_cse(ctx); + bi_opt_dead_code_eliminate(ctx); + } - bi_validate(ctx, "Valhall passes"); - } + bi_validate(ctx, "Valhall passes"); + } - bi_foreach_block(ctx, block) { - bi_lower_branch(ctx, block); - } + bi_foreach_block(ctx, block) { + bi_lower_branch(ctx, block); + } - if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) - bi_print_shader(ctx, stdout); + if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) + bi_print_shader(ctx, stdout); - /* Analyze before register allocation to avoid false dependencies. The - * skip bit is a function of only the data flow graph and is invariant - * under valid scheduling. Helpers are only defined for fragment - * shaders, so this analysis is only required in fragment shaders. - */ - if (ctx->stage == MESA_SHADER_FRAGMENT) - bi_analyze_helper_requirements(ctx); + /* Analyze before register allocation to avoid false dependencies. The + * skip bit is a function of only the data flow graph and is invariant + * under valid scheduling. Helpers are only defined for fragment + * shaders, so this analysis is only required in fragment shaders. + */ + if (ctx->stage == MESA_SHADER_FRAGMENT) + bi_analyze_helper_requirements(ctx); - /* Fuse TEXC after analyzing helper requirements so the analysis - * doesn't have to know about dual textures */ - if (likely(optimize)) { - bi_opt_fuse_dual_texture(ctx); - } + /* Fuse TEXC after analyzing helper requirements so the analysis + * doesn't have to know about dual textures */ + if (likely(optimize)) { + bi_opt_fuse_dual_texture(ctx); + } - /* Lower FAU after fusing dual texture, because fusing dual texture - * creates new immediates that themselves may need lowering. - */ - if (ctx->arch <= 8) { - bi_lower_fau(ctx); - } + /* Lower FAU after fusing dual texture, because fusing dual texture + * creates new immediates that themselves may need lowering. 
+ */ + if (ctx->arch <= 8) { + bi_lower_fau(ctx); + } - /* Lowering FAU can create redundant moves. Run CSE+DCE to clean up. */ - if (likely(optimize)) { - bi_opt_cse(ctx); - bi_opt_dead_code_eliminate(ctx); - } + /* Lowering FAU can create redundant moves. Run CSE+DCE to clean up. */ + if (likely(optimize)) { + bi_opt_cse(ctx); + bi_opt_dead_code_eliminate(ctx); + } - bi_validate(ctx, "Late lowering"); + bi_validate(ctx, "Late lowering"); - if (likely(!(bifrost_debug & BIFROST_DBG_NOPSCHED))) { - bi_pressure_schedule(ctx); - bi_validate(ctx, "Pre-RA scheduling"); - } + if (likely(!(bifrost_debug & BIFROST_DBG_NOPSCHED))) { + bi_pressure_schedule(ctx); + bi_validate(ctx, "Pre-RA scheduling"); + } - bi_register_allocate(ctx); + bi_register_allocate(ctx); - if (likely(optimize)) - bi_opt_post_ra(ctx); + if (likely(optimize)) + bi_opt_post_ra(ctx); - if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) - bi_print_shader(ctx, stdout); + if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) + bi_print_shader(ctx, stdout); - if (ctx->arch >= 9) { - va_assign_slots(ctx); - va_insert_flow_control_nops(ctx); - va_merge_flow(ctx); - va_mark_last(ctx); - } else { - bi_schedule(ctx); - bi_assign_scoreboard(ctx); + if (ctx->arch >= 9) { + va_assign_slots(ctx); + va_insert_flow_control_nops(ctx); + va_merge_flow(ctx); + va_mark_last(ctx); + } else { + bi_schedule(ctx); + bi_assign_scoreboard(ctx); - /* Analyze after scheduling since we depend on instruction - * order. Valhall calls as part of va_insert_flow_control_nops, - * as the handling for clauses differs from instructions. - */ - bi_analyze_helper_terminate(ctx); - bi_mark_clauses_td(ctx); - } + /* Analyze after scheduling since we depend on instruction + * order. Valhall calls as part of va_insert_flow_control_nops, + * as the handling for clauses differs from instructions. 
+ */ + bi_analyze_helper_terminate(ctx); + bi_mark_clauses_td(ctx); + } - if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) - bi_print_shader(ctx, stdout); + if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) + bi_print_shader(ctx, stdout); - if (ctx->arch <= 8) { - bi_pack_clauses(ctx, binary, offset); - } else { - bi_pack_valhall(ctx, binary); - } + if (ctx->arch <= 8) { + bi_pack_clauses(ctx, binary, offset); + } else { + bi_pack_valhall(ctx, binary); + } - if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) { - if (ctx->arch <= 8) { - disassemble_bifrost(stdout, binary->data + offset, - binary->size - offset, - bifrost_debug & BIFROST_DBG_VERBOSE); - } else { - disassemble_valhall(stdout, binary->data + offset, - binary->size - offset, - bifrost_debug & BIFROST_DBG_VERBOSE); - } + if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) { + if (ctx->arch <= 8) { + disassemble_bifrost(stdout, binary->data + offset, + binary->size - offset, + bifrost_debug & BIFROST_DBG_VERBOSE); + } else { + disassemble_valhall(stdout, binary->data + offset, + binary->size - offset, + bifrost_debug & BIFROST_DBG_VERBOSE); + } - fflush(stdout); - } + fflush(stdout); + } - if (!skip_internal && - ((bifrost_debug & BIFROST_DBG_SHADERDB) || inputs->debug)) { - char *shaderdb; + if (!skip_internal && + ((bifrost_debug & BIFROST_DBG_SHADERDB) || inputs->debug)) { + char *shaderdb; - if (ctx->arch >= 9) { - shaderdb = va_print_stats(ctx, binary->size - offset); - } else { - shaderdb = bi_print_stats(ctx, binary->size - offset); - } + if (ctx->arch >= 9) { + shaderdb = va_print_stats(ctx, binary->size - offset); + } else { + shaderdb = bi_print_stats(ctx, binary->size - offset); + } - if (bifrost_debug & BIFROST_DBG_SHADERDB) - fprintf(stderr, "SHADER-DB: %s\n", shaderdb); + if (bifrost_debug & BIFROST_DBG_SHADERDB) + fprintf(stderr, "SHADER-DB: %s\n", shaderdb); - if (inputs->debug) - util_debug_message(inputs->debug, SHADER_INFO, "%s", shaderdb); + if (inputs->debug) + util_debug_message(inputs->debug, SHADER_INFO, "%s", shaderdb); - ralloc_free(shaderdb); - } + ralloc_free(shaderdb); + } - return ctx; + return ctx; } static void @@ -5186,114 +5143,113 @@ bi_compile_variant(nir_shader *nir, const struct panfrost_compile_inputs *inputs, struct util_dynarray *binary, struct hash_table_u64 *sysval_to_id, - struct pan_shader_info *info, - enum bi_idvs_mode idvs) + struct pan_shader_info *info, enum bi_idvs_mode idvs) { - struct bi_shader_info local_info = { - .push = &info->push, - .bifrost = &info->bifrost, - .tls_size = info->tls_size, - .sysvals = &info->sysvals, - .push_offset = info->push.count, - }; + struct bi_shader_info local_info = { + .push = &info->push, + .bifrost = &info->bifrost, + .tls_size = info->tls_size, + .sysvals = &info->sysvals, + .push_offset = info->push.count, + }; - unsigned offset = binary->size; + unsigned offset = binary->size; - /* If there is no position shader (gl_Position is not written), then - * there is no need to build a varying shader either. This case is hit - * for transform feedback only vertex shaders which only make sense with - * rasterizer discard. - */ - if ((offset == 0) && (idvs == BI_IDVS_VARYING)) - return; + /* If there is no position shader (gl_Position is not written), then + * there is no need to build a varying shader either. This case is hit + * for transform feedback only vertex shaders which only make sense with + * rasterizer discard. 
+ */ + if ((offset == 0) && (idvs == BI_IDVS_VARYING)) + return; - /* Software invariant: Only a secondary shader can appear at a nonzero - * offset, to keep the ABI simple. */ - assert((offset == 0) ^ (idvs == BI_IDVS_VARYING)); + /* Software invariant: Only a secondary shader can appear at a nonzero + * offset, to keep the ABI simple. */ + assert((offset == 0) ^ (idvs == BI_IDVS_VARYING)); - bi_context *ctx = bi_compile_variant_nir(nir, inputs, binary, sysval_to_id, local_info, idvs); + bi_context *ctx = bi_compile_variant_nir(nir, inputs, binary, sysval_to_id, + local_info, idvs); - /* A register is preloaded <==> it is live before the first block */ - bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link); - uint64_t preload = first_block->reg_live_in; + /* A register is preloaded <==> it is live before the first block */ + bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link); + uint64_t preload = first_block->reg_live_in; - /* If multisampling is used with a blend shader, the blend shader needs - * to access the sample coverage mask in r60 and the sample ID in r61. - * Blend shaders run in the same context as fragment shaders, so if a - * blend shader could run, we need to preload these registers - * conservatively. There is believed to be little cost to doing so, so - * do so always to avoid variants of the preload descriptor. - * - * We only do this on Valhall, as Bifrost has to update the RSD for - * multisampling w/ blend shader anyway, so this is handled in the - * driver. We could unify the paths if the cost is acceptable. - */ - if (nir->info.stage == MESA_SHADER_FRAGMENT && ctx->arch >= 9) - preload |= BITFIELD64_BIT(60) | BITFIELD64_BIT(61); + /* If multisampling is used with a blend shader, the blend shader needs + * to access the sample coverage mask in r60 and the sample ID in r61. + * Blend shaders run in the same context as fragment shaders, so if a + * blend shader could run, we need to preload these registers + * conservatively. There is believed to be little cost to doing so, so + * do so always to avoid variants of the preload descriptor. + * + * We only do this on Valhall, as Bifrost has to update the RSD for + * multisampling w/ blend shader anyway, so this is handled in the + * driver. We could unify the paths if the cost is acceptable. 
+ */ + if (nir->info.stage == MESA_SHADER_FRAGMENT && ctx->arch >= 9) + preload |= BITFIELD64_BIT(60) | BITFIELD64_BIT(61); - info->ubo_mask |= ctx->ubo_mask; - info->tls_size = MAX2(info->tls_size, ctx->info.tls_size); + info->ubo_mask |= ctx->ubo_mask; + info->tls_size = MAX2(info->tls_size, ctx->info.tls_size); - if (idvs == BI_IDVS_VARYING) { - info->vs.secondary_enable = (binary->size > offset); - info->vs.secondary_offset = offset; - info->vs.secondary_preload = preload; - info->vs.secondary_work_reg_count = ctx->info.work_reg_count; - } else { - info->preload = preload; - info->work_reg_count = ctx->info.work_reg_count; - } + if (idvs == BI_IDVS_VARYING) { + info->vs.secondary_enable = (binary->size > offset); + info->vs.secondary_offset = offset; + info->vs.secondary_preload = preload; + info->vs.secondary_work_reg_count = ctx->info.work_reg_count; + } else { + info->preload = preload; + info->work_reg_count = ctx->info.work_reg_count; + } - if (idvs == BI_IDVS_POSITION && - !nir->info.internal && - nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ)) { - /* Find the psiz write */ - bi_instr *write = NULL; + if (idvs == BI_IDVS_POSITION && !nir->info.internal && + nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ)) { + /* Find the psiz write */ + bi_instr *write = NULL; - bi_foreach_instr_global(ctx, I) { - if (I->op == BI_OPCODE_STORE_I16 && I->seg == BI_SEG_POS) { - write = I; - break; - } - } + bi_foreach_instr_global(ctx, I) { + if (I->op == BI_OPCODE_STORE_I16 && I->seg == BI_SEG_POS) { + write = I; + break; + } + } - assert(write != NULL); + assert(write != NULL); - /* NOP it out, preserving its flow control. TODO: maybe DCE */ - if (write->flow) { - bi_builder b = bi_init_builder(ctx, bi_before_instr(write)); - bi_instr *nop = bi_nop(&b); - nop->flow = write->flow; - } + /* NOP it out, preserving its flow control. 
TODO: maybe DCE */ + if (write->flow) { + bi_builder b = bi_init_builder(ctx, bi_before_instr(write)); + bi_instr *nop = bi_nop(&b); + nop->flow = write->flow; + } - bi_remove_instruction(write); + bi_remove_instruction(write); - info->vs.no_psiz_offset = binary->size; - bi_pack_valhall(ctx, binary); - } + info->vs.no_psiz_offset = binary->size; + bi_pack_valhall(ctx, binary); + } - ralloc_free(ctx); + ralloc_free(ctx); } /* Decide if Index-Driven Vertex Shading should be used for a given shader */ static bool bi_should_idvs(nir_shader *nir, const struct panfrost_compile_inputs *inputs) { - /* Opt-out */ - if (inputs->no_idvs || bifrost_debug & BIFROST_DBG_NOIDVS) - return false; + /* Opt-out */ + if (inputs->no_idvs || bifrost_debug & BIFROST_DBG_NOIDVS) + return false; - /* IDVS splits up vertex shaders, not defined on other shader stages */ - if (nir->info.stage != MESA_SHADER_VERTEX) - return false; + /* IDVS splits up vertex shaders, not defined on other shader stages */ + if (nir->info.stage != MESA_SHADER_VERTEX) + return false; - /* Bifrost cannot write gl_PointSize during IDVS */ - if ((inputs->gpu_id < 0x9000) && - nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ)) - return false; + /* Bifrost cannot write gl_PointSize during IDVS */ + if ((inputs->gpu_id < 0x9000) && + nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ)) + return false; - /* Otherwise, IDVS is usually better */ - return true; + /* Otherwise, IDVS is usually better */ + return true; } void @@ -5302,39 +5258,38 @@ bifrost_compile_shader_nir(nir_shader *nir, struct util_dynarray *binary, struct pan_shader_info *info) { - bifrost_debug = debug_get_option_bifrost_debug(); + bifrost_debug = debug_get_option_bifrost_debug(); - bi_finalize_nir(nir, inputs->gpu_id, inputs->is_blend); - struct hash_table_u64 *sysval_to_id = - panfrost_init_sysvals(&info->sysvals, - inputs->fixed_sysval_layout, - NULL); + bi_finalize_nir(nir, inputs->gpu_id, inputs->is_blend); + struct hash_table_u64 *sysval_to_id = + panfrost_init_sysvals(&info->sysvals, inputs->fixed_sysval_layout, NULL); - info->tls_size = nir->scratch_size; - info->vs.idvs = bi_should_idvs(nir, inputs); + info->tls_size = nir->scratch_size; + info->vs.idvs = bi_should_idvs(nir, inputs); - pan_nir_collect_varyings(nir, info); + pan_nir_collect_varyings(nir, info); - if (info->vs.idvs) { - bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_POSITION); - bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_VARYING); - } else { - bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_NONE); - } + if (info->vs.idvs) { + bi_compile_variant(nir, inputs, binary, sysval_to_id, info, + BI_IDVS_POSITION); + bi_compile_variant(nir, inputs, binary, sysval_to_id, info, + BI_IDVS_VARYING); + } else { + bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_NONE); + } - if (gl_shader_stage_is_compute(nir->info.stage)) { - /* Workgroups may be merged if the structure of the workgroup is - * not software visible. This is true if neither shared memory - * nor barriers are used. The hardware may be able to optimize - * compute shaders that set this flag. - */ - info->cs.allow_merging_workgroups = - (nir->info.shared_size == 0) && - !nir->info.uses_control_barrier && - !nir->info.uses_memory_barrier; - } + if (gl_shader_stage_is_compute(nir->info.stage)) { + /* Workgroups may be merged if the structure of the workgroup is + * not software visible. This is true if neither shared memory + * nor barriers are used. 
The hardware may be able to optimize + * compute shaders that set this flag. + */ + info->cs.allow_merging_workgroups = (nir->info.shared_size == 0) && + !nir->info.uses_control_barrier && + !nir->info.uses_memory_barrier; + } - info->ubo_mask &= (1 << nir->info.num_ubos) - 1; + info->ubo_mask &= (1 << nir->info.num_ubos) - 1; - _mesa_hash_table_u64_destroy(sysval_to_id); + _mesa_hash_table_u64_destroy(sysval_to_id); } diff --git a/src/panfrost/bifrost/bifrost_compile.h b/src/panfrost/bifrost/bifrost_compile.h index c23b51afee7..69ce3ac9511 100644 --- a/src/panfrost/bifrost/bifrost_compile.h +++ b/src/panfrost/bifrost/bifrost_compile.h @@ -25,73 +25,73 @@ #define __BIFROST_PUBLIC_H_ #include "compiler/nir/nir.h" -#include "util/u_dynarray.h" #include "panfrost/util/pan_ir.h" +#include "util/u_dynarray.h" -void -bifrost_compile_shader_nir(nir_shader *nir, - const struct panfrost_compile_inputs *inputs, - struct util_dynarray *binary, - struct pan_shader_info *info); +void bifrost_compile_shader_nir(nir_shader *nir, + const struct panfrost_compile_inputs *inputs, + struct util_dynarray *binary, + struct pan_shader_info *info); static const nir_shader_compiler_options bifrost_nir_options = { - .lower_scmp = true, - .lower_flrp16 = true, - .lower_flrp32 = true, - .lower_flrp64 = true, - .lower_ffract = true, - .lower_fmod = true, - .lower_fdiv = true, - .lower_isign = true, - .lower_find_lsb = true, - .lower_ifind_msb = true, - .lower_fdph = true, - .lower_fsqrt = true, + .lower_scmp = true, + .lower_flrp16 = true, + .lower_flrp32 = true, + .lower_flrp64 = true, + .lower_ffract = true, + .lower_fmod = true, + .lower_fdiv = true, + .lower_isign = true, + .lower_find_lsb = true, + .lower_ifind_msb = true, + .lower_fdph = true, + .lower_fsqrt = true, - .lower_fsign = true, + .lower_fsign = true, - .lower_bitfield_insert_to_shifts = true, - .lower_bitfield_extract_to_shifts = true, - .lower_insert_byte = true, - .lower_rotate = true, + .lower_bitfield_insert_to_shifts = true, + .lower_bitfield_extract_to_shifts = true, + .lower_insert_byte = true, + .lower_rotate = true, - .lower_pack_half_2x16 = true, - .lower_pack_unorm_2x16 = true, - .lower_pack_snorm_2x16 = true, - .lower_pack_unorm_4x8 = true, - .lower_pack_snorm_4x8 = true, - .lower_unpack_half_2x16 = true, - .lower_unpack_unorm_2x16 = true, - .lower_unpack_snorm_2x16 = true, - .lower_unpack_unorm_4x8 = true, - .lower_unpack_snorm_4x8 = true, - .lower_pack_split = true, + .lower_pack_half_2x16 = true, + .lower_pack_unorm_2x16 = true, + .lower_pack_snorm_2x16 = true, + .lower_pack_unorm_4x8 = true, + .lower_pack_snorm_4x8 = true, + .lower_unpack_half_2x16 = true, + .lower_unpack_unorm_2x16 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_unorm_4x8 = true, + .lower_unpack_snorm_4x8 = true, + .lower_pack_split = true, - .lower_doubles_options = nir_lower_dmod, - /* TODO: Don't lower supported 64-bit operations */ - .lower_int64_options = ~0, - /* TODO: Use IMULD on v7 */ - .lower_mul_high = true, - .lower_fisnormal = true, - .lower_uadd_carry = true, - .lower_usub_borrow = true, + .lower_doubles_options = nir_lower_dmod, + /* TODO: Don't lower supported 64-bit operations */ + .lower_int64_options = ~0, + /* TODO: Use IMULD on v7 */ + .lower_mul_high = true, + .lower_fisnormal = true, + .lower_uadd_carry = true, + .lower_usub_borrow = true, - .has_fsub = true, - .has_isub = true, - .vectorize_io = true, - .vectorize_vec2_16bit = true, - .fuse_ffma16 = true, - .fuse_ffma32 = true, - .fuse_ffma64 = true, - 
.use_interpolated_input_intrinsics = true, + .has_fsub = true, + .has_isub = true, + .vectorize_io = true, + .vectorize_vec2_16bit = true, + .fuse_ffma16 = true, + .fuse_ffma32 = true, + .fuse_ffma64 = true, + .use_interpolated_input_intrinsics = true, - .lower_uniforms_to_ubo = true, + .lower_uniforms_to_ubo = true, - .has_cs_global_id = true, - .lower_cs_local_index_to_id = true, - .max_unroll_iterations = 32, - .force_indirect_unrolling = (nir_var_shader_in | nir_var_shader_out | nir_var_function_temp), - .force_indirect_unrolling_sampler = true, + .has_cs_global_id = true, + .lower_cs_local_index_to_id = true, + .max_unroll_iterations = 32, + .force_indirect_unrolling = + (nir_var_shader_in | nir_var_shader_out | nir_var_function_temp), + .force_indirect_unrolling_sampler = true, }; #endif diff --git a/src/panfrost/bifrost/bir.c b/src/panfrost/bifrost/bir.c index 92076f9c667..5836c5d4ff3 100644 --- a/src/panfrost/bifrost/bir.c +++ b/src/panfrost/bifrost/bir.c @@ -24,21 +24,21 @@ * Alyssa Rosenzweig */ -#include "compiler.h" #include "bi_builder.h" +#include "compiler.h" bool bi_has_arg(const bi_instr *ins, bi_index arg) { - if (!ins) - return false; + if (!ins) + return false; - bi_foreach_src(ins, s) { - if (bi_is_equiv(ins->src[s], arg)) - return true; - } + bi_foreach_src(ins, s) { + if (bi_is_equiv(ins->src[s], arg)) + return true; + } - return false; + return false; } /* Precondition: valid 16-bit or 32-bit register format. Returns whether it is @@ -48,131 +48,131 @@ bi_has_arg(const bi_instr *ins, bi_index arg) bool bi_is_regfmt_16(enum bi_register_format fmt) { - switch (fmt) { - case BI_REGISTER_FORMAT_F16: - case BI_REGISTER_FORMAT_S16: - case BI_REGISTER_FORMAT_U16: - return true; - case BI_REGISTER_FORMAT_F32: - case BI_REGISTER_FORMAT_S32: - case BI_REGISTER_FORMAT_U32: - case BI_REGISTER_FORMAT_AUTO: - return false; - default: - unreachable("Invalid register format"); - } + switch (fmt) { + case BI_REGISTER_FORMAT_F16: + case BI_REGISTER_FORMAT_S16: + case BI_REGISTER_FORMAT_U16: + return true; + case BI_REGISTER_FORMAT_F32: + case BI_REGISTER_FORMAT_S32: + case BI_REGISTER_FORMAT_U32: + case BI_REGISTER_FORMAT_AUTO: + return false; + default: + unreachable("Invalid register format"); + } } static unsigned bi_count_staging_registers(const bi_instr *ins) { - enum bi_sr_count count = bi_opcode_props[ins->op].sr_count; - unsigned vecsize = ins->vecsize + 1; /* XXX: off-by-one */ + enum bi_sr_count count = bi_opcode_props[ins->op].sr_count; + unsigned vecsize = ins->vecsize + 1; /* XXX: off-by-one */ - switch (count) { - case BI_SR_COUNT_0 ... BI_SR_COUNT_4: - return count; - case BI_SR_COUNT_FORMAT: - return bi_is_regfmt_16(ins->register_format) ? - DIV_ROUND_UP(vecsize, 2) : vecsize; - case BI_SR_COUNT_VECSIZE: - return vecsize; - case BI_SR_COUNT_SR_COUNT: - return ins->sr_count; - } + switch (count) { + case BI_SR_COUNT_0 ... BI_SR_COUNT_4: + return count; + case BI_SR_COUNT_FORMAT: + return bi_is_regfmt_16(ins->register_format) ? DIV_ROUND_UP(vecsize, 2) + : vecsize; + case BI_SR_COUNT_VECSIZE: + return vecsize; + case BI_SR_COUNT_SR_COUNT: + return ins->sr_count; + } - unreachable("Invalid sr_count"); + unreachable("Invalid sr_count"); } unsigned bi_count_read_registers(const bi_instr *ins, unsigned s) { - /* ATOM reads 1 but writes 2. Exception for ACMPXCHG */ - if (s == 0 && ins->op == BI_OPCODE_ATOM_RETURN_I32) - return (ins->atom_opc == BI_ATOM_OPC_ACMPXCHG) ? 
2 : 1; - else if (s == 0 && bi_opcode_props[ins->op].sr_read) - return bi_count_staging_registers(ins); - else if (s == 4 && ins->op == BI_OPCODE_BLEND) - return ins->sr_count_2; /* Dual source blending */ - else if (s == 0 && ins->op == BI_OPCODE_SPLIT_I32) - return ins->nr_dests; - else - return 1; + /* ATOM reads 1 but writes 2. Exception for ACMPXCHG */ + if (s == 0 && ins->op == BI_OPCODE_ATOM_RETURN_I32) + return (ins->atom_opc == BI_ATOM_OPC_ACMPXCHG) ? 2 : 1; + else if (s == 0 && bi_opcode_props[ins->op].sr_read) + return bi_count_staging_registers(ins); + else if (s == 4 && ins->op == BI_OPCODE_BLEND) + return ins->sr_count_2; /* Dual source blending */ + else if (s == 0 && ins->op == BI_OPCODE_SPLIT_I32) + return ins->nr_dests; + else + return 1; } unsigned bi_count_write_registers(const bi_instr *ins, unsigned d) { - if (d == 0 && bi_opcode_props[ins->op].sr_write) { - switch (ins->op) { - case BI_OPCODE_TEXC: - case BI_OPCODE_TEXC_DUAL: - if (ins->sr_count_2) - return ins->sr_count; - else - return bi_is_regfmt_16(ins->register_format) ? 2 : 4; + if (d == 0 && bi_opcode_props[ins->op].sr_write) { + switch (ins->op) { + case BI_OPCODE_TEXC: + case BI_OPCODE_TEXC_DUAL: + if (ins->sr_count_2) + return ins->sr_count; + else + return bi_is_regfmt_16(ins->register_format) ? 2 : 4; - case BI_OPCODE_TEX_SINGLE: - case BI_OPCODE_TEX_FETCH: - case BI_OPCODE_TEX_GATHER: { - unsigned chans = util_bitcount(ins->write_mask); + case BI_OPCODE_TEX_SINGLE: + case BI_OPCODE_TEX_FETCH: + case BI_OPCODE_TEX_GATHER: { + unsigned chans = util_bitcount(ins->write_mask); - return bi_is_regfmt_16(ins->register_format) ? - DIV_ROUND_UP(chans, 2) : chans; - } + return bi_is_regfmt_16(ins->register_format) ? DIV_ROUND_UP(chans, 2) + : chans; + } - case BI_OPCODE_ACMPXCHG_I32: - /* Reads 2 but writes 1 */ - return 1; + case BI_OPCODE_ACMPXCHG_I32: + /* Reads 2 but writes 1 */ + return 1; - case BI_OPCODE_ATOM1_RETURN_I32: - /* Allow omitting the destination for plain ATOM1 */ - return bi_is_null(ins->dest[0]) ? 0 : ins->sr_count; - default: - return bi_count_staging_registers(ins); - } - } else if (ins->op == BI_OPCODE_SEG_ADD_I64) { - return 2; - } else if (ins->op == BI_OPCODE_TEXC_DUAL && d == 1) { - return ins->sr_count_2; - } else if (ins->op == BI_OPCODE_COLLECT_I32 && d == 0) { - return ins->nr_srcs; - } + case BI_OPCODE_ATOM1_RETURN_I32: + /* Allow omitting the destination for plain ATOM1 */ + return bi_is_null(ins->dest[0]) ? 
0 : ins->sr_count; + default: + return bi_count_staging_registers(ins); + } + } else if (ins->op == BI_OPCODE_SEG_ADD_I64) { + return 2; + } else if (ins->op == BI_OPCODE_TEXC_DUAL && d == 1) { + return ins->sr_count_2; + } else if (ins->op == BI_OPCODE_COLLECT_I32 && d == 0) { + return ins->nr_srcs; + } - return 1; + return 1; } unsigned bi_writemask(const bi_instr *ins, unsigned d) { - unsigned mask = BITFIELD_MASK(bi_count_write_registers(ins, d)); - unsigned shift = ins->dest[d].offset; - return (mask << shift); + unsigned mask = BITFIELD_MASK(bi_count_write_registers(ins, d)); + unsigned shift = ins->dest[d].offset; + return (mask << shift); } bi_clause * bi_next_clause(bi_context *ctx, bi_block *block, bi_clause *clause) { - if (!block && !clause) - return NULL; + if (!block && !clause) + return NULL; - /* Try the first clause in this block if we're starting from scratch */ - if (!clause && !list_is_empty(&block->clauses)) - return list_first_entry(&block->clauses, bi_clause, link); + /* Try the first clause in this block if we're starting from scratch */ + if (!clause && !list_is_empty(&block->clauses)) + return list_first_entry(&block->clauses, bi_clause, link); - /* Try the next clause in this block */ - if (clause && clause->link.next != &block->clauses) - return list_first_entry(&(clause->link), bi_clause, link); + /* Try the next clause in this block */ + if (clause && clause->link.next != &block->clauses) + return list_first_entry(&(clause->link), bi_clause, link); - /* Try the next block, or the one after that if it's empty, etc .*/ - bi_block *next_block = bi_next_block(block); + /* Try the next block, or the one after that if it's empty, etc .*/ + bi_block *next_block = bi_next_block(block); - bi_foreach_block_from(ctx, next_block, block) { - if (!list_is_empty(&block->clauses)) - return list_first_entry(&block->clauses, bi_clause, link); - } + bi_foreach_block_from(ctx, next_block, block) { + if (!list_is_empty(&block->clauses)) + return list_first_entry(&block->clauses, bi_clause, link); + } - return NULL; + return NULL; } /* Does an instruction have a side effect not captured by its register @@ -184,41 +184,41 @@ bi_next_clause(bi_context *ctx, bi_block *block, bi_clause *clause) bool bi_side_effects(const bi_instr *I) { - if (bi_opcode_props[I->op].last) - return true; + if (bi_opcode_props[I->op].last) + return true; - switch (I->op) { - case BI_OPCODE_DISCARD_F32: - case BI_OPCODE_DISCARD_B32: - return true; - default: - break; - } + switch (I->op) { + case BI_OPCODE_DISCARD_F32: + case BI_OPCODE_DISCARD_B32: + return true; + default: + break; + } - switch (bi_opcode_props[I->op].message) { - case BIFROST_MESSAGE_NONE: - case BIFROST_MESSAGE_VARYING: - case BIFROST_MESSAGE_ATTRIBUTE: - case BIFROST_MESSAGE_TEX: - case BIFROST_MESSAGE_VARTEX: - case BIFROST_MESSAGE_LOAD: - case BIFROST_MESSAGE_64BIT: - return false; + switch (bi_opcode_props[I->op].message) { + case BIFROST_MESSAGE_NONE: + case BIFROST_MESSAGE_VARYING: + case BIFROST_MESSAGE_ATTRIBUTE: + case BIFROST_MESSAGE_TEX: + case BIFROST_MESSAGE_VARTEX: + case BIFROST_MESSAGE_LOAD: + case BIFROST_MESSAGE_64BIT: + return false; - case BIFROST_MESSAGE_STORE: - case BIFROST_MESSAGE_ATOMIC: - case BIFROST_MESSAGE_BARRIER: - case BIFROST_MESSAGE_BLEND: - case BIFROST_MESSAGE_Z_STENCIL: - case BIFROST_MESSAGE_ATEST: - case BIFROST_MESSAGE_JOB: - return true; + case BIFROST_MESSAGE_STORE: + case BIFROST_MESSAGE_ATOMIC: + case BIFROST_MESSAGE_BARRIER: + case BIFROST_MESSAGE_BLEND: + case BIFROST_MESSAGE_Z_STENCIL: + 
case BIFROST_MESSAGE_ATEST: + case BIFROST_MESSAGE_JOB: + return true; - case BIFROST_MESSAGE_TILE: - return (I->op != BI_OPCODE_LD_TILE); - } + case BIFROST_MESSAGE_TILE: + return (I->op != BI_OPCODE_LD_TILE); + } - unreachable("Invalid message type"); + unreachable("Invalid message type"); } /* Branch reconvergence is required when the execution mask may change @@ -230,10 +230,10 @@ bi_side_effects(const bi_instr *I) bool bi_reconverge_branches(bi_block *block) { - if (bi_num_successors(block) == 1) - return bi_num_predecessors(block->successors[0]) > 1; - else - return true; + if (bi_num_successors(block) == 1) + return bi_num_predecessors(block->successors[0]) > 1; + else + return true; } /* @@ -252,42 +252,41 @@ bi_reconverge_branches(bi_block *block) bool bi_can_replace_with_csel(bi_instr *I) { - return ((I->op == BI_OPCODE_MUX_I32) || (I->op == BI_OPCODE_MUX_V2I16)) && - (I->mux != BI_MUX_BIT) && - (I->src[0].swizzle == BI_SWIZZLE_H01) && - (I->src[1].swizzle == BI_SWIZZLE_H01) && - (I->src[2].swizzle == BI_SWIZZLE_H01); + return ((I->op == BI_OPCODE_MUX_I32) || (I->op == BI_OPCODE_MUX_V2I16)) && + (I->mux != BI_MUX_BIT) && (I->src[0].swizzle == BI_SWIZZLE_H01) && + (I->src[1].swizzle == BI_SWIZZLE_H01) && + (I->src[2].swizzle == BI_SWIZZLE_H01); } static enum bi_opcode bi_csel_for_mux(bool must_sign, bool b32, enum bi_mux mux) { - switch (mux) { - case BI_MUX_INT_ZERO: - if (must_sign) - return b32 ? BI_OPCODE_CSEL_U32 : BI_OPCODE_CSEL_V2U16; - else - return b32 ? BI_OPCODE_CSEL_I32 : BI_OPCODE_CSEL_V2I16; - case BI_MUX_NEG: - return b32 ? BI_OPCODE_CSEL_S32 : BI_OPCODE_CSEL_V2S16; - case BI_MUX_FP_ZERO: - return b32 ? BI_OPCODE_CSEL_F32 : BI_OPCODE_CSEL_V2F16; - default: - unreachable("No CSEL for MUX.bit"); - } + switch (mux) { + case BI_MUX_INT_ZERO: + if (must_sign) + return b32 ? BI_OPCODE_CSEL_U32 : BI_OPCODE_CSEL_V2U16; + else + return b32 ? BI_OPCODE_CSEL_I32 : BI_OPCODE_CSEL_V2I16; + case BI_MUX_NEG: + return b32 ? BI_OPCODE_CSEL_S32 : BI_OPCODE_CSEL_V2S16; + case BI_MUX_FP_ZERO: + return b32 ? BI_OPCODE_CSEL_F32 : BI_OPCODE_CSEL_V2F16; + default: + unreachable("No CSEL for MUX.bit"); + } } bi_instr * bi_csel_from_mux(bi_builder *b, const bi_instr *I, bool must_sign) { - assert(I->op == BI_OPCODE_MUX_I32 || I->op == BI_OPCODE_MUX_V2I16); + assert(I->op == BI_OPCODE_MUX_I32 || I->op == BI_OPCODE_MUX_V2I16); - /* Build a new CSEL */ - enum bi_cmpf cmpf = (I->mux == BI_MUX_NEG) ? BI_CMPF_LT : BI_CMPF_EQ; - bi_instr *csel = bi_csel_u32_to(b, I->dest[0], I->src[2], bi_zero(), - I->src[0], I->src[1], cmpf); + /* Build a new CSEL */ + enum bi_cmpf cmpf = (I->mux == BI_MUX_NEG) ? 
BI_CMPF_LT : BI_CMPF_EQ; + bi_instr *csel = bi_csel_u32_to(b, I->dest[0], I->src[2], bi_zero(), + I->src[0], I->src[1], cmpf); - /* Fixup the opcode and use it */ - csel->op = bi_csel_for_mux(must_sign, I->op == BI_OPCODE_MUX_I32, I->mux); - return csel; + /* Fixup the opcode and use it */ + csel->op = bi_csel_for_mux(must_sign, I->op == BI_OPCODE_MUX_I32, I->mux); + return csel; } diff --git a/src/panfrost/bifrost/cmdline.c b/src/panfrost/bifrost/cmdline.c index 2a11486cbed..5dc5b73eab8 100644 --- a/src/panfrost/bifrost/cmdline.c +++ b/src/panfrost/bifrost/cmdline.c @@ -26,15 +26,15 @@ #include #include -#include "disassemble.h" #include "valhall/disassemble.h" #include "compiler.h" +#include "disassemble.h" -#include "main/mtypes.h" -#include "compiler/glsl/standalone.h" -#include "compiler/glsl/glsl_to_nir.h" #include "compiler/glsl/gl_nir.h" +#include "compiler/glsl/glsl_to_nir.h" +#include "compiler/glsl/standalone.h" #include "compiler/nir_types.h" +#include "main/mtypes.h" #include "util/u_dynarray.h" #include "bifrost_compile.h" @@ -44,25 +44,25 @@ int verbose = 0; static gl_shader_stage filename_to_stage(const char *stage) { - const char *ext = strrchr(stage, '.'); + const char *ext = strrchr(stage, '.'); - if (ext == NULL) { - fprintf(stderr, "No extension found in %s\n", stage); - exit(1); - } + if (ext == NULL) { + fprintf(stderr, "No extension found in %s\n", stage); + exit(1); + } - if (!strcmp(ext, ".cs") || !strcmp(ext, ".comp")) - return MESA_SHADER_COMPUTE; - else if (!strcmp(ext, ".vs") || !strcmp(ext, ".vert")) - return MESA_SHADER_VERTEX; - else if (!strcmp(ext, ".fs") || !strcmp(ext, ".frag")) - return MESA_SHADER_FRAGMENT; - else { - fprintf(stderr, "Invalid extension %s\n", ext); - exit(1); - } + if (!strcmp(ext, ".cs") || !strcmp(ext, ".comp")) + return MESA_SHADER_COMPUTE; + else if (!strcmp(ext, ".vs") || !strcmp(ext, ".vert")) + return MESA_SHADER_VERTEX; + else if (!strcmp(ext, ".fs") || !strcmp(ext, ".frag")) + return MESA_SHADER_FRAGMENT; + else { + fprintf(stderr, "Invalid extension %s\n", ext); + exit(1); + } - unreachable("Should've returned or bailed"); + unreachable("Should've returned or bailed"); } static int @@ -80,7 +80,7 @@ glsl_type_size(const struct glsl_type *type, bool bindless) static void insert_sorted(struct exec_list *var_list, nir_variable *new_var) { - nir_foreach_variable_in_list (var, var_list) { + nir_foreach_variable_in_list(var, var_list) { if (var->data.location > new_var->data.location) { exec_node_insert_node_before(&var->node, &new_var->node); return; @@ -94,7 +94,7 @@ sort_varyings(nir_shader *nir, nir_variable_mode mode) { struct exec_list new_list; exec_list_make_empty(&new_list); - nir_foreach_variable_with_modes_safe (var, nir, mode) { + nir_foreach_variable_with_modes_safe(var, nir, mode) { exec_node_remove(&var->node); insert_sorted(&new_list, var); } @@ -104,7 +104,7 @@ sort_varyings(nir_shader *nir, nir_variable_mode mode) static void fixup_varying_slots(nir_shader *nir, nir_variable_mode mode) { - nir_foreach_variable_with_modes (var, nir, mode) { + nir_foreach_variable_with_modes(var, nir, mode) { if (var->data.location >= VARYING_SLOT_VAR0) { var->data.location += 9; } else if ((var->data.location >= VARYING_SLOT_TEX0) && @@ -117,228 +117,219 @@ fixup_varying_slots(nir_shader *nir, nir_variable_mode mode) static void compile_shader(int stages, char **files) { - struct gl_shader_program *prog; - nir_shader *nir[MESA_SHADER_COMPUTE + 1]; - unsigned shader_types[MESA_SHADER_COMPUTE + 1]; + struct gl_shader_program *prog; 
+ nir_shader *nir[MESA_SHADER_COMPUTE + 1]; + unsigned shader_types[MESA_SHADER_COMPUTE + 1]; - if (stages > MESA_SHADER_COMPUTE) { - fprintf(stderr, "Too many stages"); - exit(1); - } + if (stages > MESA_SHADER_COMPUTE) { + fprintf(stderr, "Too many stages"); + exit(1); + } - for (unsigned i = 0; i < stages; ++i) - shader_types[i] = filename_to_stage(files[i]); + for (unsigned i = 0; i < stages; ++i) + shader_types[i] = filename_to_stage(files[i]); - struct standalone_options options = { - .glsl_version = 300, /* ES - needed for precision */ - .do_link = true, - .lower_precision = true - }; + struct standalone_options options = { + .glsl_version = 300, /* ES - needed for precision */ + .do_link = true, + .lower_precision = true}; - static struct gl_context local_ctx; + static struct gl_context local_ctx; - prog = standalone_compile_shader(&options, stages, files, &local_ctx); + prog = standalone_compile_shader(&options, stages, files, &local_ctx); - for (unsigned i = 0; i < stages; ++i) { - gl_shader_stage stage = shader_types[i]; - prog->_LinkedShaders[stage]->Program->info.stage = stage; - } + for (unsigned i = 0; i < stages; ++i) { + gl_shader_stage stage = shader_types[i]; + prog->_LinkedShaders[stage]->Program->info.stage = stage; + } - struct util_dynarray binary; + struct util_dynarray binary; - util_dynarray_init(&binary, NULL); + util_dynarray_init(&binary, NULL); - for (unsigned i = 0; i < stages; ++i) { - nir[i] = glsl_to_nir(&local_ctx.Const, prog, shader_types[i], &bifrost_nir_options); + for (unsigned i = 0; i < stages; ++i) { + nir[i] = glsl_to_nir(&local_ctx.Const, prog, shader_types[i], + &bifrost_nir_options); - if (shader_types[i] == MESA_SHADER_VERTEX) { - nir_assign_var_locations(nir[i], nir_var_shader_in, &nir[i]->num_inputs, - glsl_type_size); - sort_varyings(nir[i], nir_var_shader_out); - nir_assign_var_locations(nir[i], nir_var_shader_out, &nir[i]->num_outputs, - glsl_type_size); - fixup_varying_slots(nir[i], nir_var_shader_out); - } else if (shader_types[i] == MESA_SHADER_FRAGMENT) { - sort_varyings(nir[i], nir_var_shader_in); - nir_assign_var_locations(nir[i], nir_var_shader_in, &nir[i]->num_inputs, - glsl_type_size); - fixup_varying_slots(nir[i], nir_var_shader_in); - nir_assign_var_locations(nir[i], nir_var_shader_out, &nir[i]->num_outputs, - glsl_type_size); - } + if (shader_types[i] == MESA_SHADER_VERTEX) { + nir_assign_var_locations(nir[i], nir_var_shader_in, + &nir[i]->num_inputs, glsl_type_size); + sort_varyings(nir[i], nir_var_shader_out); + nir_assign_var_locations(nir[i], nir_var_shader_out, + &nir[i]->num_outputs, glsl_type_size); + fixup_varying_slots(nir[i], nir_var_shader_out); + } else if (shader_types[i] == MESA_SHADER_FRAGMENT) { + sort_varyings(nir[i], nir_var_shader_in); + nir_assign_var_locations(nir[i], nir_var_shader_in, + &nir[i]->num_inputs, glsl_type_size); + fixup_varying_slots(nir[i], nir_var_shader_in); + nir_assign_var_locations(nir[i], nir_var_shader_out, + &nir[i]->num_outputs, glsl_type_size); + } - nir_assign_var_locations(nir[i], nir_var_uniform, &nir[i]->num_uniforms, - glsl_type_size); + nir_assign_var_locations(nir[i], nir_var_uniform, &nir[i]->num_uniforms, + glsl_type_size); - NIR_PASS_V(nir[i], nir_lower_global_vars_to_local); - NIR_PASS_V(nir[i], nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir[i]), true, i == 0); - NIR_PASS_V(nir[i], nir_opt_copy_prop_vars); - NIR_PASS_V(nir[i], nir_opt_combine_stores, nir_var_all); + NIR_PASS_V(nir[i], nir_lower_global_vars_to_local); + NIR_PASS_V(nir[i], 
nir_lower_io_to_temporaries, + nir_shader_get_entrypoint(nir[i]), true, i == 0); + NIR_PASS_V(nir[i], nir_opt_copy_prop_vars); + NIR_PASS_V(nir[i], nir_opt_combine_stores, nir_var_all); - NIR_PASS_V(nir[i], nir_lower_system_values); - NIR_PASS_V(nir[i], gl_nir_lower_samplers, prog); - NIR_PASS_V(nir[i], nir_split_var_copies); - NIR_PASS_V(nir[i], nir_lower_var_copies); + NIR_PASS_V(nir[i], nir_lower_system_values); + NIR_PASS_V(nir[i], gl_nir_lower_samplers, prog); + NIR_PASS_V(nir[i], nir_split_var_copies); + NIR_PASS_V(nir[i], nir_lower_var_copies); - NIR_PASS_V(nir[i], nir_lower_io, nir_var_uniform, - st_packed_uniforms_type_size, - (nir_lower_io_options)0); - NIR_PASS_V(nir[i], nir_lower_uniforms_to_ubo, true, false); + NIR_PASS_V(nir[i], nir_lower_io, nir_var_uniform, + st_packed_uniforms_type_size, (nir_lower_io_options)0); + NIR_PASS_V(nir[i], nir_lower_uniforms_to_ubo, true, false); - /* before buffers and vars_to_ssa */ - NIR_PASS_V(nir[i], gl_nir_lower_images, true); + /* before buffers and vars_to_ssa */ + NIR_PASS_V(nir[i], gl_nir_lower_images, true); - NIR_PASS_V(nir[i], gl_nir_lower_buffers, prog); - NIR_PASS_V(nir[i], nir_opt_constant_folding); + NIR_PASS_V(nir[i], gl_nir_lower_buffers, prog); + NIR_PASS_V(nir[i], nir_opt_constant_folding); - struct panfrost_compile_inputs inputs = { - .gpu_id = gpu_id, - .fixed_sysval_ubo = -1, - }; - struct pan_shader_info info = { 0 }; + struct panfrost_compile_inputs inputs = { + .gpu_id = gpu_id, + .fixed_sysval_ubo = -1, + }; + struct pan_shader_info info = {0}; - util_dynarray_clear(&binary); - bifrost_compile_shader_nir(nir[i], &inputs, &binary, &info); + util_dynarray_clear(&binary); + bifrost_compile_shader_nir(nir[i], &inputs, &binary, &info); - char *fn = NULL; - asprintf(&fn, "shader_%u.bin", i); - assert(fn != NULL); - FILE *fp = fopen(fn, "wb"); - fwrite(binary.data, 1, binary.size, fp); - fclose(fp); - free(fn); - } + char *fn = NULL; + asprintf(&fn, "shader_%u.bin", i); + assert(fn != NULL); + FILE *fp = fopen(fn, "wb"); + fwrite(binary.data, 1, binary.size, fp); + fclose(fp); + free(fn); + } - util_dynarray_fini(&binary); + util_dynarray_fini(&binary); } -#define BI_FOURCC(ch0, ch1, ch2, ch3) ( \ - (uint32_t)(ch0) | (uint32_t)(ch1) << 8 | \ - (uint32_t)(ch2) << 16 | (uint32_t)(ch3) << 24) +#define BI_FOURCC(ch0, ch1, ch2, ch3) \ + ((uint32_t)(ch0) | (uint32_t)(ch1) << 8 | (uint32_t)(ch2) << 16 | \ + (uint32_t)(ch3) << 24) static void disassemble(const char *filename) { - FILE *fp = fopen(filename, "rb"); - assert(fp); + FILE *fp = fopen(filename, "rb"); + assert(fp); - fseek(fp, 0, SEEK_END); - unsigned filesize = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + unsigned filesize = ftell(fp); + rewind(fp); - uint32_t *code = malloc(filesize); - unsigned res = fread(code, 1, filesize, fp); - if (res != filesize) { - printf("Couldn't read full file\n"); - } + uint32_t *code = malloc(filesize); + unsigned res = fread(code, 1, filesize, fp); + if (res != filesize) { + printf("Couldn't read full file\n"); + } - fclose(fp); + fclose(fp); - void *entrypoint = code; + void *entrypoint = code; - if (filesize && code[0] == BI_FOURCC('M', 'B', 'S', '2')) { - for (int i = 0; i < filesize / 4; ++i) { - if (code[i] != BI_FOURCC('O', 'B', 'J', 'C')) - continue; + if (filesize && code[0] == BI_FOURCC('M', 'B', 'S', '2')) { + for (int i = 0; i < filesize / 4; ++i) { + if (code[i] != BI_FOURCC('O', 'B', 'J', 'C')) + continue; - unsigned size = code[i + 1]; - unsigned offset = i + 2; + unsigned size = code[i + 1]; + unsigned offset = i + 
2; - entrypoint = code + offset; - filesize = size; - } - } + entrypoint = code + offset; + filesize = size; + } + } - if ((gpu_id >> 12) >= 9) - disassemble_valhall(stdout, entrypoint, filesize, verbose); - else - disassemble_bifrost(stdout, entrypoint, filesize, verbose); + if ((gpu_id >> 12) >= 9) + disassemble_valhall(stdout, entrypoint, filesize, verbose); + else + disassemble_bifrost(stdout, entrypoint, filesize, verbose); - free(code); + free(code); } int main(int argc, char **argv) { - int c; + int c; - if (argc < 2) { - printf("Pass a command\n"); - exit(1); - } + if (argc < 2) { + printf("Pass a command\n"); + exit(1); + } - static struct option longopts[] = { - { "id", optional_argument, NULL, 'i' }, - { "gpu", optional_argument, NULL, 'g' }, - { "verbose", no_argument, &verbose, 'v' }, - { NULL, 0, NULL, 0 } - }; + static struct option longopts[] = {{"id", optional_argument, NULL, 'i'}, + {"gpu", optional_argument, NULL, 'g'}, + {"verbose", no_argument, &verbose, 'v'}, + {NULL, 0, NULL, 0}}; - static struct { - const char *name; - unsigned major, minor; - } gpus[] = { - { "G71", 6, 0 }, - { "G72", 6, 2 }, - { "G51", 7, 0 }, - { "G76", 7, 1 }, - { "G52", 7, 2 }, - { "G31", 7, 3 }, - { "G77", 9, 0 }, - { "G57", 9, 1 }, - { "G78", 9, 2 }, - { "G57", 9, 3 }, - { "G68", 9, 4 }, - { "G78AE", 9, 5 }, - }; + static struct { + const char *name; + unsigned major, minor; + } gpus[] = { + {"G71", 6, 0}, {"G72", 6, 2}, {"G51", 7, 0}, {"G76", 7, 1}, + {"G52", 7, 2}, {"G31", 7, 3}, {"G77", 9, 0}, {"G57", 9, 1}, + {"G78", 9, 2}, {"G57", 9, 3}, {"G68", 9, 4}, {"G78AE", 9, 5}, + }; - while ((c = getopt_long(argc, argv, "v:", longopts, NULL)) != -1) { + while ((c = getopt_long(argc, argv, "v:", longopts, NULL)) != -1) { - switch (c) { - case 'i': - gpu_id = atoi(optarg); + switch (c) { + case 'i': + gpu_id = atoi(optarg); - if (!gpu_id) { - fprintf(stderr, "Expected GPU ID, got %s\n", optarg); - return 1; - } + if (!gpu_id) { + fprintf(stderr, "Expected GPU ID, got %s\n", optarg); + return 1; + } - break; - case 'g': - gpu_id = 0; + break; + case 'g': + gpu_id = 0; - /* Compatibility with the Arm compiler */ - if (strncmp(optarg, "Mali-", 5) == 0) optarg += 5; + /* Compatibility with the Arm compiler */ + if (strncmp(optarg, "Mali-", 5) == 0) + optarg += 5; - for (unsigned i = 0; i < ARRAY_SIZE(gpus); ++i) { - if (strcmp(gpus[i].name, optarg)) continue; + for (unsigned i = 0; i < ARRAY_SIZE(gpus); ++i) { + if (strcmp(gpus[i].name, optarg)) + continue; - unsigned major = gpus[i].major; - unsigned minor = gpus[i].minor; + unsigned major = gpus[i].major; + unsigned minor = gpus[i].minor; - gpu_id = (major << 12) | (minor << 8); - break; - } + gpu_id = (major << 12) | (minor << 8); + break; + } - if (!gpu_id) { - fprintf(stderr, "Unknown GPU %s\n", optarg); - return 1; - } + if (!gpu_id) { + fprintf(stderr, "Unknown GPU %s\n", optarg); + return 1; + } - break; - default: - break; - } - } + break; + default: + break; + } + } - if (strcmp(argv[optind], "compile") == 0) - compile_shader(argc - optind - 1, &argv[optind + 1]); - else if (strcmp(argv[optind], "disasm") == 0) - disassemble(argv[optind + 1]); - else { - fprintf(stderr, "Unknown command. Valid: compile/disasm\n"); - return 1; - } + if (strcmp(argv[optind], "compile") == 0) + compile_shader(argc - optind - 1, &argv[optind + 1]); + else if (strcmp(argv[optind], "disasm") == 0) + disassemble(argv[optind + 1]); + else { + fprintf(stderr, "Unknown command. 
Valid: compile/disasm\n"); + return 1; + } - return 0; + return 0; } diff --git a/src/panfrost/bifrost/compiler.h b/src/panfrost/bifrost/compiler.h index 1502560bd18..cae4dd2e351 100644 --- a/src/panfrost/bifrost/compiler.h +++ b/src/panfrost/bifrost/compiler.h @@ -27,13 +27,13 @@ #ifndef __BIFROST_COMPILER_H #define __BIFROST_COMPILER_H -#include "bifrost.h" -#include "bi_opcodes.h" #include "compiler/nir/nir.h" #include "panfrost/util/pan_ir.h" -#include "util/u_math.h" #include "util/half_float.h" +#include "util/u_math.h" #include "util/u_worklist.h" +#include "bi_opcodes.h" +#include "bifrost.h" #ifdef __cplusplus extern "C" { @@ -49,25 +49,25 @@ extern "C" { */ enum bi_swizzle { - /* 16-bit swizzle ordering deliberate for fast compute */ - BI_SWIZZLE_H00 = 0, /* = B0101 */ - BI_SWIZZLE_H01 = 1, /* = B0123 = W0 */ - BI_SWIZZLE_H10 = 2, /* = B2301 */ - BI_SWIZZLE_H11 = 3, /* = B2323 */ + /* 16-bit swizzle ordering deliberate for fast compute */ + BI_SWIZZLE_H00 = 0, /* = B0101 */ + BI_SWIZZLE_H01 = 1, /* = B0123 = W0 */ + BI_SWIZZLE_H10 = 2, /* = B2301 */ + BI_SWIZZLE_H11 = 3, /* = B2323 */ - /* replication order should be maintained for fast compute */ - BI_SWIZZLE_B0000 = 4, /* single channel (replicate) */ - BI_SWIZZLE_B1111 = 5, - BI_SWIZZLE_B2222 = 6, - BI_SWIZZLE_B3333 = 7, + /* replication order should be maintained for fast compute */ + BI_SWIZZLE_B0000 = 4, /* single channel (replicate) */ + BI_SWIZZLE_B1111 = 5, + BI_SWIZZLE_B2222 = 6, + BI_SWIZZLE_B3333 = 7, - /* totally special for explicit pattern matching */ - BI_SWIZZLE_B0011 = 8, /* +SWZ.v4i8 */ - BI_SWIZZLE_B2233 = 9, /* +SWZ.v4i8 */ - BI_SWIZZLE_B1032 = 10, /* +SWZ.v4i8 */ - BI_SWIZZLE_B3210 = 11, /* +SWZ.v4i8 */ + /* totally special for explicit pattern matching */ + BI_SWIZZLE_B0011 = 8, /* +SWZ.v4i8 */ + BI_SWIZZLE_B2233 = 9, /* +SWZ.v4i8 */ + BI_SWIZZLE_B1032 = 10, /* +SWZ.v4i8 */ + BI_SWIZZLE_B3210 = 11, /* +SWZ.v4i8 */ - BI_SWIZZLE_B0022 = 12, /* for b02 lanes */ + BI_SWIZZLE_B0022 = 12, /* for b02 lanes */ }; /* Given a packed i16vec2/i8vec4 constant, apply a swizzle. 
Useful for constant @@ -76,26 +76,39 @@ enum bi_swizzle { static inline uint32_t bi_apply_swizzle(uint32_t value, enum bi_swizzle swz) { - const uint16_t *h = (const uint16_t *) &value; - const uint8_t *b = (const uint8_t *) &value; + const uint16_t *h = (const uint16_t *)&value; + const uint8_t *b = (const uint8_t *)&value; -#define H(h0, h1) (h[h0] | (h[h1] << 16)) +#define H(h0, h1) (h[h0] | (h[h1] << 16)) #define B(b0, b1, b2, b3) (b[b0] | (b[b1] << 8) | (b[b2] << 16) | (b[b3] << 24)) switch (swz) { - case BI_SWIZZLE_H00: return H(0, 0); - case BI_SWIZZLE_H01: return H(0, 1); - case BI_SWIZZLE_H10: return H(1, 0); - case BI_SWIZZLE_H11: return H(1, 1); - case BI_SWIZZLE_B0000: return B(0, 0, 0, 0); - case BI_SWIZZLE_B1111: return B(1, 1, 1, 1); - case BI_SWIZZLE_B2222: return B(2, 2, 2, 2); - case BI_SWIZZLE_B3333: return B(3, 3, 3, 3); - case BI_SWIZZLE_B0011: return B(0, 0, 1, 1); - case BI_SWIZZLE_B2233: return B(2, 2, 3, 3); - case BI_SWIZZLE_B1032: return B(1, 0, 3, 2); - case BI_SWIZZLE_B3210: return B(3, 2, 1, 0); - case BI_SWIZZLE_B0022: return B(0, 0, 2, 2); + case BI_SWIZZLE_H00: + return H(0, 0); + case BI_SWIZZLE_H01: + return H(0, 1); + case BI_SWIZZLE_H10: + return H(1, 0); + case BI_SWIZZLE_H11: + return H(1, 1); + case BI_SWIZZLE_B0000: + return B(0, 0, 0, 0); + case BI_SWIZZLE_B1111: + return B(1, 1, 1, 1); + case BI_SWIZZLE_B2222: + return B(2, 2, 2, 2); + case BI_SWIZZLE_B3333: + return B(3, 3, 3, 3); + case BI_SWIZZLE_B0011: + return B(0, 0, 1, 1); + case BI_SWIZZLE_B2233: + return B(2, 2, 3, 3); + case BI_SWIZZLE_B1032: + return B(1, 0, 3, 2); + case BI_SWIZZLE_B3210: + return B(3, 2, 1, 0); + case BI_SWIZZLE_B0022: + return B(0, 0, 2, 2); } #undef H @@ -105,148 +118,148 @@ bi_apply_swizzle(uint32_t value, enum bi_swizzle swz) } enum bi_index_type { - BI_INDEX_NULL = 0, - BI_INDEX_NORMAL = 1, - BI_INDEX_REGISTER = 2, - BI_INDEX_CONSTANT = 3, - BI_INDEX_PASS = 4, - BI_INDEX_FAU = 5 + BI_INDEX_NULL = 0, + BI_INDEX_NORMAL = 1, + BI_INDEX_REGISTER = 2, + BI_INDEX_CONSTANT = 3, + BI_INDEX_PASS = 4, + BI_INDEX_FAU = 5 }; typedef struct { - uint32_t value; + uint32_t value; - /* modifiers, should only be set if applicable for a given instruction. - * For *IDP.v4i8, abs plays the role of sign. For bitwise ops where - * applicable, neg plays the role of not */ - bool abs : 1; - bool neg : 1; + /* modifiers, should only be set if applicable for a given instruction. + * For *IDP.v4i8, abs plays the role of sign. For bitwise ops where + * applicable, neg plays the role of not */ + bool abs : 1; + bool neg : 1; - /* The last use of a value, should be purged from the register cache. - * Set by liveness analysis. */ - bool discard : 1; + /* The last use of a value, should be purged from the register cache. + * Set by liveness analysis. */ + bool discard : 1; - /* For a source, the swizzle. For a destination, acts a bit like a - * write mask. Identity for the full 32-bit, H00 for only caring about - * the lower half, other values unused. */ - enum bi_swizzle swizzle : 4; - uint32_t offset : 3; - enum bi_index_type type : 3; + /* For a source, the swizzle. For a destination, acts a bit like a + * write mask. Identity for the full 32-bit, H00 for only caring about + * the lower half, other values unused. 
*/ + enum bi_swizzle swizzle : 4; + uint32_t offset : 3; + enum bi_index_type type : 3; - /* Must be zeroed so we can hash the whole 64-bits at a time */ - unsigned padding : (32 - 13); + /* Must be zeroed so we can hash the whole 64-bits at a time */ + unsigned padding : (32 - 13); } bi_index; static inline bi_index bi_get_index(unsigned value) { - return (bi_index) { - .value = value, - .swizzle = BI_SWIZZLE_H01, - .type = BI_INDEX_NORMAL, - }; + return (bi_index){ + .value = value, + .swizzle = BI_SWIZZLE_H01, + .type = BI_INDEX_NORMAL, + }; } static inline bi_index bi_register(unsigned reg) { - assert(reg < 64); + assert(reg < 64); - return (bi_index) { - .value = reg, - .swizzle = BI_SWIZZLE_H01, - .type = BI_INDEX_REGISTER, - }; + return (bi_index){ + .value = reg, + .swizzle = BI_SWIZZLE_H01, + .type = BI_INDEX_REGISTER, + }; } static inline bi_index bi_imm_u32(uint32_t imm) { - return (bi_index) { - .value = imm, - .swizzle = BI_SWIZZLE_H01, - .type = BI_INDEX_CONSTANT, - }; + return (bi_index){ + .value = imm, + .swizzle = BI_SWIZZLE_H01, + .type = BI_INDEX_CONSTANT, + }; } static inline bi_index bi_imm_f32(float imm) { - return bi_imm_u32(fui(imm)); + return bi_imm_u32(fui(imm)); } static inline bi_index bi_null() { - return (bi_index) { .type = BI_INDEX_NULL }; + return (bi_index){.type = BI_INDEX_NULL}; } static inline bi_index bi_zero() { - return bi_imm_u32(0); + return bi_imm_u32(0); } static inline bi_index bi_passthrough(enum bifrost_packed_src value) { - return (bi_index) { - .value = value, - .swizzle = BI_SWIZZLE_H01, - .type = BI_INDEX_PASS, - }; + return (bi_index){ + .value = value, + .swizzle = BI_SWIZZLE_H01, + .type = BI_INDEX_PASS, + }; } /* Helps construct swizzles */ static inline bi_index bi_swz_16(bi_index idx, bool x, bool y) { - assert(idx.swizzle == BI_SWIZZLE_H01); - idx.swizzle = (enum bi_swizzle)(BI_SWIZZLE_H00 | (x << 1) | y); - return idx; + assert(idx.swizzle == BI_SWIZZLE_H01); + idx.swizzle = (enum bi_swizzle)(BI_SWIZZLE_H00 | (x << 1) | y); + return idx; } static inline bi_index bi_half(bi_index idx, bool upper) { - return bi_swz_16(idx, upper, upper); + return bi_swz_16(idx, upper, upper); } static inline bi_index bi_byte(bi_index idx, unsigned lane) { - assert(idx.swizzle == BI_SWIZZLE_H01); - assert(lane < 4); - idx.swizzle = (enum bi_swizzle)(BI_SWIZZLE_B0000 + lane); - return idx; + assert(idx.swizzle == BI_SWIZZLE_H01); + assert(lane < 4); + idx.swizzle = (enum bi_swizzle)(BI_SWIZZLE_B0000 + lane); + return idx; } static inline bi_index bi_abs(bi_index idx) { - idx.abs = true; - return idx; + idx.abs = true; + return idx; } static inline bi_index bi_neg(bi_index idx) { - idx.neg ^= true; - return idx; + idx.neg ^= true; + return idx; } static inline bi_index bi_discard(bi_index idx) { - idx.discard = true; - return idx; + idx.discard = true; + return idx; } /* Additive identity in IEEE 754 arithmetic */ static inline bi_index bi_negzero() { - return bi_neg(bi_zero()); + return bi_neg(bi_zero()); } /* Replaces an index, preserving any modifiers */ @@ -254,11 +267,11 @@ bi_negzero() static inline bi_index bi_replace_index(bi_index old, bi_index replacement) { - replacement.abs = old.abs; - replacement.neg = old.neg; - replacement.swizzle = old.swizzle; - replacement.discard = false; /* needs liveness analysis to set */ - return replacement; + replacement.abs = old.abs; + replacement.neg = old.neg; + replacement.swizzle = old.swizzle; + replacement.discard = false; /* needs liveness analysis to set */ + return replacement; } /* Remove any 
modifiers. This has the property: @@ -270,9 +283,9 @@ bi_replace_index(bi_index old, bi_index replacement) static inline bi_index bi_strip_index(bi_index index) { - index.abs = index.neg = false; - index.swizzle = BI_SWIZZLE_H01; - return index; + index.abs = index.neg = false; + index.swizzle = BI_SWIZZLE_H01; + return index; } /* For bitwise instructions */ @@ -281,40 +294,40 @@ bi_strip_index(bi_index index) static inline bi_index bi_imm_u8(uint8_t imm) { - return bi_byte(bi_imm_u32(imm), 0); + return bi_byte(bi_imm_u32(imm), 0); } static inline bi_index bi_imm_u16(uint16_t imm) { - return bi_half(bi_imm_u32(imm), false); + return bi_half(bi_imm_u32(imm), false); } static inline bi_index bi_imm_uintN(uint32_t imm, unsigned sz) { - assert(sz == 8 || sz == 16 || sz == 32); - return (sz == 8) ? bi_imm_u8(imm) : - (sz == 16) ? bi_imm_u16(imm) : - bi_imm_u32(imm); + assert(sz == 8 || sz == 16 || sz == 32); + return (sz == 8) ? bi_imm_u8(imm) + : (sz == 16) ? bi_imm_u16(imm) + : bi_imm_u32(imm); } static inline bi_index bi_imm_f16(float imm) { - return bi_imm_u16(_mesa_float_to_half(imm)); + return bi_imm_u16(_mesa_float_to_half(imm)); } static inline bool bi_is_null(bi_index idx) { - return idx.type == BI_INDEX_NULL; + return idx.type == BI_INDEX_NULL; } static inline bool bi_is_ssa(bi_index idx) { - return idx.type == BI_INDEX_NORMAL; + return idx.type == BI_INDEX_NORMAL; } /* Compares equivalence as references. Does not compare offsets, swizzles, or @@ -324,8 +337,7 @@ bi_is_ssa(bi_index idx) static inline bool bi_is_equiv(bi_index left, bi_index right) { - return (left.type == right.type) && - (left.value == right.value); + return (left.type == right.type) && (left.value == right.value); } /* A stronger equivalence relation that requires the indices access the @@ -335,7 +347,7 @@ bi_is_equiv(bi_index left, bi_index right) static inline bool bi_is_word_equiv(bi_index left, bi_index right) { - return bi_is_equiv(left, right) && left.offset == right.offset; + return bi_is_equiv(left, right) && left.offset == right.offset; } /* An even stronger equivalence that checks if indices correspond to the @@ -344,207 +356,203 @@ bi_is_word_equiv(bi_index left, bi_index right) static inline bool bi_is_value_equiv(bi_index left, bi_index right) { - if (left.type == BI_INDEX_CONSTANT && right.type == BI_INDEX_CONSTANT) { - return (bi_apply_swizzle(left.value, left.swizzle) == - bi_apply_swizzle(right.value, right.swizzle)) && - (left.abs == right.abs) && - (left.neg == right.neg); - } else { - return (left.value == right.value) && - (left.abs == right.abs) && - (left.neg == right.neg) && - (left.swizzle == right.swizzle) && - (left.offset == right.offset) && - (left.type == right.type); - } + if (left.type == BI_INDEX_CONSTANT && right.type == BI_INDEX_CONSTANT) { + return (bi_apply_swizzle(left.value, left.swizzle) == + bi_apply_swizzle(right.value, right.swizzle)) && + (left.abs == right.abs) && (left.neg == right.neg); + } else { + return (left.value == right.value) && (left.abs == right.abs) && + (left.neg == right.neg) && (left.swizzle == right.swizzle) && + (left.offset == right.offset) && (left.type == right.type); + } } -#define BI_MAX_VEC 8 +#define BI_MAX_VEC 8 #define BI_MAX_DESTS 4 -#define BI_MAX_SRCS 6 +#define BI_MAX_SRCS 6 typedef struct { - /* Must be first */ - struct list_head link; - bi_index *dest; - bi_index *src; + /* Must be first */ + struct list_head link; + bi_index *dest; + bi_index *src; - enum bi_opcode op; - uint8_t nr_srcs; - uint8_t nr_dests; + enum bi_opcode op; + 
uint8_t nr_srcs; + uint8_t nr_dests; - union { - /* For a branch */ - struct bi_block *branch_target; + union { + /* For a branch */ + struct bi_block *branch_target; - /* For a phi node that hasn't been translated yet. This is only - * used during NIR->BIR - */ - nir_phi_instr *phi; - }; + /* For a phi node that hasn't been translated yet. This is only + * used during NIR->BIR + */ + nir_phi_instr *phi; + }; - /* These don't fit neatly with anything else.. */ - enum bi_register_format register_format; - enum bi_vecsize vecsize; + /* These don't fit neatly with anything else.. */ + enum bi_register_format register_format; + enum bi_vecsize vecsize; - /* Flow control associated with a Valhall instruction */ - uint8_t flow; + /* Flow control associated with a Valhall instruction */ + uint8_t flow; - /* Slot associated with a message-passing instruction */ - uint8_t slot; + /* Slot associated with a message-passing instruction */ + uint8_t slot; - /* Can we spill the value written here? Used to prevent - * useless double fills */ - bool no_spill; + /* Can we spill the value written here? Used to prevent + * useless double fills */ + bool no_spill; - /* On Bifrost: A value of bi_table to override the table, inducing a - * DTSEL_IMM pair if nonzero. - * - * On Valhall: the table index to use for resource instructions. - * - * These two interpretations are equivalent if you squint a bit. - */ - unsigned table; + /* On Bifrost: A value of bi_table to override the table, inducing a + * DTSEL_IMM pair if nonzero. + * + * On Valhall: the table index to use for resource instructions. + * + * These two interpretations are equivalent if you squint a bit. + */ + unsigned table; - /* Everything after this MUST NOT be accessed directly, since - * interpretation depends on opcodes */ + /* Everything after this MUST NOT be accessed directly, since + * interpretation depends on opcodes */ - /* Destination modifiers */ - union { - enum bi_clamp clamp; - bool saturate; - bool not_result; - unsigned dest_mod; - }; + /* Destination modifiers */ + union { + enum bi_clamp clamp; + bool saturate; + bool not_result; + unsigned dest_mod; + }; - /* Immediates. All seen alone in an instruction, except for varying/texture - * which are specified jointly for VARTEX */ - union { - uint32_t shift; - uint32_t fill; - uint32_t index; - uint32_t attribute_index; + /* Immediates. 
All seen alone in an instruction, except for varying/texture + * which are specified jointly for VARTEX */ + union { + uint32_t shift; + uint32_t fill; + uint32_t index; + uint32_t attribute_index; - struct { - uint32_t varying_index; - uint32_t sampler_index; - uint32_t texture_index; - }; + struct { + uint32_t varying_index; + uint32_t sampler_index; + uint32_t texture_index; + }; - /* TEXC, ATOM_CX: # of staging registers used */ - struct { - uint32_t sr_count; - uint32_t sr_count_2; + /* TEXC, ATOM_CX: # of staging registers used */ + struct { + uint32_t sr_count; + uint32_t sr_count_2; - union { - /* Atomics effectively require all three */ - int32_t byte_offset; + union { + /* Atomics effectively require all three */ + int32_t byte_offset; - /* BLEND requires all three */ - int32_t branch_offset; - }; - }; - }; + /* BLEND requires all three */ + int32_t branch_offset; + }; + }; + }; - /* Modifiers specific to particular instructions are thrown in a union */ - union { - enum bi_adj adj; /* FEXP_TABLE.u4 */ - enum bi_atom_opc atom_opc; /* atomics */ - enum bi_func func; /* FPOW_SC_DET */ - enum bi_function function; /* LD_VAR_FLAT */ - enum bi_mux mux; /* MUX */ - enum bi_sem sem; /* FMAX, FMIN */ - enum bi_source source; /* LD_GCLK */ - bool scale; /* VN_ASST2, FSINCOS_OFFSET */ - bool offset; /* FSIN_TABLE, FOCS_TABLE */ - bool mask; /* CLZ */ - bool threads; /* IMULD, IMOV_FMA */ - bool combine; /* BRANCHC */ - bool format; /* LEA_TEX */ + /* Modifiers specific to particular instructions are thrown in a union */ + union { + enum bi_adj adj; /* FEXP_TABLE.u4 */ + enum bi_atom_opc atom_opc; /* atomics */ + enum bi_func func; /* FPOW_SC_DET */ + enum bi_function function; /* LD_VAR_FLAT */ + enum bi_mux mux; /* MUX */ + enum bi_sem sem; /* FMAX, FMIN */ + enum bi_source source; /* LD_GCLK */ + bool scale; /* VN_ASST2, FSINCOS_OFFSET */ + bool offset; /* FSIN_TABLE, FOCS_TABLE */ + bool mask; /* CLZ */ + bool threads; /* IMULD, IMOV_FMA */ + bool combine; /* BRANCHC */ + bool format; /* LEA_TEX */ - struct { - enum bi_special special; /* FADD_RSCALE, FMA_RSCALE */ - enum bi_round round; /* FMA, converts, FADD, _RSCALE, etc */ - bool ftz; /* Flush-to-zero for F16_TO_F32 */ - }; + struct { + enum bi_special special; /* FADD_RSCALE, FMA_RSCALE */ + enum bi_round round; /* FMA, converts, FADD, _RSCALE, etc */ + bool ftz; /* Flush-to-zero for F16_TO_F32 */ + }; - struct { - enum bi_result_type result_type; /* FCMP, ICMP */ - enum bi_cmpf cmpf; /* CSEL, FCMP, ICMP, BRANCH */ - }; + struct { + enum bi_result_type result_type; /* FCMP, ICMP */ + enum bi_cmpf cmpf; /* CSEL, FCMP, ICMP, BRANCH */ + }; - struct { - enum bi_stack_mode stack_mode; /* JUMP_EX */ - bool test_mode; - }; + struct { + enum bi_stack_mode stack_mode; /* JUMP_EX */ + bool test_mode; + }; - struct { - enum bi_seg seg; /* LOAD, STORE, SEG_ADD, SEG_SUB */ - bool preserve_null; /* SEG_ADD, SEG_SUB */ - enum bi_extend extend; /* LOAD, IMUL */ - }; + struct { + enum bi_seg seg; /* LOAD, STORE, SEG_ADD, SEG_SUB */ + bool preserve_null; /* SEG_ADD, SEG_SUB */ + enum bi_extend extend; /* LOAD, IMUL */ + }; - struct { - enum bi_sample sample; /* VAR_TEX, LD_VAR */ - enum bi_update update; /* VAR_TEX, LD_VAR */ - enum bi_varying_name varying_name; /* LD_VAR_SPECIAL */ - bool skip; /* VAR_TEX, TEXS, TEXC */ - bool lod_mode; /* VAR_TEX, TEXS, implicitly for TEXC */ - enum bi_source_format source_format; /* LD_VAR_BUF */ + struct { + enum bi_sample sample; /* VAR_TEX, LD_VAR */ + enum bi_update update; /* VAR_TEX, LD_VAR */ + enum 
bi_varying_name varying_name; /* LD_VAR_SPECIAL */ + bool skip; /* VAR_TEX, TEXS, TEXC */ + bool lod_mode; /* VAR_TEX, TEXS, implicitly for TEXC */ + enum bi_source_format source_format; /* LD_VAR_BUF */ - /* Used for valhall texturing */ - bool shadow; - bool texel_offset; - bool array_enable; - bool integer_coordinates; - enum bi_fetch_component fetch_component; - enum bi_va_lod_mode va_lod_mode; - enum bi_dimension dimension; - enum bi_write_mask write_mask; - }; + /* Used for valhall texturing */ + bool shadow; + bool texel_offset; + bool array_enable; + bool integer_coordinates; + enum bi_fetch_component fetch_component; + enum bi_va_lod_mode va_lod_mode; + enum bi_dimension dimension; + enum bi_write_mask write_mask; + }; - /* Maximum size, for hashing */ - unsigned flags[14]; + /* Maximum size, for hashing */ + unsigned flags[14]; - struct { - enum bi_subgroup subgroup; /* WMASK, CLPER */ - enum bi_inactive_result inactive_result; /* CLPER */ - enum bi_lane_op lane_op; /* CLPER */ - }; + struct { + enum bi_subgroup subgroup; /* WMASK, CLPER */ + enum bi_inactive_result inactive_result; /* CLPER */ + enum bi_lane_op lane_op; /* CLPER */ + }; - struct { - bool z; /* ZS_EMIT */ - bool stencil; /* ZS_EMIT */ - }; + struct { + bool z; /* ZS_EMIT */ + bool stencil; /* ZS_EMIT */ + }; - struct { - bool h; /* VN_ASST1.f16 */ - bool l; /* VN_ASST1.f16 */ - }; + struct { + bool h; /* VN_ASST1.f16 */ + bool l; /* VN_ASST1.f16 */ + }; - struct { - bool bytes2; /* RROT_DOUBLE, FRSHIFT_DOUBLE */ - bool result_word; - bool arithmetic; /* ARSHIFT_OR */ - }; + struct { + bool bytes2; /* RROT_DOUBLE, FRSHIFT_DOUBLE */ + bool result_word; + bool arithmetic; /* ARSHIFT_OR */ + }; - struct { - bool sqrt; /* FREXPM */ - bool log; /* FREXPM */ - }; + struct { + bool sqrt; /* FREXPM */ + bool log; /* FREXPM */ + }; - struct { - enum bi_mode mode; /* FLOG_TABLE */ - enum bi_precision precision; /* FLOG_TABLE */ - bool divzero; /* FRSQ_APPROX, FRSQ */ - }; - }; + struct { + enum bi_mode mode; /* FLOG_TABLE */ + enum bi_precision precision; /* FLOG_TABLE */ + bool divzero; /* FRSQ_APPROX, FRSQ */ + }; + }; } bi_instr; static inline bool bi_is_staging_src(const bi_instr *I, unsigned s) { - return (s == 0 || s == 4) && bi_opcode_props[I->op].sr_read; + return (s == 0 || s == 4) && bi_opcode_props[I->op].sr_read; } /* @@ -555,48 +563,48 @@ bi_is_staging_src(const bi_instr *I, unsigned s) static inline void bi_drop_dests(bi_instr *I, unsigned new_count) { - assert(new_count < I->nr_dests); + assert(new_count < I->nr_dests); - for (unsigned i = new_count; i < I->nr_dests; ++i) - I->dest[i] = bi_null(); + for (unsigned i = new_count; i < I->nr_dests; ++i) + I->dest[i] = bi_null(); - I->nr_dests = new_count; + I->nr_dests = new_count; } static inline void bi_drop_srcs(bi_instr *I, unsigned new_count) { - assert(new_count < I->nr_srcs); + assert(new_count < I->nr_srcs); - for (unsigned i = new_count; i < I->nr_srcs; ++i) - I->src[i] = bi_null(); + for (unsigned i = new_count; i < I->nr_srcs; ++i) + I->src[i] = bi_null(); - I->nr_srcs = new_count; + I->nr_srcs = new_count; } static inline void bi_replace_src(bi_instr *I, unsigned src_index, bi_index replacement) { - I->src[src_index] = bi_replace_index(I->src[src_index], replacement); + I->src[src_index] = bi_replace_index(I->src[src_index], replacement); } /* Represents the assignment of slots for a given bi_tuple */ typedef struct { - /* Register to assign to each slot */ - unsigned slot[4]; + /* Register to assign to each slot */ + unsigned slot[4]; - /* Read slots 
can be disabled */ - bool enabled[2]; + /* Read slots can be disabled */ + bool enabled[2]; - /* Configuration for slots 2/3 */ - struct bifrost_reg_ctrl_23 slot23; + /* Configuration for slots 2/3 */ + struct bifrost_reg_ctrl_23 slot23; - /* Fast-Access-Uniform RAM index */ - uint8_t fau_idx; + /* Fast-Access-Uniform RAM index */ + uint8_t fau_idx; - /* Whether writes are actually for the last instruction */ - bool first_instruction; + /* Whether writes are actually for the last instruction */ + bool first_instruction; } bi_registers; /* A bi_tuple contains two paired instruction pointers. If a slot is unfilled, @@ -605,307 +613,307 @@ typedef struct { */ typedef struct { - uint8_t fau_idx; - bi_registers regs; - bi_instr *fma; - bi_instr *add; + uint8_t fau_idx; + bi_registers regs; + bi_instr *fma; + bi_instr *add; } bi_tuple; struct bi_block; typedef struct { - struct list_head link; + struct list_head link; - /* Link back up for branch calculations */ - struct bi_block *block; + /* Link back up for branch calculations */ + struct bi_block *block; - /* Architectural limit of 8 tuples/clause */ - unsigned tuple_count; - bi_tuple tuples[8]; + /* Architectural limit of 8 tuples/clause */ + unsigned tuple_count; + bi_tuple tuples[8]; - /* For scoreboarding -- the clause ID (this is not globally unique!) - * and its dependencies in terms of other clauses, computed during - * scheduling and used when emitting code. Dependencies expressed as a - * bitfield matching the hardware, except shifted by a clause (the - * shift back to the ISA's off-by-one encoding is worked out when - * emitting clauses) */ - unsigned scoreboard_id; - uint8_t dependencies; + /* For scoreboarding -- the clause ID (this is not globally unique!) + * and its dependencies in terms of other clauses, computed during + * scheduling and used when emitting code. Dependencies expressed as a + * bitfield matching the hardware, except shifted by a clause (the + * shift back to the ISA's off-by-one encoding is worked out when + * emitting clauses) */ + unsigned scoreboard_id; + uint8_t dependencies; - /* See ISA header for description */ - enum bifrost_flow flow_control; + /* See ISA header for description */ + enum bifrost_flow flow_control; - /* Can we prefetch the next clause? Usually it makes sense, except for - * clauses ending in unconditional branches */ - bool next_clause_prefetch; + /* Can we prefetch the next clause? Usually it makes sense, except for + * clauses ending in unconditional branches */ + bool next_clause_prefetch; - /* Assigned data register */ - unsigned staging_register; + /* Assigned data register */ + unsigned staging_register; - /* Corresponds to the usual bit but shifted by a clause */ - bool staging_barrier; + /* Corresponds to the usual bit but shifted by a clause */ + bool staging_barrier; - /* Constants read by this clause. ISA limit. Must satisfy: - * - * constant_count + tuple_count <= 13 - * - * Also implicitly constant_count <= tuple_count since a tuple only - * reads a single constant. - */ - uint64_t constants[8]; - unsigned constant_count; + /* Constants read by this clause. ISA limit. Must satisfy: + * + * constant_count + tuple_count <= 13 + * + * Also implicitly constant_count <= tuple_count since a tuple only + * reads a single constant. 
+ */ + uint64_t constants[8]; + unsigned constant_count; - /* Index of a constant to be PC-relative */ - unsigned pcrel_idx; + /* Index of a constant to be PC-relative */ + unsigned pcrel_idx; - /* Branches encode a constant offset relative to the program counter - * with some magic flags. By convention, if there is a branch, its - * constant will be last. Set this flag to indicate this is required. - */ - bool branch_constant; + /* Branches encode a constant offset relative to the program counter + * with some magic flags. By convention, if there is a branch, its + * constant will be last. Set this flag to indicate this is required. + */ + bool branch_constant; - /* Unique in a clause */ - enum bifrost_message_type message_type; - bi_instr *message; + /* Unique in a clause */ + enum bifrost_message_type message_type; + bi_instr *message; - /* Discard helper threads */ - bool td; + /* Discard helper threads */ + bool td; - /* Should flush-to-zero mode be enabled for this clause? */ - bool ftz; + /* Should flush-to-zero mode be enabled for this clause? */ + bool ftz; } bi_clause; #define BI_NUM_SLOTS 8 /* A model for the state of the scoreboard */ struct bi_scoreboard_state { - /** Bitmap of registers read/written by a slot */ - uint64_t read[BI_NUM_SLOTS]; - uint64_t write[BI_NUM_SLOTS]; + /** Bitmap of registers read/written by a slot */ + uint64_t read[BI_NUM_SLOTS]; + uint64_t write[BI_NUM_SLOTS]; - /* Nonregister dependencies present by a slot */ - uint8_t varying : BI_NUM_SLOTS; - uint8_t memory : BI_NUM_SLOTS; + /* Nonregister dependencies present by a slot */ + uint8_t varying : BI_NUM_SLOTS; + uint8_t memory : BI_NUM_SLOTS; }; typedef struct bi_block { - /* Link to next block. Must be first for mir_get_block */ - struct list_head link; + /* Link to next block. Must be first for mir_get_block */ + struct list_head link; - /* List of instructions emitted for the current block */ - struct list_head instructions; + /* List of instructions emitted for the current block */ + struct list_head instructions; - /* Index of the block in source order */ - unsigned index; + /* Index of the block in source order */ + unsigned index; - /* Control flow graph */ - struct bi_block *successors[2]; - struct util_dynarray predecessors; - bool unconditional_jumps; + /* Control flow graph */ + struct bi_block *successors[2]; + struct util_dynarray predecessors; + bool unconditional_jumps; - /* Per 32-bit word live masks for the block indexed by node */ - uint8_t *live_in; - uint8_t *live_out; + /* Per 32-bit word live masks for the block indexed by node */ + uint8_t *live_in; + uint8_t *live_out; - /* Scalar liveness indexed by SSA index */ - BITSET_WORD *ssa_live_in; - BITSET_WORD *ssa_live_out; + /* Scalar liveness indexed by SSA index */ + BITSET_WORD *ssa_live_in; + BITSET_WORD *ssa_live_out; - /* If true, uses clauses; if false, uses instructions */ - bool scheduled; - struct list_head clauses; /* list of bi_clause */ + /* If true, uses clauses; if false, uses instructions */ + bool scheduled; + struct list_head clauses; /* list of bi_clause */ - /* Post-RA liveness */ - uint64_t reg_live_in, reg_live_out; + /* Post-RA liveness */ + uint64_t reg_live_in, reg_live_out; - /* Scoreboard state at the start/end of block */ - struct bi_scoreboard_state scoreboard_in, scoreboard_out; + /* Scoreboard state at the start/end of block */ + struct bi_scoreboard_state scoreboard_in, scoreboard_out; - /* On Valhall, indicates we need a terminal NOP to implement jumps to - * the end of the shader. 
- */ - bool needs_nop; + /* On Valhall, indicates we need a terminal NOP to implement jumps to + * the end of the shader. + */ + bool needs_nop; - /* Flags available for pass-internal use */ - uint8_t pass_flags; + /* Flags available for pass-internal use */ + uint8_t pass_flags; } bi_block; static inline unsigned bi_num_successors(bi_block *block) { - STATIC_ASSERT(ARRAY_SIZE(block->successors) == 2); - assert(block->successors[0] || !block->successors[1]); + STATIC_ASSERT(ARRAY_SIZE(block->successors) == 2); + assert(block->successors[0] || !block->successors[1]); - if (block->successors[1]) - return 2; - else if (block->successors[0]) - return 1; - else - return 0; + if (block->successors[1]) + return 2; + else if (block->successors[0]) + return 1; + else + return 0; } static inline unsigned bi_num_predecessors(bi_block *block) { - return util_dynarray_num_elements(&block->predecessors, bi_block *); + return util_dynarray_num_elements(&block->predecessors, bi_block *); } static inline bi_block * bi_start_block(struct list_head *blocks) { - bi_block *first = list_first_entry(blocks, bi_block, link); - assert(bi_num_predecessors(first) == 0); - return first; + bi_block *first = list_first_entry(blocks, bi_block, link); + assert(bi_num_predecessors(first) == 0); + return first; } static inline bi_block * bi_exit_block(struct list_head *blocks) { - bi_block *last = list_last_entry(blocks, bi_block, link); - assert(bi_num_successors(last) == 0); - return last; + bi_block *last = list_last_entry(blocks, bi_block, link); + assert(bi_num_successors(last) == 0); + return last; } static inline void bi_block_add_successor(bi_block *block, bi_block *successor) { - assert(block != NULL && successor != NULL); + assert(block != NULL && successor != NULL); - /* Cull impossible edges */ - if (block->unconditional_jumps) - return; + /* Cull impossible edges */ + if (block->unconditional_jumps) + return; - for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) { - if (block->successors[i]) { - if (block->successors[i] == successor) - return; - else - continue; - } + for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) { + if (block->successors[i]) { + if (block->successors[i] == successor) + return; + else + continue; + } - block->successors[i] = successor; - util_dynarray_append(&successor->predecessors, bi_block *, block); - return; - } + block->successors[i] = successor; + util_dynarray_append(&successor->predecessors, bi_block *, block); + return; + } - unreachable("Too many successors"); + unreachable("Too many successors"); } /* Subset of pan_shader_info needed per-variant, in order to support IDVS */ struct bi_shader_info { - struct panfrost_ubo_push *push; - struct bifrost_shader_info *bifrost; - struct panfrost_sysvals *sysvals; - unsigned tls_size; - unsigned work_reg_count; - unsigned push_offset; + struct panfrost_ubo_push *push; + struct bifrost_shader_info *bifrost; + struct panfrost_sysvals *sysvals; + unsigned tls_size; + unsigned work_reg_count; + unsigned push_offset; }; /* State of index-driven vertex shading for current shader */ enum bi_idvs_mode { - /* IDVS not in use */ - BI_IDVS_NONE = 0, + /* IDVS not in use */ + BI_IDVS_NONE = 0, - /* IDVS in use. Compiling a position shader */ - BI_IDVS_POSITION = 1, + /* IDVS in use. Compiling a position shader */ + BI_IDVS_POSITION = 1, - /* IDVS in use. Compiling a varying shader */ - BI_IDVS_VARYING = 2, + /* IDVS in use. 
Compiling a varying shader */ + BI_IDVS_VARYING = 2, }; typedef struct { - const struct panfrost_compile_inputs *inputs; - nir_shader *nir; - struct bi_shader_info info; - gl_shader_stage stage; - struct list_head blocks; /* list of bi_block */ - struct hash_table_u64 *sysval_to_id; - uint32_t quirks; - unsigned arch; - enum bi_idvs_mode idvs; - unsigned num_blocks; + const struct panfrost_compile_inputs *inputs; + nir_shader *nir; + struct bi_shader_info info; + gl_shader_stage stage; + struct list_head blocks; /* list of bi_block */ + struct hash_table_u64 *sysval_to_id; + uint32_t quirks; + unsigned arch; + enum bi_idvs_mode idvs; + unsigned num_blocks; - /* In any graphics shader, whether the "IDVS with memory - * allocation" flow is used. This affects how varyings are loaded and - * stored. Ignore for compute. - */ - bool malloc_idvs; + /* In any graphics shader, whether the "IDVS with memory + * allocation" flow is used. This affects how varyings are loaded and + * stored. Ignore for compute. + */ + bool malloc_idvs; - /* During NIR->BIR */ - bi_block *current_block; - bi_block *after_block; - bi_block *break_block; - bi_block *continue_block; - bi_block **indexed_nir_blocks; - bool emitted_atest; + /* During NIR->BIR */ + bi_block *current_block; + bi_block *after_block; + bi_block *break_block; + bi_block *continue_block; + bi_block **indexed_nir_blocks; + bool emitted_atest; - /* During NIR->BIR, the coverage bitmap. If this is NULL, the default - * coverage bitmap should be source from preloaded register r60. This is - * written by ATEST and ZS_EMIT - */ - bi_index coverage; + /* During NIR->BIR, the coverage bitmap. If this is NULL, the default + * coverage bitmap should be source from preloaded register r60. This is + * written by ATEST and ZS_EMIT + */ + bi_index coverage; - /* During NIR->BIR, table of preloaded registers, or NULL if never - * preloaded. - */ - bi_index preloaded[64]; + /* During NIR->BIR, table of preloaded registers, or NULL if never + * preloaded. + */ + bi_index preloaded[64]; - /* For creating temporaries */ - unsigned ssa_alloc; - unsigned reg_alloc; + /* For creating temporaries */ + unsigned ssa_alloc; + unsigned reg_alloc; - /* Mask of UBOs that need to be uploaded */ - uint32_t ubo_mask; + /* Mask of UBOs that need to be uploaded */ + uint32_t ubo_mask; - /* During instruction selection, map from vector bi_index to its scalar - * components, populated by a split. - */ - struct hash_table_u64 *allocated_vec; + /* During instruction selection, map from vector bi_index to its scalar + * components, populated by a split. 
+ */ + struct hash_table_u64 *allocated_vec; - /* Stats for shader-db */ - unsigned loop_count; - unsigned spills; - unsigned fills; + /* Stats for shader-db */ + unsigned loop_count; + unsigned spills; + unsigned fills; } bi_context; static inline void bi_remove_instruction(bi_instr *ins) { - list_del(&ins->link); + list_del(&ins->link); } enum bir_fau { - BIR_FAU_ZERO = 0, - BIR_FAU_LANE_ID = 1, - BIR_FAU_WARP_ID = 2, - BIR_FAU_CORE_ID = 3, - BIR_FAU_FB_EXTENT = 4, - BIR_FAU_ATEST_PARAM = 5, - BIR_FAU_SAMPLE_POS_ARRAY = 6, - BIR_FAU_BLEND_0 = 8, - /* blend descs 1 - 7 */ - BIR_FAU_TYPE_MASK = 15, + BIR_FAU_ZERO = 0, + BIR_FAU_LANE_ID = 1, + BIR_FAU_WARP_ID = 2, + BIR_FAU_CORE_ID = 3, + BIR_FAU_FB_EXTENT = 4, + BIR_FAU_ATEST_PARAM = 5, + BIR_FAU_SAMPLE_POS_ARRAY = 6, + BIR_FAU_BLEND_0 = 8, + /* blend descs 1 - 7 */ + BIR_FAU_TYPE_MASK = 15, - /* Valhall only */ - BIR_FAU_TLS_PTR = 16, - BIR_FAU_WLS_PTR = 17, - BIR_FAU_PROGRAM_COUNTER = 18, + /* Valhall only */ + BIR_FAU_TLS_PTR = 16, + BIR_FAU_WLS_PTR = 17, + BIR_FAU_PROGRAM_COUNTER = 18, - BIR_FAU_UNIFORM = (1 << 7), - /* Look up table on Valhall */ - BIR_FAU_IMMEDIATE = (1 << 8), + BIR_FAU_UNIFORM = (1 << 7), + /* Look up table on Valhall */ + BIR_FAU_IMMEDIATE = (1 << 8), }; static inline bi_index bi_fau(enum bir_fau value, bool hi) { - return (bi_index) { - .value = value, - .swizzle = BI_SWIZZLE_H01, - .offset = hi ? 1u : 0u, - .type = BI_INDEX_FAU, - }; + return (bi_index){ + .value = value, + .swizzle = BI_SWIZZLE_H01, + .offset = hi ? 1u : 0u, + .type = BI_INDEX_FAU, + }; } /* @@ -918,8 +926,7 @@ bi_fau(enum bir_fau value, bool hi) static inline bi_index va_lut(unsigned index) { - return bi_fau((enum bir_fau) (BIR_FAU_IMMEDIATE | (index >> 1)), - index & 1); + return bi_fau((enum bir_fau)(BIR_FAU_IMMEDIATE | (index >> 1)), index & 1); } /* @@ -930,13 +937,13 @@ va_lut(unsigned index) static inline bi_index va_zero_lut() { - return va_lut(0); + return va_lut(0); } static inline bi_index bi_temp(bi_context *ctx) { - return bi_get_index(ctx->ssa_alloc++); + return bi_get_index(ctx->ssa_alloc++); } /* Inline constants automatically, will be lowered out by bi_lower_fau where a @@ -946,113 +953,108 @@ bi_temp(bi_context *ctx) static inline bi_index bi_src_index(nir_src *src) { - if (nir_src_is_const(*src) && nir_src_bit_size(*src) <= 32) { - return bi_imm_u32(nir_src_as_uint(*src)); - } else { - assert(src->is_ssa); - return bi_get_index(src->ssa->index); - } + if (nir_src_is_const(*src) && nir_src_bit_size(*src) <= 32) { + return bi_imm_u32(nir_src_as_uint(*src)); + } else { + assert(src->is_ssa); + return bi_get_index(src->ssa->index); + } } static inline bi_index bi_dest_index(nir_dest *dst) { - assert(dst->is_ssa); - return bi_get_index(dst->ssa.index); + assert(dst->is_ssa); + return bi_get_index(dst->ssa.index); } /* Iterators for Bifrost IR */ -#define bi_foreach_block(ctx, v) \ - list_for_each_entry(bi_block, v, &ctx->blocks, link) +#define bi_foreach_block(ctx, v) \ + list_for_each_entry(bi_block, v, &ctx->blocks, link) -#define bi_foreach_block_rev(ctx, v) \ - list_for_each_entry_rev(bi_block, v, &ctx->blocks, link) +#define bi_foreach_block_rev(ctx, v) \ + list_for_each_entry_rev(bi_block, v, &ctx->blocks, link) -#define bi_foreach_block_from(ctx, from, v) \ - list_for_each_entry_from(bi_block, v, from, &ctx->blocks, link) +#define bi_foreach_block_from(ctx, from, v) \ + list_for_each_entry_from(bi_block, v, from, &ctx->blocks, link) -#define bi_foreach_block_from_rev(ctx, from, v) \ - list_for_each_entry_from_rev(bi_block, 
v, from, &ctx->blocks, link) +#define bi_foreach_block_from_rev(ctx, from, v) \ + list_for_each_entry_from_rev(bi_block, v, from, &ctx->blocks, link) -#define bi_foreach_instr_in_block(block, v) \ - list_for_each_entry(bi_instr, v, &(block)->instructions, link) +#define bi_foreach_instr_in_block(block, v) \ + list_for_each_entry(bi_instr, v, &(block)->instructions, link) -#define bi_foreach_instr_in_block_rev(block, v) \ - list_for_each_entry_rev(bi_instr, v, &(block)->instructions, link) +#define bi_foreach_instr_in_block_rev(block, v) \ + list_for_each_entry_rev(bi_instr, v, &(block)->instructions, link) -#define bi_foreach_instr_in_block_safe(block, v) \ - list_for_each_entry_safe(bi_instr, v, &(block)->instructions, link) +#define bi_foreach_instr_in_block_safe(block, v) \ + list_for_each_entry_safe(bi_instr, v, &(block)->instructions, link) -#define bi_foreach_instr_in_block_safe_rev(block, v) \ - list_for_each_entry_safe_rev(bi_instr, v, &(block)->instructions, link) +#define bi_foreach_instr_in_block_safe_rev(block, v) \ + list_for_each_entry_safe_rev(bi_instr, v, &(block)->instructions, link) -#define bi_foreach_instr_in_block_from(block, v, from) \ - list_for_each_entry_from(bi_instr, v, from, &(block)->instructions, link) +#define bi_foreach_instr_in_block_from(block, v, from) \ + list_for_each_entry_from(bi_instr, v, from, &(block)->instructions, link) -#define bi_foreach_instr_in_block_from_rev(block, v, from) \ - list_for_each_entry_from_rev(bi_instr, v, from, &(block)->instructions, link) +#define bi_foreach_instr_in_block_from_rev(block, v, from) \ + list_for_each_entry_from_rev(bi_instr, v, from, &(block)->instructions, link) -#define bi_foreach_clause_in_block(block, v) \ - list_for_each_entry(bi_clause, v, &(block)->clauses, link) +#define bi_foreach_clause_in_block(block, v) \ + list_for_each_entry(bi_clause, v, &(block)->clauses, link) -#define bi_foreach_clause_in_block_rev(block, v) \ - list_for_each_entry_rev(bi_clause, v, &(block)->clauses, link) +#define bi_foreach_clause_in_block_rev(block, v) \ + list_for_each_entry_rev(bi_clause, v, &(block)->clauses, link) -#define bi_foreach_clause_in_block_safe(block, v) \ - list_for_each_entry_safe(bi_clause, v, &(block)->clauses, link) +#define bi_foreach_clause_in_block_safe(block, v) \ + list_for_each_entry_safe(bi_clause, v, &(block)->clauses, link) -#define bi_foreach_clause_in_block_from(block, v, from) \ - list_for_each_entry_from(bi_clause, v, from, &(block)->clauses, link) +#define bi_foreach_clause_in_block_from(block, v, from) \ + list_for_each_entry_from(bi_clause, v, from, &(block)->clauses, link) -#define bi_foreach_clause_in_block_from_rev(block, v, from) \ - list_for_each_entry_from_rev(bi_clause, v, from, &(block)->clauses, link) +#define bi_foreach_clause_in_block_from_rev(block, v, from) \ + list_for_each_entry_from_rev(bi_clause, v, from, &(block)->clauses, link) -#define bi_foreach_instr_global(ctx, v) \ - bi_foreach_block(ctx, v_block) \ - bi_foreach_instr_in_block(v_block, v) +#define bi_foreach_instr_global(ctx, v) \ + bi_foreach_block(ctx, v_block) \ + bi_foreach_instr_in_block(v_block, v) -#define bi_foreach_instr_global_rev(ctx, v) \ - bi_foreach_block_rev(ctx, v_block) \ - bi_foreach_instr_in_block_rev(v_block, v) +#define bi_foreach_instr_global_rev(ctx, v) \ + bi_foreach_block_rev(ctx, v_block) \ + bi_foreach_instr_in_block_rev(v_block, v) -#define bi_foreach_instr_global_safe(ctx, v) \ - bi_foreach_block(ctx, v_block) \ - bi_foreach_instr_in_block_safe(v_block, v) +#define 
bi_foreach_instr_global_safe(ctx, v) \ + bi_foreach_block(ctx, v_block) \ + bi_foreach_instr_in_block_safe(v_block, v) -#define bi_foreach_instr_global_rev_safe(ctx, v) \ - bi_foreach_block_rev(ctx, v_block) \ - bi_foreach_instr_in_block_rev_safe(v_block, v) +#define bi_foreach_instr_global_rev_safe(ctx, v) \ + bi_foreach_block_rev(ctx, v_block) \ + bi_foreach_instr_in_block_rev_safe(v_block, v) -#define bi_foreach_instr_in_tuple(tuple, v) \ - for (bi_instr *v = (tuple)->fma ?: (tuple)->add; \ - v != NULL; \ - v = (v == (tuple)->add) ? NULL : (tuple)->add) +#define bi_foreach_instr_in_tuple(tuple, v) \ + for (bi_instr *v = (tuple)->fma ?: (tuple)->add; v != NULL; \ + v = (v == (tuple)->add) ? NULL : (tuple)->add) -#define bi_foreach_successor(blk, v) \ - bi_block *v; \ - bi_block **_v; \ - for (_v = &blk->successors[0], \ - v = *_v; \ - v != NULL && _v < &blk->successors[2]; \ - _v++, v = *_v) \ +#define bi_foreach_successor(blk, v) \ + bi_block *v; \ + bi_block **_v; \ + for (_v = &blk->successors[0], v = *_v; \ + v != NULL && _v < &blk->successors[2]; _v++, v = *_v) -#define bi_foreach_predecessor(blk, v) \ - util_dynarray_foreach(&(blk)->predecessors, bi_block *, v) +#define bi_foreach_predecessor(blk, v) \ + util_dynarray_foreach(&(blk)->predecessors, bi_block *, v) -#define bi_foreach_src(ins, v) \ - for (unsigned v = 0; v < ins->nr_srcs; ++v) +#define bi_foreach_src(ins, v) for (unsigned v = 0; v < ins->nr_srcs; ++v) -#define bi_foreach_dest(ins, v) \ - for (unsigned v = 0; v < ins->nr_dests; ++v) +#define bi_foreach_dest(ins, v) for (unsigned v = 0; v < ins->nr_dests; ++v) -#define bi_foreach_ssa_src(ins, v) \ - for (unsigned v = 0; v < ins->nr_srcs; ++v) \ - if (ins->src[v].type == BI_INDEX_NORMAL) +#define bi_foreach_ssa_src(ins, v) \ + for (unsigned v = 0; v < ins->nr_srcs; ++v) \ + if (ins->src[v].type == BI_INDEX_NORMAL) -#define bi_foreach_instr_and_src_in_tuple(tuple, ins, s) \ - bi_foreach_instr_in_tuple(tuple, ins) \ - bi_foreach_src(ins, s) +#define bi_foreach_instr_and_src_in_tuple(tuple, ins, s) \ + bi_foreach_instr_in_tuple(tuple, ins) \ + bi_foreach_src(ins, s) /* * Find the index of a predecessor, used as the implicit order of phi sources. 
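/*
 * Editorial illustration, not part of this diff: a minimal sketch of how the
 * reindented iterator macros above are typically used by an IR pass. The
 * helper name bi_count_ssa_srcs is hypothetical and assumed only for this
 * example; bi_foreach_instr_global and bi_foreach_ssa_src are the macros
 * defined in the hunk above, and the sketch assumes it sits in code that
 * already includes the Bifrost compiler.h context shown here.
 */
static unsigned
bi_count_ssa_srcs(bi_context *ctx)
{
   unsigned count = 0;

   /* Walk every instruction of every block, then each SSA (NORMAL) source */
   bi_foreach_instr_global(ctx, I) {
      bi_foreach_ssa_src(I, s)
         count++;
   }

   return count;
}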
@@ -1060,39 +1062,40 @@ bi_dest_index(nir_dest *dst) static inline unsigned bi_predecessor_index(bi_block *succ, bi_block *pred) { - unsigned index = 0; + unsigned index = 0; - bi_foreach_predecessor(succ, x) { - if (*x == pred) return index; + bi_foreach_predecessor(succ, x) { + if (*x == pred) + return index; - index++; - } + index++; + } - unreachable("Invalid predecessor"); + unreachable("Invalid predecessor"); } static inline bi_instr * bi_prev_op(bi_instr *ins) { - return list_last_entry(&(ins->link), bi_instr, link); + return list_last_entry(&(ins->link), bi_instr, link); } static inline bi_instr * bi_next_op(bi_instr *ins) { - return list_first_entry(&(ins->link), bi_instr, link); + return list_first_entry(&(ins->link), bi_instr, link); } static inline bi_block * bi_next_block(bi_block *block) { - return list_first_entry(&(block->link), bi_block, link); + return list_first_entry(&(block->link), bi_block, link); } static inline bi_block * bi_entry_block(bi_context *ctx) { - return list_first_entry(&ctx->blocks, bi_block, link); + return list_first_entry(&ctx->blocks, bi_block, link); } /* BIR manipulation */ @@ -1102,7 +1105,7 @@ unsigned bi_count_read_registers(const bi_instr *ins, unsigned src); unsigned bi_count_write_registers(const bi_instr *ins, unsigned dest); bool bi_is_regfmt_16(enum bi_register_format fmt); unsigned bi_writemask(const bi_instr *ins, unsigned dest); -bi_clause * bi_next_clause(bi_context *ctx, bi_block *block, bi_clause *clause); +bi_clause *bi_next_clause(bi_context *ctx, bi_block *block, bi_clause *clause); bool bi_side_effects(const bi_instr *I); bool bi_reconverge_branches(bi_block *block); @@ -1155,8 +1158,16 @@ bool bi_reads_t(bi_instr *ins, unsigned src); bool bi_validate_initialization(bi_context *ctx); void bi_validate(bi_context *ctx, const char *after_str); #else -static inline bool bi_validate_initialization(UNUSED bi_context *ctx) { return true; } -static inline void bi_validate(UNUSED bi_context *ctx, UNUSED const char *after_str) { return; } +static inline bool +bi_validate_initialization(UNUSED bi_context *ctx) +{ + return true; +} +static inline void +bi_validate(UNUSED bi_context *ctx, UNUSED const char *after_str) +{ + return; +} #endif uint32_t bi_fold_constant(bi_instr *I, bool *unsupported); @@ -1181,10 +1192,9 @@ bool bi_ec0_packed(unsigned tuple_count); static inline bool bi_is_terminal_block(bi_block *block) { - return (block == NULL) || - (list_is_empty(&block->instructions) && - bi_is_terminal_block(block->successors[0]) && - bi_is_terminal_block(block->successors[1])); + return (block == NULL) || (list_is_empty(&block->instructions) && + bi_is_terminal_block(block->successors[0]) && + bi_is_terminal_block(block->successors[1])); } /* Code emit */ @@ -1194,124 +1204,102 @@ unsigned bi_pack(bi_context *ctx, struct util_dynarray *emission); void bi_pack_valhall(bi_context *ctx, struct util_dynarray *emission); struct bi_packed_tuple { - uint64_t lo; - uint64_t hi; + uint64_t lo; + uint64_t hi; }; uint8_t bi_pack_literal(enum bi_clause_subword literal); -uint8_t -bi_pack_upper(enum bi_clause_subword upper, - struct bi_packed_tuple *tuples, - ASSERTED unsigned tuple_count); -uint64_t -bi_pack_tuple_bits(enum bi_clause_subword idx, - struct bi_packed_tuple *tuples, - ASSERTED unsigned tuple_count, - unsigned offset, unsigned nbits); +uint8_t bi_pack_upper(enum bi_clause_subword upper, + struct bi_packed_tuple *tuples, + ASSERTED unsigned tuple_count); +uint64_t bi_pack_tuple_bits(enum bi_clause_subword idx, + struct bi_packed_tuple 
*tuples, + ASSERTED unsigned tuple_count, unsigned offset, + unsigned nbits); -uint8_t -bi_pack_sync(enum bi_clause_subword t1, - enum bi_clause_subword t2, - enum bi_clause_subword t3, - struct bi_packed_tuple *tuples, - ASSERTED unsigned tuple_count, - bool z); +uint8_t bi_pack_sync(enum bi_clause_subword t1, enum bi_clause_subword t2, + enum bi_clause_subword t3, struct bi_packed_tuple *tuples, + ASSERTED unsigned tuple_count, bool z); -void -bi_pack_format(struct util_dynarray *emission, - unsigned index, - struct bi_packed_tuple *tuples, - ASSERTED unsigned tuple_count, - uint64_t header, uint64_t ec0, - unsigned m0, bool z); +void bi_pack_format(struct util_dynarray *emission, unsigned index, + struct bi_packed_tuple *tuples, + ASSERTED unsigned tuple_count, uint64_t header, + uint64_t ec0, unsigned m0, bool z); -unsigned bi_pack_fma(bi_instr *I, - enum bifrost_packed_src src0, - enum bifrost_packed_src src1, - enum bifrost_packed_src src2, - enum bifrost_packed_src src3); -unsigned bi_pack_add(bi_instr *I, - enum bifrost_packed_src src0, - enum bifrost_packed_src src1, - enum bifrost_packed_src src2, - enum bifrost_packed_src src3); +unsigned bi_pack_fma(bi_instr *I, enum bifrost_packed_src src0, + enum bifrost_packed_src src1, enum bifrost_packed_src src2, + enum bifrost_packed_src src3); +unsigned bi_pack_add(bi_instr *I, enum bifrost_packed_src src0, + enum bifrost_packed_src src1, enum bifrost_packed_src src2, + enum bifrost_packed_src src3); /* Like in NIR, for use with the builder */ enum bi_cursor_option { - bi_cursor_after_block, - bi_cursor_before_instr, - bi_cursor_after_instr + bi_cursor_after_block, + bi_cursor_before_instr, + bi_cursor_after_instr }; typedef struct { - enum bi_cursor_option option; + enum bi_cursor_option option; - union { - bi_block *block; - bi_instr *instr; - }; + union { + bi_block *block; + bi_instr *instr; + }; } bi_cursor; static inline bi_cursor bi_after_block(bi_block *block) { - return (bi_cursor) { - .option = bi_cursor_after_block, - .block = block - }; + return (bi_cursor){.option = bi_cursor_after_block, .block = block}; } static inline bi_cursor bi_before_instr(bi_instr *instr) { - return (bi_cursor) { - .option = bi_cursor_before_instr, - .instr = instr - }; + return (bi_cursor){.option = bi_cursor_before_instr, .instr = instr}; } static inline bi_cursor bi_after_instr(bi_instr *instr) { - return (bi_cursor) { - .option = bi_cursor_after_instr, - .instr = instr - }; + return (bi_cursor){.option = bi_cursor_after_instr, .instr = instr}; } static inline bi_cursor bi_after_block_logical(bi_block *block) { - if (list_is_empty(&block->instructions)) - return bi_after_block(block); + if (list_is_empty(&block->instructions)) + return bi_after_block(block); - bi_instr *last = list_last_entry(&block->instructions, bi_instr, link); - assert(last != NULL); + bi_instr *last = list_last_entry(&block->instructions, bi_instr, link); + assert(last != NULL); - if (last->branch_target) - return bi_before_instr(last); - else - return bi_after_block(block); + if (last->branch_target) + return bi_before_instr(last); + else + return bi_after_block(block); } static inline bi_cursor bi_before_nonempty_block(bi_block *block) { - bi_instr *I = list_first_entry(&block->instructions, bi_instr, link); - assert(I != NULL); + bi_instr *I = list_first_entry(&block->instructions, bi_instr, link); + assert(I != NULL); - return bi_before_instr(I); + return bi_before_instr(I); } static inline bi_cursor bi_before_block(bi_block *block) { - if 
(list_is_empty(&block->instructions)) - return bi_after_block(block); - else - return bi_before_nonempty_block(block); + if (list_is_empty(&block->instructions)) + return bi_after_block(block); + else + return bi_before_nonempty_block(block); } /* Invariant: a tuple must be nonempty UNLESS it is the last tuple of a clause, @@ -1320,80 +1308,79 @@ bi_before_block(bi_block *block) ATTRIBUTE_RETURNS_NONNULL static inline bi_instr * bi_first_instr_in_tuple(bi_tuple *tuple) { - bi_instr *instr = tuple->fma ?: tuple->add; - assert(instr != NULL); - return instr; + bi_instr *instr = tuple->fma ?: tuple->add; + assert(instr != NULL); + return instr; } ATTRIBUTE_RETURNS_NONNULL static inline bi_instr * bi_first_instr_in_clause(bi_clause *clause) { - return bi_first_instr_in_tuple(&clause->tuples[0]); + return bi_first_instr_in_tuple(&clause->tuples[0]); } ATTRIBUTE_RETURNS_NONNULL static inline bi_instr * bi_last_instr_in_clause(bi_clause *clause) { - bi_tuple tuple = clause->tuples[clause->tuple_count - 1]; - bi_instr *instr = tuple.add ?: tuple.fma; + bi_tuple tuple = clause->tuples[clause->tuple_count - 1]; + bi_instr *instr = tuple.add ?: tuple.fma; - if (!instr) { - assert(clause->tuple_count >= 2); - tuple = clause->tuples[clause->tuple_count - 2]; - instr = tuple.add ?: tuple.fma; - } + if (!instr) { + assert(clause->tuple_count >= 2); + tuple = clause->tuples[clause->tuple_count - 2]; + instr = tuple.add ?: tuple.fma; + } - assert(instr != NULL); - return instr; + assert(instr != NULL); + return instr; } /* Implemented by expanding bi_foreach_instr_in_block_from(_rev) with the start * (end) of the clause and adding a condition for the clause boundary */ -#define bi_foreach_instr_in_clause(block, clause, pos) \ - for (bi_instr *pos = list_entry(bi_first_instr_in_clause(clause), bi_instr, link); \ - (&pos->link != &(block)->instructions) \ - && (pos != bi_next_op(bi_last_instr_in_clause(clause))); \ - pos = list_entry(pos->link.next, bi_instr, link)) +#define bi_foreach_instr_in_clause(block, clause, pos) \ + for (bi_instr *pos = \ + list_entry(bi_first_instr_in_clause(clause), bi_instr, link); \ + (&pos->link != &(block)->instructions) && \ + (pos != bi_next_op(bi_last_instr_in_clause(clause))); \ + pos = list_entry(pos->link.next, bi_instr, link)) -#define bi_foreach_instr_in_clause_rev(block, clause, pos) \ - for (bi_instr *pos = list_entry(bi_last_instr_in_clause(clause), bi_instr, link); \ - (&pos->link != &(block)->instructions) \ - && pos != bi_prev_op(bi_first_instr_in_clause(clause)); \ - pos = list_entry(pos->link.prev, bi_instr, link)) +#define bi_foreach_instr_in_clause_rev(block, clause, pos) \ + for (bi_instr *pos = \ + list_entry(bi_last_instr_in_clause(clause), bi_instr, link); \ + (&pos->link != &(block)->instructions) && \ + pos != bi_prev_op(bi_first_instr_in_clause(clause)); \ + pos = list_entry(pos->link.prev, bi_instr, link)) static inline bi_cursor bi_before_clause(bi_clause *clause) { - return bi_before_instr(bi_first_instr_in_clause(clause)); + return bi_before_instr(bi_first_instr_in_clause(clause)); } static inline bi_cursor bi_before_tuple(bi_tuple *tuple) { - return bi_before_instr(bi_first_instr_in_tuple(tuple)); + return bi_before_instr(bi_first_instr_in_tuple(tuple)); } static inline bi_cursor bi_after_clause(bi_clause *clause) { - return bi_after_instr(bi_last_instr_in_clause(clause)); + return bi_after_instr(bi_last_instr_in_clause(clause)); } /* IR builder in terms of cursor infrastructure */ typedef struct { - bi_context *shader; - bi_cursor cursor; + 
bi_context *shader; + bi_cursor cursor; } bi_builder; static inline bi_builder bi_init_builder(bi_context *ctx, bi_cursor cursor) { - return (bi_builder) { - .shader = ctx, - .cursor = cursor - }; + return (bi_builder){.shader = ctx, .cursor = cursor}; } /* Insert an instruction at the cursor and move the cursor */ @@ -1401,26 +1388,26 @@ bi_init_builder(bi_context *ctx, bi_cursor cursor) static inline void bi_builder_insert(bi_cursor *cursor, bi_instr *I) { - switch (cursor->option) { - case bi_cursor_after_instr: - list_add(&I->link, &cursor->instr->link); - cursor->instr = I; - return; + switch (cursor->option) { + case bi_cursor_after_instr: + list_add(&I->link, &cursor->instr->link); + cursor->instr = I; + return; - case bi_cursor_after_block: - list_addtail(&I->link, &cursor->block->instructions); - cursor->option = bi_cursor_after_instr; - cursor->instr = I; - return; + case bi_cursor_after_block: + list_addtail(&I->link, &cursor->block->instructions); + cursor->option = bi_cursor_after_instr; + cursor->instr = I; + return; - case bi_cursor_before_instr: - list_addtail(&I->link, &cursor->instr->link); - cursor->option = bi_cursor_after_instr; - cursor->instr = I; - return; - } + case bi_cursor_before_instr: + list_addtail(&I->link, &cursor->instr->link); + cursor->option = bi_cursor_after_instr; + cursor->instr = I; + return; + } - unreachable("Invalid cursor option"); + unreachable("Invalid cursor option"); } bi_instr *bi_csel_from_mux(bi_builder *b, const bi_instr *I, bool must_sign); @@ -1429,19 +1416,19 @@ bi_instr *bi_csel_from_mux(bi_builder *b, const bi_instr *I, bool must_sign); static inline bi_index bi_dontcare(bi_builder *b) { - if (b->shader->arch >= 9) - return bi_zero(); - else - return bi_passthrough(BIFROST_SRC_FAU_HI); + if (b->shader->arch >= 9) + return bi_zero(); + else + return bi_passthrough(BIFROST_SRC_FAU_HI); } -#define bi_worklist_init(ctx, w) u_worklist_init(w, ctx->num_blocks, ctx) +#define bi_worklist_init(ctx, w) u_worklist_init(w, ctx->num_blocks, ctx) #define bi_worklist_push_head(w, block) u_worklist_push_head(w, block, index) #define bi_worklist_push_tail(w, block) u_worklist_push_tail(w, block, index) -#define bi_worklist_peek_head(w) u_worklist_peek_head(w, bi_block, index) -#define bi_worklist_pop_head(w) u_worklist_pop_head( w, bi_block, index) -#define bi_worklist_peek_tail(w) u_worklist_peek_tail(w, bi_block, index) -#define bi_worklist_pop_tail(w) u_worklist_pop_tail( w, bi_block, index) +#define bi_worklist_peek_head(w) u_worklist_peek_head(w, bi_block, index) +#define bi_worklist_pop_head(w) u_worklist_pop_head(w, bi_block, index) +#define bi_worklist_peek_tail(w) u_worklist_peek_tail(w, bi_block, index) +#define bi_worklist_pop_tail(w) u_worklist_pop_tail(w, bi_block, index) /* NIR passes */ diff --git a/src/panfrost/bifrost/disassemble.c b/src/panfrost/bifrost/disassemble.c index 1bc98e40596..5a3791efad0 100644 --- a/src/panfrost/bifrost/disassemble.c +++ b/src/panfrost/bifrost/disassemble.c @@ -23,268 +23,276 @@ * SOFTWARE. 
*/ -#include -#include -#include #include #include +#include +#include +#include #include -#include "bifrost.h" -#include "disassemble.h" -#include "bi_print_common.h" #include "util/compiler.h" #include "util/macros.h" +#include "bi_print_common.h" +#include "bifrost.h" +#include "disassemble.h" // return bits (high, lo] -static uint64_t bits(uint32_t word, unsigned lo, unsigned high) +static uint64_t +bits(uint32_t word, unsigned lo, unsigned high) { - if (high == 32) - return word >> lo; - return (word & ((1 << high) - 1)) >> lo; + if (high == 32) + return word >> lo; + return (word & ((1 << high) - 1)) >> lo; } // each of these structs represents an instruction that's dispatched in one // cycle. Note that these instructions are packed in funny ways within the // clause, hence the need for a separate struct. struct bifrost_alu_inst { - uint32_t fma_bits; - uint32_t add_bits; - uint64_t reg_bits; + uint32_t fma_bits; + uint32_t add_bits; + uint64_t reg_bits; }; -static unsigned get_reg0(struct bifrost_regs regs) +static unsigned +get_reg0(struct bifrost_regs regs) { - if (regs.ctrl == 0) - return regs.reg0 | ((regs.reg1 & 0x1) << 5); + if (regs.ctrl == 0) + return regs.reg0 | ((regs.reg1 & 0x1) << 5); - return regs.reg0 <= regs.reg1 ? regs.reg0 : 63 - regs.reg0; + return regs.reg0 <= regs.reg1 ? regs.reg0 : 63 - regs.reg0; } -static unsigned get_reg1(struct bifrost_regs regs) +static unsigned +get_reg1(struct bifrost_regs regs) { - return regs.reg0 <= regs.reg1 ? regs.reg1 : 63 - regs.reg1; + return regs.reg0 <= regs.reg1 ? regs.reg1 : 63 - regs.reg1; } // this represents the decoded version of the ctrl register field. struct bifrost_reg_ctrl { - bool read_reg0; - bool read_reg1; - struct bifrost_reg_ctrl_23 slot23; + bool read_reg0; + bool read_reg1; + struct bifrost_reg_ctrl_23 slot23; }; -static void dump_header(FILE *fp, struct bifrost_header header, bool verbose) +static void +dump_header(FILE *fp, struct bifrost_header header, bool verbose) { - fprintf(fp, "ds(%u) ", header.dependency_slot); + fprintf(fp, "ds(%u) ", header.dependency_slot); - if (header.staging_barrier) - fprintf(fp, "osrb "); + if (header.staging_barrier) + fprintf(fp, "osrb "); - fprintf(fp, "%s ", bi_flow_control_name(header.flow_control)); + fprintf(fp, "%s ", bi_flow_control_name(header.flow_control)); - if (header.suppress_inf) - fprintf(fp, "inf_suppress "); - if (header.suppress_nan) - fprintf(fp, "nan_suppress "); + if (header.suppress_inf) + fprintf(fp, "inf_suppress "); + if (header.suppress_nan) + fprintf(fp, "nan_suppress "); - if (header.flush_to_zero == BIFROST_FTZ_DX11) - fprintf(fp, "ftz_dx11 "); - else if (header.flush_to_zero == BIFROST_FTZ_ALWAYS) - fprintf(fp, "ftz_hsa "); - if (header.flush_to_zero == BIFROST_FTZ_ABRUPT) - fprintf(fp, "ftz_au "); + if (header.flush_to_zero == BIFROST_FTZ_DX11) + fprintf(fp, "ftz_dx11 "); + else if (header.flush_to_zero == BIFROST_FTZ_ALWAYS) + fprintf(fp, "ftz_hsa "); + if (header.flush_to_zero == BIFROST_FTZ_ABRUPT) + fprintf(fp, "ftz_au "); - assert(!header.zero1); - assert(!header.zero2); + assert(!header.zero1); + assert(!header.zero2); - if (header.float_exceptions == BIFROST_EXCEPTIONS_DISABLED) - fprintf(fp, "fpe_ts "); - else if (header.float_exceptions == BIFROST_EXCEPTIONS_PRECISE_DIVISION) - fprintf(fp, "fpe_pd "); - else if (header.float_exceptions == BIFROST_EXCEPTIONS_PRECISE_SQRT) - fprintf(fp, "fpe_psqr "); + if (header.float_exceptions == BIFROST_EXCEPTIONS_DISABLED) + fprintf(fp, "fpe_ts "); + else if (header.float_exceptions == 
BIFROST_EXCEPTIONS_PRECISE_DIVISION) + fprintf(fp, "fpe_pd "); + else if (header.float_exceptions == BIFROST_EXCEPTIONS_PRECISE_SQRT) + fprintf(fp, "fpe_psqr "); - if (header.message_type) - fprintf(fp, "%s ", bi_message_type_name(header.message_type)); + if (header.message_type) + fprintf(fp, "%s ", bi_message_type_name(header.message_type)); - if (header.terminate_discarded_threads) - fprintf(fp, "td "); + if (header.terminate_discarded_threads) + fprintf(fp, "td "); - if (header.next_clause_prefetch) - fprintf(fp, "ncph "); + if (header.next_clause_prefetch) + fprintf(fp, "ncph "); - if (header.next_message_type) - fprintf(fp, "next_%s ", bi_message_type_name(header.next_message_type)); - if (header.dependency_wait != 0) { - fprintf(fp, "dwb("); - bool first = true; - for (unsigned i = 0; i < 8; i++) { - if (header.dependency_wait & (1 << i)) { - if (!first) { - fprintf(fp, ", "); - } - fprintf(fp, "%u", i); - first = false; - } - } - fprintf(fp, ") "); - } + if (header.next_message_type) + fprintf(fp, "next_%s ", bi_message_type_name(header.next_message_type)); + if (header.dependency_wait != 0) { + fprintf(fp, "dwb("); + bool first = true; + for (unsigned i = 0; i < 8; i++) { + if (header.dependency_wait & (1 << i)) { + if (!first) { + fprintf(fp, ", "); + } + fprintf(fp, "%u", i); + first = false; + } + } + fprintf(fp, ") "); + } - fprintf(fp, "\n"); + fprintf(fp, "\n"); } -static struct bifrost_reg_ctrl DecodeRegCtrl(FILE *fp, struct bifrost_regs regs, bool first) +static struct bifrost_reg_ctrl +DecodeRegCtrl(FILE *fp, struct bifrost_regs regs, bool first) { - struct bifrost_reg_ctrl decoded = {}; - unsigned ctrl; - if (regs.ctrl == 0) { - ctrl = regs.reg1 >> 2; - decoded.read_reg0 = !(regs.reg1 & 0x2); - decoded.read_reg1 = false; - } else { - ctrl = regs.ctrl; - decoded.read_reg0 = decoded.read_reg1 = true; - } + struct bifrost_reg_ctrl decoded = {}; + unsigned ctrl; + if (regs.ctrl == 0) { + ctrl = regs.reg1 >> 2; + decoded.read_reg0 = !(regs.reg1 & 0x2); + decoded.read_reg1 = false; + } else { + ctrl = regs.ctrl; + decoded.read_reg0 = decoded.read_reg1 = true; + } - /* Modify control based on state */ - if (first) - ctrl = (ctrl & 0x7) | ((ctrl & 0x8) << 1); - else if (regs.reg2 == regs.reg3) - ctrl += 16; + /* Modify control based on state */ + if (first) + ctrl = (ctrl & 0x7) | ((ctrl & 0x8) << 1); + else if (regs.reg2 == regs.reg3) + ctrl += 16; - decoded.slot23 = bifrost_reg_ctrl_lut[ctrl]; - ASSERTED struct bifrost_reg_ctrl_23 reserved = { 0 }; - assert(memcmp(&decoded.slot23, &reserved, sizeof(reserved))); + decoded.slot23 = bifrost_reg_ctrl_lut[ctrl]; + ASSERTED struct bifrost_reg_ctrl_23 reserved = {0}; + assert(memcmp(&decoded.slot23, &reserved, sizeof(reserved))); - return decoded; + return decoded; } -static void dump_regs(FILE *fp, struct bifrost_regs srcs, bool first) +static void +dump_regs(FILE *fp, struct bifrost_regs srcs, bool first) { - struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(fp, srcs, first); - fprintf(fp, " # "); - if (ctrl.read_reg0) - fprintf(fp, "slot 0: r%u ", get_reg0(srcs)); - if (ctrl.read_reg1) - fprintf(fp, "slot 1: r%u ", get_reg1(srcs)); + struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(fp, srcs, first); + fprintf(fp, " # "); + if (ctrl.read_reg0) + fprintf(fp, "slot 0: r%u ", get_reg0(srcs)); + if (ctrl.read_reg1) + fprintf(fp, "slot 1: r%u ", get_reg1(srcs)); - const char *slot3_fma = ctrl.slot23.slot3_fma ? "FMA" : "ADD"; + const char *slot3_fma = ctrl.slot23.slot3_fma ? 
"FMA" : "ADD"; - if (ctrl.slot23.slot2 == BIFROST_OP_WRITE) - fprintf(fp, "slot 2: r%u (write FMA) ", srcs.reg2); - else if (ctrl.slot23.slot2 == BIFROST_OP_WRITE_LO) - fprintf(fp, "slot 2: r%u (write lo FMA) ", srcs.reg2); - else if (ctrl.slot23.slot2 == BIFROST_OP_WRITE_HI) - fprintf(fp, "slot 2: r%u (write hi FMA) ", srcs.reg2); - else if (ctrl.slot23.slot2 == BIFROST_OP_READ) - fprintf(fp, "slot 2: r%u (read) ", srcs.reg2); + if (ctrl.slot23.slot2 == BIFROST_OP_WRITE) + fprintf(fp, "slot 2: r%u (write FMA) ", srcs.reg2); + else if (ctrl.slot23.slot2 == BIFROST_OP_WRITE_LO) + fprintf(fp, "slot 2: r%u (write lo FMA) ", srcs.reg2); + else if (ctrl.slot23.slot2 == BIFROST_OP_WRITE_HI) + fprintf(fp, "slot 2: r%u (write hi FMA) ", srcs.reg2); + else if (ctrl.slot23.slot2 == BIFROST_OP_READ) + fprintf(fp, "slot 2: r%u (read) ", srcs.reg2); - if (ctrl.slot23.slot3 == BIFROST_OP_WRITE) - fprintf(fp, "slot 3: r%u (write %s) ", srcs.reg3, slot3_fma); - else if (ctrl.slot23.slot3 == BIFROST_OP_WRITE_LO) - fprintf(fp, "slot 3: r%u (write lo %s) ", srcs.reg3, slot3_fma); - else if (ctrl.slot23.slot3 == BIFROST_OP_WRITE_HI) - fprintf(fp, "slot 3: r%u (write hi %s) ", srcs.reg3, slot3_fma); + if (ctrl.slot23.slot3 == BIFROST_OP_WRITE) + fprintf(fp, "slot 3: r%u (write %s) ", srcs.reg3, slot3_fma); + else if (ctrl.slot23.slot3 == BIFROST_OP_WRITE_LO) + fprintf(fp, "slot 3: r%u (write lo %s) ", srcs.reg3, slot3_fma); + else if (ctrl.slot23.slot3 == BIFROST_OP_WRITE_HI) + fprintf(fp, "slot 3: r%u (write hi %s) ", srcs.reg3, slot3_fma); - if (srcs.fau_idx) - fprintf(fp, "fau %X ", srcs.fau_idx); + if (srcs.fau_idx) + fprintf(fp, "fau %X ", srcs.fau_idx); - fprintf(fp, "\n"); + fprintf(fp, "\n"); } static void bi_disasm_dest_mask(FILE *fp, enum bifrost_reg_op op) { - if (op == BIFROST_OP_WRITE_LO) - fprintf(fp, ".h0"); - else if (op == BIFROST_OP_WRITE_HI) - fprintf(fp, ".h1"); + if (op == BIFROST_OP_WRITE_LO) + fprintf(fp, ".h0"); + else if (op == BIFROST_OP_WRITE_HI) + fprintf(fp, ".h1"); } void bi_disasm_dest_fma(FILE *fp, struct bifrost_regs *next_regs, bool last) { - /* If this is the last instruction, next_regs points to the first reg entry. */ - struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(fp, *next_regs, last); - if (ctrl.slot23.slot2 >= BIFROST_OP_WRITE) { - fprintf(fp, "r%u:t0", next_regs->reg2); - bi_disasm_dest_mask(fp, ctrl.slot23.slot2); - } else if (ctrl.slot23.slot3 >= BIFROST_OP_WRITE && ctrl.slot23.slot3_fma) { - fprintf(fp, "r%u:t0", next_regs->reg3); - bi_disasm_dest_mask(fp, ctrl.slot23.slot3); - } else - fprintf(fp, "t0"); + /* If this is the last instruction, next_regs points to the first reg entry. */ + struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(fp, *next_regs, last); + if (ctrl.slot23.slot2 >= BIFROST_OP_WRITE) { + fprintf(fp, "r%u:t0", next_regs->reg2); + bi_disasm_dest_mask(fp, ctrl.slot23.slot2); + } else if (ctrl.slot23.slot3 >= BIFROST_OP_WRITE && ctrl.slot23.slot3_fma) { + fprintf(fp, "r%u:t0", next_regs->reg3); + bi_disasm_dest_mask(fp, ctrl.slot23.slot3); + } else + fprintf(fp, "t0"); } void bi_disasm_dest_add(FILE *fp, struct bifrost_regs *next_regs, bool last) { - /* If this is the last instruction, next_regs points to the first reg entry. */ - struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(fp, *next_regs, last); + /* If this is the last instruction, next_regs points to the first reg entry. 
*/ + struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(fp, *next_regs, last); - if (ctrl.slot23.slot3 >= BIFROST_OP_WRITE && !ctrl.slot23.slot3_fma) { - fprintf(fp, "r%u:t1", next_regs->reg3); - bi_disasm_dest_mask(fp, ctrl.slot23.slot3); - } else - fprintf(fp, "t1"); -} - -static void dump_const_imm(FILE *fp, uint32_t imm) -{ - union { - float f; - uint32_t i; - } fi; - fi.i = imm; - fprintf(fp, "0x%08x /* %f */", imm, fi.f); + if (ctrl.slot23.slot3 >= BIFROST_OP_WRITE && !ctrl.slot23.slot3_fma) { + fprintf(fp, "r%u:t1", next_regs->reg3); + bi_disasm_dest_mask(fp, ctrl.slot23.slot3); + } else + fprintf(fp, "t1"); } static void -dump_pc_imm(FILE *fp, uint64_t imm, unsigned branch_offset, enum bi_constmod mod, bool high32) +dump_const_imm(FILE *fp, uint32_t imm) { - if (mod == BI_CONSTMOD_PC_HI && !high32) { - dump_const_imm(fp, imm); - return; - } + union { + float f; + uint32_t i; + } fi; + fi.i = imm; + fprintf(fp, "0x%08x /* %f */", imm, fi.f); +} - /* 60-bit sign-extend */ - uint64_t zx64 = (imm << 4); - int64_t sx64 = zx64; - sx64 >>= 4; +static void +dump_pc_imm(FILE *fp, uint64_t imm, unsigned branch_offset, + enum bi_constmod mod, bool high32) +{ + if (mod == BI_CONSTMOD_PC_HI && !high32) { + dump_const_imm(fp, imm); + return; + } - /* 28-bit sign extend x 2 */ - uint32_t imm32[2] = { (uint32_t) imm, (uint32_t) (imm >> 32) }; - uint32_t zx32[2] = { imm32[0] << 4, imm32[1] << 4 }; - int32_t sx32[2] = { zx32[0], zx32[1] }; - sx32[0] >>= 4; - sx32[1] >>= 4; + /* 60-bit sign-extend */ + uint64_t zx64 = (imm << 4); + int64_t sx64 = zx64; + sx64 >>= 4; - int64_t offs = 0; + /* 28-bit sign extend x 2 */ + uint32_t imm32[2] = {(uint32_t)imm, (uint32_t)(imm >> 32)}; + uint32_t zx32[2] = {imm32[0] << 4, imm32[1] << 4}; + int32_t sx32[2] = {zx32[0], zx32[1]}; + sx32[0] >>= 4; + sx32[1] >>= 4; - switch (mod) { - case BI_CONSTMOD_PC_LO: - offs = sx64; - break; - case BI_CONSTMOD_PC_HI: - offs = sx32[1]; - break; - case BI_CONSTMOD_PC_LO_HI: - offs = sx32[high32]; - break; - default: - unreachable("Invalid PC modifier"); - } + int64_t offs = 0; - assert((offs & 15) == 0); - fprintf(fp, "clause_%" PRId64, branch_offset + (offs / 16)); + switch (mod) { + case BI_CONSTMOD_PC_LO: + offs = sx64; + break; + case BI_CONSTMOD_PC_HI: + offs = sx32[1]; + break; + case BI_CONSTMOD_PC_LO_HI: + offs = sx32[high32]; + break; + default: + unreachable("Invalid PC modifier"); + } - if (mod == BI_CONSTMOD_PC_LO && high32) - fprintf(fp, " >> 32"); + assert((offs & 15) == 0); + fprintf(fp, "clause_%" PRId64, branch_offset + (offs / 16)); - /* While technically in spec, referencing the current clause as (pc + - * 0) likely indicates an unintended infinite loop */ - if (offs == 0) - fprintf(fp, " /* XXX: likely an infinite loop */"); + if (mod == BI_CONSTMOD_PC_LO && high32) + fprintf(fp, " >> 32"); + + /* While technically in spec, referencing the current clause as (pc + + * 0) likely indicates an unintended infinite loop */ + if (offs == 0) + fprintf(fp, " /* XXX: likely an infinite loop */"); } /* Convert an index to an embedded constant in FAU-RAM to the index of the @@ -293,106 +301,107 @@ dump_pc_imm(FILE *fp, uint64_t imm, unsigned branch_offset, enum bi_constmod mod static unsigned const_fau_to_idx(unsigned fau_value) { - unsigned map[8] = { - ~0, ~0, 4, 5, 0, 1, 2, 3 - }; + unsigned map[8] = {~0, ~0, 4, 5, 0, 1, 2, 3}; - assert(map[fau_value] < 6); - return map[fau_value]; + assert(map[fau_value] < 6); + return map[fau_value]; } -static void dump_fau_src(FILE *fp, struct bifrost_regs srcs, unsigned branch_offset, 
struct bi_constants *consts, bool high32) +static void +dump_fau_src(FILE *fp, struct bifrost_regs srcs, unsigned branch_offset, + struct bi_constants *consts, bool high32) { - if (srcs.fau_idx & 0x80) { - unsigned uniform = (srcs.fau_idx & 0x7f); - fprintf(fp, "u%u.w%u", uniform, high32); - } else if (srcs.fau_idx >= 0x20) { - unsigned idx = const_fau_to_idx(srcs.fau_idx >> 4); - uint64_t imm = consts->raw[idx]; - imm |= (srcs.fau_idx & 0xf); - if (consts->mods[idx] != BI_CONSTMOD_NONE) - dump_pc_imm(fp, imm, branch_offset, consts->mods[idx], high32); - else if (high32) - dump_const_imm(fp, imm >> 32); - else - dump_const_imm(fp, imm); - } else { - switch (srcs.fau_idx) { - case 0: - fprintf(fp, "#0"); - break; - case 1: - fprintf(fp, "lane_id"); - break; - case 2: - fprintf(fp, "warp_id"); - break; - case 3: - fprintf(fp, "core_id"); - break; - case 4: - fprintf(fp, "framebuffer_size"); - break; - case 5: - fprintf(fp, "atest_datum"); - break; - case 6: - fprintf(fp, "sample"); - break; - case 8: - case 9: - case 10: - case 11: - case 12: - case 13: - case 14: - case 15: - fprintf(fp, "blend_descriptor_%u", (unsigned) srcs.fau_idx - 8); - break; - default: - fprintf(fp, "XXX - reserved%u", (unsigned) srcs.fau_idx); - break; - } + if (srcs.fau_idx & 0x80) { + unsigned uniform = (srcs.fau_idx & 0x7f); + fprintf(fp, "u%u.w%u", uniform, high32); + } else if (srcs.fau_idx >= 0x20) { + unsigned idx = const_fau_to_idx(srcs.fau_idx >> 4); + uint64_t imm = consts->raw[idx]; + imm |= (srcs.fau_idx & 0xf); + if (consts->mods[idx] != BI_CONSTMOD_NONE) + dump_pc_imm(fp, imm, branch_offset, consts->mods[idx], high32); + else if (high32) + dump_const_imm(fp, imm >> 32); + else + dump_const_imm(fp, imm); + } else { + switch (srcs.fau_idx) { + case 0: + fprintf(fp, "#0"); + break; + case 1: + fprintf(fp, "lane_id"); + break; + case 2: + fprintf(fp, "warp_id"); + break; + case 3: + fprintf(fp, "core_id"); + break; + case 4: + fprintf(fp, "framebuffer_size"); + break; + case 5: + fprintf(fp, "atest_datum"); + break; + case 6: + fprintf(fp, "sample"); + break; + case 8: + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + fprintf(fp, "blend_descriptor_%u", (unsigned)srcs.fau_idx - 8); + break; + default: + fprintf(fp, "XXX - reserved%u", (unsigned)srcs.fau_idx); + break; + } - if (high32) - fprintf(fp, ".y"); - else - fprintf(fp, ".x"); - } + if (high32) + fprintf(fp, ".y"); + else + fprintf(fp, ".x"); + } } void -dump_src(FILE *fp, unsigned src, struct bifrost_regs srcs, unsigned branch_offset, struct bi_constants *consts, bool isFMA) +dump_src(FILE *fp, unsigned src, struct bifrost_regs srcs, + unsigned branch_offset, struct bi_constants *consts, bool isFMA) { - switch (src) { - case 0: - fprintf(fp, "r%u", get_reg0(srcs)); - break; - case 1: - fprintf(fp, "r%u", get_reg1(srcs)); - break; - case 2: - fprintf(fp, "r%u", srcs.reg2); - break; - case 3: - if (isFMA) - fprintf(fp, "#0"); - else - fprintf(fp, "t"); // i.e. the output of FMA this cycle - break; - case 4: - dump_fau_src(fp, srcs, branch_offset, consts, false); - break; - case 5: - dump_fau_src(fp, srcs, branch_offset, consts, true); - break; - case 6: - fprintf(fp, "t0"); - break; - case 7: - fprintf(fp, "t1"); - break; - } + switch (src) { + case 0: + fprintf(fp, "r%u", get_reg0(srcs)); + break; + case 1: + fprintf(fp, "r%u", get_reg1(srcs)); + break; + case 2: + fprintf(fp, "r%u", srcs.reg2); + break; + case 3: + if (isFMA) + fprintf(fp, "#0"); + else + fprintf(fp, "t"); // i.e. 
the output of FMA this cycle + break; + case 4: + dump_fau_src(fp, srcs, branch_offset, consts, false); + break; + case 5: + dump_fau_src(fp, srcs, branch_offset, consts, true); + break; + case 6: + fprintf(fp, "t0"); + break; + case 7: + fprintf(fp, "t1"); + break; + } } /* Tables for decoding M0, or if M0 == 7, M1 respectively. @@ -403,308 +412,311 @@ dump_src(FILE *fp, unsigned src, struct bifrost_regs srcs, unsigned branch_offse */ static const enum bi_constmod M1_table[7][2] = { - { BI_CONSTMOD_NONE, BI_CONSTMOD_NONE }, - { BI_CONSTMOD_PC_LO, BI_CONSTMOD_NONE }, - { BI_CONSTMOD_PC_LO, BI_CONSTMOD_PC_LO }, - { ~0, ~0 }, - { BI_CONSTMOD_PC_HI, BI_CONSTMOD_NONE }, - { BI_CONSTMOD_PC_HI, BI_CONSTMOD_PC_HI }, - { BI_CONSTMOD_PC_LO, BI_CONSTMOD_NONE }, + {BI_CONSTMOD_NONE, BI_CONSTMOD_NONE}, + {BI_CONSTMOD_PC_LO, BI_CONSTMOD_NONE}, + {BI_CONSTMOD_PC_LO, BI_CONSTMOD_PC_LO}, + {~0, ~0}, + {BI_CONSTMOD_PC_HI, BI_CONSTMOD_NONE}, + {BI_CONSTMOD_PC_HI, BI_CONSTMOD_PC_HI}, + {BI_CONSTMOD_PC_LO, BI_CONSTMOD_NONE}, }; static const enum bi_constmod M2_table[4][2] = { - { BI_CONSTMOD_PC_LO_HI, BI_CONSTMOD_NONE }, - { BI_CONSTMOD_PC_LO_HI, BI_CONSTMOD_PC_HI }, - { BI_CONSTMOD_PC_LO_HI, BI_CONSTMOD_PC_LO_HI }, - { BI_CONSTMOD_PC_LO_HI, BI_CONSTMOD_PC_HI }, + {BI_CONSTMOD_PC_LO_HI, BI_CONSTMOD_NONE}, + {BI_CONSTMOD_PC_LO_HI, BI_CONSTMOD_PC_HI}, + {BI_CONSTMOD_PC_LO_HI, BI_CONSTMOD_PC_LO_HI}, + {BI_CONSTMOD_PC_LO_HI, BI_CONSTMOD_PC_HI}, }; static void decode_M(enum bi_constmod *mod, unsigned M1, unsigned M2, bool single) { - if (M1 >= 8) { - mod[0] = BI_CONSTMOD_NONE; + if (M1 >= 8) { + mod[0] = BI_CONSTMOD_NONE; - if (!single) - mod[1] = BI_CONSTMOD_NONE; + if (!single) + mod[1] = BI_CONSTMOD_NONE; - return; - } else if (M1 == 7) { - assert(M2 < 4); - memcpy(mod, M2_table[M2], sizeof(*mod) * (single ? 1 : 2)); - } else { - assert(M1 != 3); - memcpy(mod, M1_table[M1], sizeof(*mod) * (single ? 1 : 2)); - } + return; + } else if (M1 == 7) { + assert(M2 < 4); + memcpy(mod, M2_table[M2], sizeof(*mod) * (single ? 1 : 2)); + } else { + assert(M1 != 3); + memcpy(mod, M1_table[M1], sizeof(*mod) * (single ? 
1 : 2)); + } } -static void dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offset, bool verbose) +static void +dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offset, + bool verbose) { - // State for a decoded clause - struct bifrost_alu_inst instrs[8] = {}; - struct bi_constants consts = {}; - unsigned num_instrs = 0; - unsigned num_consts = 0; - uint64_t header_bits = 0; + // State for a decoded clause + struct bifrost_alu_inst instrs[8] = {}; + struct bi_constants consts = {}; + unsigned num_instrs = 0; + unsigned num_consts = 0; + uint64_t header_bits = 0; - unsigned i; - for (i = 0; ; i++, words += 4) { - if (verbose) { - fprintf(fp, "# "); - for (int j = 0; j < 4; j++) - fprintf(fp, "%08x ", words[3 - j]); // low bit on the right - fprintf(fp, "\n"); - } - unsigned tag = bits(words[0], 0, 8); + unsigned i; + for (i = 0;; i++, words += 4) { + if (verbose) { + fprintf(fp, "# "); + for (int j = 0; j < 4; j++) + fprintf(fp, "%08x ", words[3 - j]); // low bit on the right + fprintf(fp, "\n"); + } + unsigned tag = bits(words[0], 0, 8); - // speculatively decode some things that are common between many formats, so we can share some code - struct bifrost_alu_inst main_instr = {}; - // 20 bits - main_instr.add_bits = bits(words[2], 2, 32 - 13); - // 23 bits - main_instr.fma_bits = bits(words[1], 11, 32) | bits(words[2], 0, 2) << (32 - 11); - // 35 bits - main_instr.reg_bits = ((uint64_t) bits(words[1], 0, 11)) << 24 | (uint64_t) bits(words[0], 8, 32); + // speculatively decode some things that are common between many formats, + // so we can share some code + struct bifrost_alu_inst main_instr = {}; + // 20 bits + main_instr.add_bits = bits(words[2], 2, 32 - 13); + // 23 bits + main_instr.fma_bits = bits(words[1], 11, 32) | bits(words[2], 0, 2) + << (32 - 11); + // 35 bits + main_instr.reg_bits = ((uint64_t)bits(words[1], 0, 11)) << 24 | + (uint64_t)bits(words[0], 8, 32); - uint64_t const0 = bits(words[0], 8, 32) << 4 | (uint64_t) words[1] << 28 | bits(words[2], 0, 4) << 60; - uint64_t const1 = bits(words[2], 4, 32) << 4 | (uint64_t) words[3] << 32; + uint64_t const0 = bits(words[0], 8, 32) << 4 | (uint64_t)words[1] << 28 | + bits(words[2], 0, 4) << 60; + uint64_t const1 = bits(words[2], 4, 32) << 4 | (uint64_t)words[3] << 32; - /* Z-bit */ - bool stop = tag & 0x40; + /* Z-bit */ + bool stop = tag & 0x40; - if (verbose) { - fprintf(fp, "# tag: 0x%02x\n", tag); - } - if (tag & 0x80) { - /* Format 5 or 10 */ - unsigned idx = stop ? 
5 : 2; - main_instr.add_bits |= ((tag >> 3) & 0x7) << 17; - instrs[idx + 1] = main_instr; - instrs[idx].add_bits = bits(words[3], 0, 17) | ((tag & 0x7) << 17); - instrs[idx].fma_bits |= bits(words[2], 19, 32) << 10; - consts.raw[0] = bits(words[3], 17, 32) << 4; - } else { - bool done = false; - switch ((tag >> 3) & 0x7) { - case 0x0: - switch (tag & 0x7) { - case 0x3: - /* Format 1 */ - main_instr.add_bits |= bits(words[3], 29, 32) << 17; - instrs[1] = main_instr; - num_instrs = 2; - done = stop; - break; - case 0x4: - /* Format 3 */ - instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; - instrs[2].fma_bits |= bits(words[2], 19, 32) << 10; - consts.raw[0] = const0; - decode_M(&consts.mods[0], bits(words[2], 4, 8), bits(words[2], 8, 12), true); - num_instrs = 3; - num_consts = 1; - done = stop; - break; - case 0x1: - case 0x5: - /* Format 4 */ - instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; - instrs[2].fma_bits |= bits(words[2], 19, 32) << 10; - main_instr.add_bits |= bits(words[3], 26, 29) << 17; - instrs[3] = main_instr; - if ((tag & 0x7) == 0x5) { - num_instrs = 4; - done = stop; - } - break; - case 0x6: - /* Format 8 */ - instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; - instrs[5].fma_bits |= bits(words[2], 19, 32) << 10; - consts.raw[0] = const0; - decode_M(&consts.mods[0], bits(words[2], 4, 8), bits(words[2], 8, 12), true); - num_instrs = 6; - num_consts = 1; - done = stop; - break; - case 0x7: - /* Format 9 */ - instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; - instrs[5].fma_bits |= bits(words[2], 19, 32) << 10; - main_instr.add_bits |= bits(words[3], 26, 29) << 17; - instrs[6] = main_instr; - num_instrs = 7; - done = stop; - break; - default: - unreachable("[INSTR_INVALID_ENC] Invalid tag bits"); - } - break; - case 0x2: - case 0x3: { - /* Format 6 or 11 */ - unsigned idx = ((tag >> 3) & 0x7) == 2 ? 4 : 7; - main_instr.add_bits |= (tag & 0x7) << 17; - instrs[idx] = main_instr; - consts.raw[0] |= (bits(words[2], 19, 32) | ((uint64_t) words[3] << 13)) << 19; - num_consts = 1; - num_instrs = idx + 1; - done = stop; - break; - } - case 0x4: { - /* Format 2 */ - unsigned idx = stop ? 4 : 1; - main_instr.add_bits |= (tag & 0x7) << 17; - instrs[idx] = main_instr; - instrs[idx + 1].fma_bits |= bits(words[3], 22, 32); - instrs[idx + 1].reg_bits = bits(words[2], 19, 32) | (bits(words[3], 0, 22) << (32 - 19)); - break; - } - case 0x1: - /* Format 0 - followed by constants */ - num_instrs = 1; - done = stop; - FALLTHROUGH; - case 0x5: - /* Format 0 - followed by instructions */ - header_bits = bits(words[2], 19, 32) | ((uint64_t) words[3] << (32 - 19)); - main_instr.add_bits |= (tag & 0x7) << 17; - instrs[0] = main_instr; - break; - case 0x6: - case 0x7: { - /* Format 12 */ - unsigned pos = tag & 0xf; + if (verbose) { + fprintf(fp, "# tag: 0x%02x\n", tag); + } + if (tag & 0x80) { + /* Format 5 or 10 */ + unsigned idx = stop ? 
5 : 2; + main_instr.add_bits |= ((tag >> 3) & 0x7) << 17; + instrs[idx + 1] = main_instr; + instrs[idx].add_bits = bits(words[3], 0, 17) | ((tag & 0x7) << 17); + instrs[idx].fma_bits |= bits(words[2], 19, 32) << 10; + consts.raw[0] = bits(words[3], 17, 32) << 4; + } else { + bool done = false; + switch ((tag >> 3) & 0x7) { + case 0x0: + switch (tag & 0x7) { + case 0x3: + /* Format 1 */ + main_instr.add_bits |= bits(words[3], 29, 32) << 17; + instrs[1] = main_instr; + num_instrs = 2; + done = stop; + break; + case 0x4: + /* Format 3 */ + instrs[2].add_bits = + bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; + instrs[2].fma_bits |= bits(words[2], 19, 32) << 10; + consts.raw[0] = const0; + decode_M(&consts.mods[0], bits(words[2], 4, 8), + bits(words[2], 8, 12), true); + num_instrs = 3; + num_consts = 1; + done = stop; + break; + case 0x1: + case 0x5: + /* Format 4 */ + instrs[2].add_bits = + bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; + instrs[2].fma_bits |= bits(words[2], 19, 32) << 10; + main_instr.add_bits |= bits(words[3], 26, 29) << 17; + instrs[3] = main_instr; + if ((tag & 0x7) == 0x5) { + num_instrs = 4; + done = stop; + } + break; + case 0x6: + /* Format 8 */ + instrs[5].add_bits = + bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; + instrs[5].fma_bits |= bits(words[2], 19, 32) << 10; + consts.raw[0] = const0; + decode_M(&consts.mods[0], bits(words[2], 4, 8), + bits(words[2], 8, 12), true); + num_instrs = 6; + num_consts = 1; + done = stop; + break; + case 0x7: + /* Format 9 */ + instrs[5].add_bits = + bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; + instrs[5].fma_bits |= bits(words[2], 19, 32) << 10; + main_instr.add_bits |= bits(words[3], 26, 29) << 17; + instrs[6] = main_instr; + num_instrs = 7; + done = stop; + break; + default: + unreachable("[INSTR_INVALID_ENC] Invalid tag bits"); + } + break; + case 0x2: + case 0x3: { + /* Format 6 or 11 */ + unsigned idx = ((tag >> 3) & 0x7) == 2 ? 4 : 7; + main_instr.add_bits |= (tag & 0x7) << 17; + instrs[idx] = main_instr; + consts.raw[0] |= + (bits(words[2], 19, 32) | ((uint64_t)words[3] << 13)) << 19; + num_consts = 1; + num_instrs = idx + 1; + done = stop; + break; + } + case 0x4: { + /* Format 2 */ + unsigned idx = stop ? 
4 : 1; + main_instr.add_bits |= (tag & 0x7) << 17; + instrs[idx] = main_instr; + instrs[idx + 1].fma_bits |= bits(words[3], 22, 32); + instrs[idx + 1].reg_bits = + bits(words[2], 19, 32) | (bits(words[3], 0, 22) << (32 - 19)); + break; + } + case 0x1: + /* Format 0 - followed by constants */ + num_instrs = 1; + done = stop; + FALLTHROUGH; + case 0x5: + /* Format 0 - followed by instructions */ + header_bits = + bits(words[2], 19, 32) | ((uint64_t)words[3] << (32 - 19)); + main_instr.add_bits |= (tag & 0x7) << 17; + instrs[0] = main_instr; + break; + case 0x6: + case 0x7: { + /* Format 12 */ + unsigned pos = tag & 0xf; - struct { - unsigned const_idx; - unsigned nr_tuples; - } pos_table[0x10] = { - { 0, 1 }, - { 0, 2 }, - { 0, 4 }, - { 1, 3 }, - { 1, 5 }, - { 2, 4 }, - { 0, 7 }, - { 1, 6 }, - { 3, 5 }, - { 1, 8 }, - { 2, 7 }, - { 3, 6 }, - { 3, 8 }, - { 4, 7 }, - { 5, 6 }, - { ~0, ~0 } - }; + struct { + unsigned const_idx; + unsigned nr_tuples; + } pos_table[0x10] = {{0, 1}, {0, 2}, {0, 4}, {1, 3}, + {1, 5}, {2, 4}, {0, 7}, {1, 6}, + {3, 5}, {1, 8}, {2, 7}, {3, 6}, + {3, 8}, {4, 7}, {5, 6}, {~0, ~0}}; - ASSERTED bool valid_count = pos_table[pos].nr_tuples == num_instrs; - assert(valid_count && "INSTR_INVALID_ENC"); + ASSERTED bool valid_count = pos_table[pos].nr_tuples == num_instrs; + assert(valid_count && "INSTR_INVALID_ENC"); - unsigned const_idx = pos_table[pos].const_idx; + unsigned const_idx = pos_table[pos].const_idx; - if (num_consts < const_idx + 2) - num_consts = const_idx + 2; + if (num_consts < const_idx + 2) + num_consts = const_idx + 2; - consts.raw[const_idx] = const0; - consts.raw[const_idx + 1] = const1; + consts.raw[const_idx] = const0; + consts.raw[const_idx + 1] = const1; - /* Calculate M values from A, B and 4-bit - * unsigned arithmetic. Mathematically it - * should be (A - B) % 16 but we use this - * alternate form to avoid sign issues */ + /* Calculate M values from A, B and 4-bit + * unsigned arithmetic. 
Mathematically it + * should be (A - B) % 16 but we use this + * alternate form to avoid sign issues */ - unsigned A1 = bits(words[2], 0, 4); - unsigned B1 = bits(words[3], 28, 32); - unsigned A2 = bits(words[1], 0, 4); - unsigned B2 = bits(words[2], 28, 32); + unsigned A1 = bits(words[2], 0, 4); + unsigned B1 = bits(words[3], 28, 32); + unsigned A2 = bits(words[1], 0, 4); + unsigned B2 = bits(words[2], 28, 32); - unsigned M1 = (16 + A1 - B1) & 0xF; - unsigned M2 = (16 + A2 - B2) & 0xF; + unsigned M1 = (16 + A1 - B1) & 0xF; + unsigned M2 = (16 + A2 - B2) & 0xF; - decode_M(&consts.mods[const_idx], M1, M2, false); + decode_M(&consts.mods[const_idx], M1, M2, false); - done = stop; - break; - } - default: - break; - } + done = stop; + break; + } + default: + break; + } - if (done) - break; - } - } + if (done) + break; + } + } - *size = i + 1; + *size = i + 1; - if (verbose) { - fprintf(fp, "# header: %012" PRIx64 "\n", header_bits); - } + if (verbose) { + fprintf(fp, "# header: %012" PRIx64 "\n", header_bits); + } - struct bifrost_header header; - memcpy((char *) &header, (char *) &header_bits, sizeof(struct bifrost_header)); - dump_header(fp, header, verbose); + struct bifrost_header header; + memcpy((char *)&header, (char *)&header_bits, sizeof(struct bifrost_header)); + dump_header(fp, header, verbose); - fprintf(fp, "{\n"); - for (i = 0; i < num_instrs; i++) { - struct bifrost_regs regs, next_regs; - if (i + 1 == num_instrs) { - memcpy((char *) &next_regs, (char *) &instrs[0].reg_bits, - sizeof(next_regs)); - } else { - memcpy((char *) &next_regs, (char *) &instrs[i + 1].reg_bits, - sizeof(next_regs)); - } + fprintf(fp, "{\n"); + for (i = 0; i < num_instrs; i++) { + struct bifrost_regs regs, next_regs; + if (i + 1 == num_instrs) { + memcpy((char *)&next_regs, (char *)&instrs[0].reg_bits, + sizeof(next_regs)); + } else { + memcpy((char *)&next_regs, (char *)&instrs[i + 1].reg_bits, + sizeof(next_regs)); + } - memcpy((char *) ®s, (char *) &instrs[i].reg_bits, sizeof(regs)); + memcpy((char *)®s, (char *)&instrs[i].reg_bits, sizeof(regs)); - if (verbose) { - fprintf(fp, " # regs: %016" PRIx64 "\n", instrs[i].reg_bits); - dump_regs(fp, regs, i == 0); - } + if (verbose) { + fprintf(fp, " # regs: %016" PRIx64 "\n", instrs[i].reg_bits); + dump_regs(fp, regs, i == 0); + } - bi_disasm_fma(fp, instrs[i].fma_bits, ®s, &next_regs, - header.staging_register, offset, &consts, - i + 1 == num_instrs); + bi_disasm_fma(fp, instrs[i].fma_bits, ®s, &next_regs, + header.staging_register, offset, &consts, + i + 1 == num_instrs); - bi_disasm_add(fp, instrs[i].add_bits, ®s, &next_regs, - header.staging_register, offset, &consts, - i + 1 == num_instrs); - } - fprintf(fp, "}\n"); + bi_disasm_add(fp, instrs[i].add_bits, ®s, &next_regs, + header.staging_register, offset, &consts, + i + 1 == num_instrs); + } + fprintf(fp, "}\n"); - if (verbose) { - for (unsigned i = 0; i < num_consts; i++) { - fprintf(fp, "# const%d: %08" PRIx64 "\n", 2 * i, consts.raw[i] & 0xffffffff); - fprintf(fp, "# const%d: %08" PRIx64 "\n", 2 * i + 1, consts.raw[i] >> 32); - } - } + if (verbose) { + for (unsigned i = 0; i < num_consts; i++) { + fprintf(fp, "# const%d: %08" PRIx64 "\n", 2 * i, + consts.raw[i] & 0xffffffff); + fprintf(fp, "# const%d: %08" PRIx64 "\n", 2 * i + 1, + consts.raw[i] >> 32); + } + } - fprintf(fp, "\n"); - return; + fprintf(fp, "\n"); + return; } -void disassemble_bifrost(FILE *fp, uint8_t *code, size_t size, bool verbose) +void +disassemble_bifrost(FILE *fp, uint8_t *code, size_t size, bool verbose) { - uint32_t *words = 
(uint32_t *) code; - uint32_t *words_end = words + (size / 4); - // used for displaying branch targets - unsigned offset = 0; - while (words != words_end) { - /* Shaders have zero bytes at the end for padding; stop - * disassembling when we hit them. */ - if (*words == 0) - break; + uint32_t *words = (uint32_t *)code; + uint32_t *words_end = words + (size / 4); + // used for displaying branch targets + unsigned offset = 0; + while (words != words_end) { + /* Shaders have zero bytes at the end for padding; stop + * disassembling when we hit them. */ + if (*words == 0) + break; - fprintf(fp, "clause_%u:\n", offset); + fprintf(fp, "clause_%u:\n", offset); - unsigned size; - dump_clause(fp, words, &size, offset, verbose); + unsigned size; + dump_clause(fp, words, &size, offset, verbose); - words += size * 4; - offset += size; - } + words += size * 4; + offset += size; + } } - diff --git a/src/panfrost/bifrost/disassemble.h b/src/panfrost/bifrost/disassemble.h index 1e39c20d658..bf023a732a4 100644 --- a/src/panfrost/bifrost/disassemble.h +++ b/src/panfrost/bifrost/disassemble.h @@ -34,14 +34,20 @@ void disassemble_bifrost(FILE *fp, uint8_t *code, size_t size, bool verbose); -void -bi_disasm_fma(FILE *fp, unsigned bits, struct bifrost_regs *srcs, struct bifrost_regs *next_regs, unsigned staging_register, unsigned branch_offset, struct bi_constants *consts, bool first); +void bi_disasm_fma(FILE *fp, unsigned bits, struct bifrost_regs *srcs, + struct bifrost_regs *next_regs, unsigned staging_register, + unsigned branch_offset, struct bi_constants *consts, + bool first); -void bi_disasm_add(FILE *fp, unsigned bits, struct bifrost_regs *srcs, struct bifrost_regs *next_regs, unsigned staging_register, unsigned branch_offset, struct bi_constants *consts, bool first); +void bi_disasm_add(FILE *fp, unsigned bits, struct bifrost_regs *srcs, + struct bifrost_regs *next_regs, unsigned staging_register, + unsigned branch_offset, struct bi_constants *consts, + bool first); void bi_disasm_dest_fma(FILE *fp, struct bifrost_regs *next_regs, bool first); void bi_disasm_dest_add(FILE *fp, struct bifrost_regs *next_regs, bool first); -void dump_src(FILE *fp, unsigned src, struct bifrost_regs srcs, unsigned branch_offset, struct bi_constants *consts, bool isFMA); +void dump_src(FILE *fp, unsigned src, struct bifrost_regs srcs, + unsigned branch_offset, struct bi_constants *consts, bool isFMA); #endif diff --git a/src/panfrost/bifrost/nodearray.h b/src/panfrost/bifrost/nodearray.h index ed852e0c56d..40ca43def60 100644 --- a/src/panfrost/bifrost/nodearray.h +++ b/src/panfrost/bifrost/nodearray.h @@ -62,182 +62,187 @@ typedef uint16_t nodearray_value; typedef uint64_t nodearray_sparse; typedef struct { - union { - nodearray_sparse *sparse; - nodearray_value *dense; - }; - unsigned size; - unsigned sparse_capacity; + union { + nodearray_sparse *sparse; + nodearray_value *dense; + }; + unsigned size; + unsigned sparse_capacity; } nodearray; /* Align sizes to 16-bytes for SIMD purposes */ #define NODEARRAY_DENSE_ALIGN(x) ALIGN_POT(x, 16) -#define nodearray_sparse_foreach(buf, elem) \ - for (nodearray_sparse *elem = (buf)->sparse; \ +#define nodearray_sparse_foreach(buf, elem) \ + for (nodearray_sparse *elem = (buf)->sparse; \ elem < (buf)->sparse + (buf)->size; elem++) -#define nodearray_dense_foreach(buf, elem) \ - for (nodearray_value *elem = (buf)->dense; \ +#define nodearray_dense_foreach(buf, elem) \ + for (nodearray_value *elem = (buf)->dense; \ elem < (buf)->dense + (buf)->size; elem++) -#define 
nodearray_dense_foreach_64(buf, elem) \ - for (uint64_t *elem = (uint64_t *)(buf)->dense; \ +#define nodearray_dense_foreach_64(buf, elem) \ + for (uint64_t *elem = (uint64_t *)(buf)->dense; \ (nodearray_value *)elem < (buf)->dense + (buf)->size; elem++) static inline bool nodearray_is_sparse(const nodearray *a) { - return a->sparse_capacity != ~0U; + return a->sparse_capacity != ~0U; } static inline void nodearray_init(nodearray *a) { - memset(a, 0, sizeof(nodearray)); + memset(a, 0, sizeof(nodearray)); } static inline void nodearray_reset(nodearray *a) { - free(a->sparse); - nodearray_init(a); + free(a->sparse); + nodearray_init(a); } static inline nodearray_sparse nodearray_encode(unsigned key, nodearray_value value) { - static_assert(sizeof(nodearray_value) == sizeof(uint16_t), "sizes mismatch"); - return ((nodearray_sparse) key << 16) | value; + static_assert(sizeof(nodearray_value) == sizeof(uint16_t), "sizes mismatch"); + return ((nodearray_sparse)key << 16) | value; } static inline unsigned nodearray_sparse_key(const nodearray_sparse *elem) { - static_assert(sizeof(nodearray_value) == sizeof(uint16_t), "sizes mismatch"); - return *elem >> 16; + static_assert(sizeof(nodearray_value) == sizeof(uint16_t), "sizes mismatch"); + return *elem >> 16; } static inline nodearray_value nodearray_sparse_value(const nodearray_sparse *elem) { - return *elem & NODEARRAY_MAX_VALUE; + return *elem & NODEARRAY_MAX_VALUE; } static inline unsigned -nodearray_sparse_search(const nodearray *a, nodearray_sparse key, nodearray_sparse **elem) +nodearray_sparse_search(const nodearray *a, nodearray_sparse key, + nodearray_sparse **elem) { - assert(nodearray_is_sparse(a) && a->size); + assert(nodearray_is_sparse(a) && a->size); - nodearray_sparse *data = a->sparse; + nodearray_sparse *data = a->sparse; - /* Encode the key using the highest possible value, so that the - * matching node must be encoded lower than this - */ - nodearray_sparse skey = nodearray_encode(key, NODEARRAY_MAX_VALUE); + /* Encode the key using the highest possible value, so that the + * matching node must be encoded lower than this + */ + nodearray_sparse skey = nodearray_encode(key, NODEARRAY_MAX_VALUE); - unsigned left = 0; - unsigned right = a->size - 1; + unsigned left = 0; + unsigned right = a->size - 1; - if (data[right] <= skey) - left = right; + if (data[right] <= skey) + left = right; - while (left != right) { - /* No need to worry about overflow, we couldn't have more than - * 2^24 elements */ - unsigned probe = (left + right + 1) / 2; + while (left != right) { + /* No need to worry about overflow, we couldn't have more than + * 2^24 elements */ + unsigned probe = (left + right + 1) / 2; - if (data[probe] > skey) - right = probe - 1; - else - left = probe; - } + if (data[probe] > skey) + right = probe - 1; + else + left = probe; + } - *elem = data + left; - return left; + *elem = data + left; + return left; } static inline void nodearray_orr(nodearray *a, unsigned key, nodearray_value value, unsigned max_sparse, unsigned max) { - assert(key < (1 << 24)); - assert(key < max); + assert(key < (1 << 24)); + assert(key < max); - if (!value) - return; + if (!value) + return; - if (nodearray_is_sparse(a)) { - unsigned size = a->size; - unsigned left = 0; + if (nodearray_is_sparse(a)) { + unsigned size = a->size; + unsigned left = 0; - if (size) { - /* First, binary search for key */ - nodearray_sparse *elem; - left = nodearray_sparse_search(a, key, &elem); + if (size) { + /* First, binary search for key */ + nodearray_sparse *elem; + 
left = nodearray_sparse_search(a, key, &elem); - if (nodearray_sparse_key(elem) == key) { - *elem |= value; - return; - } + if (nodearray_sparse_key(elem) == key) { + *elem |= value; + return; + } - /* We insert before `left`, so increment it if it's - * out of order */ - if (nodearray_sparse_key(elem) < key) - ++left; - } + /* We insert before `left`, so increment it if it's + * out of order */ + if (nodearray_sparse_key(elem) < key) + ++left; + } - if (size < max_sparse && (size + 1) < max / 4) { - /* We didn't find it, but we know where to insert it. */ + if (size < max_sparse && (size + 1) < max / 4) { + /* We didn't find it, but we know where to insert it. */ - nodearray_sparse *data = a->sparse; - nodearray_sparse *data_move = data + left; + nodearray_sparse *data = a->sparse; + nodearray_sparse *data_move = data + left; - bool realloc = (++a->size) > a->sparse_capacity; + bool realloc = (++a->size) > a->sparse_capacity; - if (realloc) { - a->sparse_capacity = MIN2(MAX2(a->sparse_capacity * 2, 64), max / 4); + if (realloc) { + a->sparse_capacity = + MIN2(MAX2(a->sparse_capacity * 2, 64), max / 4); - a->sparse = (nodearray_sparse *)malloc(a->sparse_capacity * sizeof(nodearray_sparse)); + a->sparse = (nodearray_sparse *)malloc(a->sparse_capacity * + sizeof(nodearray_sparse)); - if (left) - memcpy(a->sparse, data, left * sizeof(nodearray_sparse)); - } + if (left) + memcpy(a->sparse, data, left * sizeof(nodearray_sparse)); + } - nodearray_sparse *elem = a->sparse + left; + nodearray_sparse *elem = a->sparse + left; - if (left != size) - memmove(elem + 1, data_move, (size - left) * sizeof(nodearray_sparse)); + if (left != size) + memmove(elem + 1, data_move, + (size - left) * sizeof(nodearray_sparse)); - *elem = nodearray_encode(key, value); + *elem = nodearray_encode(key, value); - if (realloc) - free(data); + if (realloc) + free(data); - return; - } + return; + } - /* There are too many elements, so convert to a dense array */ - nodearray old = *a; + /* There are too many elements, so convert to a dense array */ + nodearray old = *a; - a->dense = (nodearray_value *)calloc(NODEARRAY_DENSE_ALIGN(max), sizeof(nodearray_value)); - a->size = max; - a->sparse_capacity = ~0U; + a->dense = (nodearray_value *)calloc(NODEARRAY_DENSE_ALIGN(max), + sizeof(nodearray_value)); + a->size = max; + a->sparse_capacity = ~0U; - nodearray_value *data = a->dense; + nodearray_value *data = a->dense; - nodearray_sparse_foreach(&old, x) { - unsigned key = nodearray_sparse_key(x); - nodearray_value value = nodearray_sparse_value(x); + nodearray_sparse_foreach(&old, x) { + unsigned key = nodearray_sparse_key(x); + nodearray_value value = nodearray_sparse_value(x); - assert(key < max); - data[key] = value; - } + assert(key < max); + data[key] = value; + } - free(old.sparse); - } + free(old.sparse); + } - a->dense[key] |= value; + a->dense[key] |= value; } #ifdef __cplusplus diff --git a/src/panfrost/bifrost/test/test-constant-fold.cpp b/src/panfrost/bifrost/test/test-constant-fold.cpp index 90a63862ee1..1e7034f6864 100644 --- a/src/panfrost/bifrost/test/test-constant-fold.cpp +++ b/src/panfrost/bifrost/test/test-constant-fold.cpp @@ -21,14 +21,15 @@ * SOFTWARE. 
*/ -#include "compiler.h" -#include "bi_test.h" #include "bi_builder.h" +#include "bi_test.h" +#include "compiler.h" #include static std::string -to_string(const bi_instr *I) { +to_string(const bi_instr *I) +{ char *cstr = NULL; size_t size = 0; FILE *f = open_memstream(&cstr, &size); @@ -40,23 +41,21 @@ to_string(const bi_instr *I) { } static testing::AssertionResult -constant_fold_pred(const char *I_expr, - const char *expected_expr, - bi_instr *I, +constant_fold_pred(const char *I_expr, const char *expected_expr, bi_instr *I, uint32_t expected) { bool unsupported = false; uint32_t v = bi_fold_constant(I, &unsupported); if (unsupported) { return testing::AssertionFailure() - << "Constant fold unsupported for instruction \n\n" - << " " << to_string(I); + << "Constant fold unsupported for instruction \n\n" + << " " << to_string(I); } else if (v != expected) { return testing::AssertionFailure() - << "Unexpected result when constant folding instruction\n\n" - << " " << to_string(I) << "\n" - << " Actual: " << v << "\n" - << "Expected: " << expected << "\n"; + << "Unexpected result when constant folding instruction\n\n" + << " " << to_string(I) << "\n" + << " Actual: " << v << "\n" + << "Expected: " << expected << "\n"; } else { return testing::AssertionSuccess(); } @@ -64,7 +63,6 @@ constant_fold_pred(const char *I_expr, #define EXPECT_FOLD(i, e) EXPECT_PRED_FORMAT2(constant_fold_pred, i, e) - static testing::AssertionResult not_constant_fold_pred(const char *I_expr, bi_instr *I) { @@ -74,22 +72,23 @@ not_constant_fold_pred(const char *I_expr, bi_instr *I) return testing::AssertionSuccess(); } else { return testing::AssertionFailure() - << "Instruction\n\n" - << " " << to_string(I) << "\n" - << "shouldn't have constant folded, but folded to: " << v; + << "Instruction\n\n" + << " " << to_string(I) << "\n" + << "shouldn't have constant folded, but folded to: " << v; } } #define EXPECT_NOT_FOLD(i) EXPECT_PRED_FORMAT1(not_constant_fold_pred, i) - class ConstantFold : public testing::Test { -protected: - ConstantFold() { + protected: + ConstantFold() + { mem_ctx = ralloc_context(NULL); b = bit_builder(mem_ctx); } - ~ConstantFold() { + ~ConstantFold() + { ralloc_free(mem_ctx); } @@ -101,9 +100,7 @@ TEST_F(ConstantFold, Swizzles) { bi_index reg = bi_register(0); - EXPECT_FOLD( - bi_swz_v2i16_to(b, reg, bi_imm_u32(0xCAFEBABE)), - 0xCAFEBABE); + EXPECT_FOLD(bi_swz_v2i16_to(b, reg, bi_imm_u32(0xCAFEBABE)), 0xCAFEBABE); EXPECT_FOLD( bi_swz_v2i16_to(b, reg, bi_swz_16(bi_imm_u32(0xCAFEBABE), false, false)), @@ -123,18 +120,17 @@ TEST_F(ConstantFold, VectorConstructions2i16) bi_index reg = bi_register(0); EXPECT_FOLD( - bi_mkvec_v2i16_to(b, reg, bi_imm_u16(0xCAFE), - bi_imm_u16(0xBABE)), + bi_mkvec_v2i16_to(b, reg, bi_imm_u16(0xCAFE), bi_imm_u16(0xBABE)), 0xBABECAFE); EXPECT_FOLD( bi_mkvec_v2i16_to(b, reg, bi_swz_16(bi_imm_u32(0xCAFEBABE), true, true), - bi_imm_u16(0xBABE)), + bi_imm_u16(0xBABE)), 0xBABECAFE); EXPECT_FOLD( bi_mkvec_v2i16_to(b, reg, bi_swz_16(bi_imm_u32(0xCAFEBABE), true, true), - bi_swz_16(bi_imm_u32(0xCAFEBABE), false, false)), + bi_swz_16(bi_imm_u32(0xCAFEBABE), false, false)), 0xBABECAFE); } @@ -173,17 +169,18 @@ TEST_F(ConstantFold, LimitedShiftsForTexturing) { bi_index reg = bi_register(0); - EXPECT_FOLD( - bi_lshift_or_i32_to(b, reg, bi_imm_u32(0xCAFE), bi_imm_u32(0xA0000), bi_imm_u8(4)), - (0xCAFE << 4) | 0xA0000); + EXPECT_FOLD(bi_lshift_or_i32_to(b, reg, bi_imm_u32(0xCAFE), + bi_imm_u32(0xA0000), bi_imm_u8(4)), + (0xCAFE << 4) | 0xA0000); - EXPECT_NOT_FOLD( - 
bi_lshift_or_i32_to(b, reg, bi_imm_u32(0xCAFE), bi_not(bi_imm_u32(0xA0000)), bi_imm_u8(4))); + EXPECT_NOT_FOLD(bi_lshift_or_i32_to( + b, reg, bi_imm_u32(0xCAFE), bi_not(bi_imm_u32(0xA0000)), bi_imm_u8(4))); - EXPECT_NOT_FOLD( - bi_lshift_or_i32_to(b, reg, bi_not(bi_imm_u32(0xCAFE)), bi_imm_u32(0xA0000), bi_imm_u8(4))); + EXPECT_NOT_FOLD(bi_lshift_or_i32_to(b, reg, bi_not(bi_imm_u32(0xCAFE)), + bi_imm_u32(0xA0000), bi_imm_u8(4))); - bi_instr *I = bi_lshift_or_i32_to(b, reg, bi_imm_u32(0xCAFE), bi_imm_u32(0xA0000), bi_imm_u8(4)); + bi_instr *I = bi_lshift_or_i32_to(b, reg, bi_imm_u32(0xCAFE), + bi_imm_u32(0xA0000), bi_imm_u8(4)); I->not_result = true; EXPECT_NOT_FOLD(I); } @@ -193,9 +190,12 @@ TEST_F(ConstantFold, NonConstantSourcesCannotBeFolded) bi_index reg = bi_register(0); EXPECT_NOT_FOLD(bi_swz_v2i16_to(b, reg, bi_temp(b->shader))); - EXPECT_NOT_FOLD(bi_mkvec_v2i16_to(b, reg, bi_temp(b->shader), bi_temp(b->shader))); - EXPECT_NOT_FOLD(bi_mkvec_v2i16_to(b, reg, bi_temp(b->shader), bi_imm_u32(0xDEADBEEF))); - EXPECT_NOT_FOLD(bi_mkvec_v2i16_to(b, reg, bi_imm_u32(0xDEADBEEF), bi_temp(b->shader))); + EXPECT_NOT_FOLD( + bi_mkvec_v2i16_to(b, reg, bi_temp(b->shader), bi_temp(b->shader))); + EXPECT_NOT_FOLD( + bi_mkvec_v2i16_to(b, reg, bi_temp(b->shader), bi_imm_u32(0xDEADBEEF))); + EXPECT_NOT_FOLD( + bi_mkvec_v2i16_to(b, reg, bi_imm_u32(0xDEADBEEF), bi_temp(b->shader))); } TEST_F(ConstantFold, OtherOperationsShouldNotFold) diff --git a/src/panfrost/bifrost/test/test-dual-texture.cpp b/src/panfrost/bifrost/test/test-dual-texture.cpp index aa364aa5bcf..25f22e02889 100644 --- a/src/panfrost/bifrost/test/test-dual-texture.cpp +++ b/src/panfrost/bifrost/test/test-dual-texture.cpp @@ -21,55 +21,57 @@ * SOFTWARE. */ -#include "compiler.h" -#include "bi_test.h" #include "bi_builder.h" +#include "bi_test.h" +#include "compiler.h" #include -#define CASE(shader_stage, instr, expected) do { \ - bi_builder *A = bit_builder(mem_ctx); \ - bi_builder *B = bit_builder(mem_ctx); \ - { \ - bi_builder *b = A; \ - bi_index u = bi_temp(b->shader); \ - bi_index v = bi_temp(b->shader); \ - A->shader->stage = MESA_SHADER_ ## shader_stage; \ - instr; \ - } \ - { \ - bi_builder *b = B; \ - bi_index u = bi_temp(b->shader); \ - bi_index v = bi_temp(b->shader); \ - B->shader->stage = MESA_SHADER_ ## shader_stage; \ - expected; \ - } \ - bi_opt_fuse_dual_texture(A->shader); \ - if (!bit_shader_equal(A->shader, B->shader)) { \ - ADD_FAILURE(); \ - fprintf(stderr, "Optimization produce unexpected result"); \ - fprintf(stderr, " Actual:\n"); \ - bi_print_shader(A->shader, stderr); \ - fprintf(stderr, "Expected:\n"); \ - bi_print_shader(B->shader, stderr); \ - fprintf(stderr, "\n"); \ - } \ -} while(0) +#define CASE(shader_stage, instr, expected) \ + do { \ + bi_builder *A = bit_builder(mem_ctx); \ + bi_builder *B = bit_builder(mem_ctx); \ + { \ + bi_builder *b = A; \ + bi_index u = bi_temp(b->shader); \ + bi_index v = bi_temp(b->shader); \ + A->shader->stage = MESA_SHADER_##shader_stage; \ + instr; \ + } \ + { \ + bi_builder *b = B; \ + bi_index u = bi_temp(b->shader); \ + bi_index v = bi_temp(b->shader); \ + B->shader->stage = MESA_SHADER_##shader_stage; \ + expected; \ + } \ + bi_opt_fuse_dual_texture(A->shader); \ + if (!bit_shader_equal(A->shader, B->shader)) { \ + ADD_FAILURE(); \ + fprintf(stderr, "Optimization produce unexpected result"); \ + fprintf(stderr, " Actual:\n"); \ + bi_print_shader(A->shader, stderr); \ + fprintf(stderr, "Expected:\n"); \ + bi_print_shader(B->shader, stderr); \ + fprintf(stderr, "\n"); \ + } 
\ + } while (0) #define NEGCASE(stage, instr) CASE(stage, instr, instr) class DualTexture : public testing::Test { -protected: - DualTexture() { + protected: + DualTexture() + { mem_ctx = ralloc_context(NULL); - reg = bi_register(0); - x = bi_register(4); - y = bi_register(8); - + reg = bi_register(0); + x = bi_register(4); + y = bi_register(8); } - ~DualTexture() { + ~DualTexture() + { ralloc_free(mem_ctx); } @@ -78,134 +80,165 @@ protected: bi_index reg, x, y; }; - TEST_F(DualTexture, FuseDualTexFragment) { - CASE(FRAGMENT, { + CASE( + FRAGMENT, + { bi_texs_2d_f32_to(b, x, u, v, false, 0, 0); bi_texs_2d_f32_to(b, y, u, v, false, 1, 1); - }, { - bi_texc_dual_to(b, x, y, bi_null(), u, v, bi_imm_u32(0xF9F00144), false, 4, 4); - }); + }, + { + bi_texc_dual_to(b, x, y, bi_null(), u, v, bi_imm_u32(0xF9F00144), + false, 4, 4); + }); } TEST_F(DualTexture, FuseDualTexKernel) { - CASE(KERNEL, { + CASE( + KERNEL, + { bi_texs_2d_f32_to(b, x, u, v, true, 0, 0); bi_texs_2d_f32_to(b, y, u, v, true, 1, 1); - }, { - bi_texc_dual_to(b, x, y, bi_null(), u, v, bi_imm_u32(0xF9F00144), true, 4, 4); - }); + }, + { + bi_texc_dual_to(b, x, y, bi_null(), u, v, bi_imm_u32(0xF9F00144), true, + 4, 4); + }); } TEST_F(DualTexture, FuseDualTexVertex) { - CASE(VERTEX, { + CASE( + VERTEX, + { bi_texs_2d_f32_to(b, x, u, v, true, 0, 0); bi_texs_2d_f32_to(b, y, u, v, true, 1, 1); - }, { - bi_texc_dual_to(b, x, y, bi_null(), u, v, bi_imm_u32(0xF9F00144), true, 4, 4); - }); + }, + { + bi_texc_dual_to(b, x, y, bi_null(), u, v, bi_imm_u32(0xF9F00144), true, + 4, 4); + }); } TEST_F(DualTexture, DontFuseDualTexWrongStage) { NEGCASE(FRAGMENT, { - bi_texs_2d_f32_to(b, x, u, v, true, 0, 0); - bi_texs_2d_f32_to(b, y, u, v, true, 1, 1); + bi_texs_2d_f32_to(b, x, u, v, true, 0, 0); + bi_texs_2d_f32_to(b, y, u, v, true, 1, 1); }); NEGCASE(KERNEL, { - bi_texs_2d_f32_to(b, x, u, v, false, 0, 0); - bi_texs_2d_f32_to(b, y, u, v, false, 1, 1); + bi_texs_2d_f32_to(b, x, u, v, false, 0, 0); + bi_texs_2d_f32_to(b, y, u, v, false, 1, 1); }); NEGCASE(VERTEX, { - bi_texs_2d_f32_to(b, x, u, v, false, 0, 0); - bi_texs_2d_f32_to(b, y, u, v, false, 1, 1); + bi_texs_2d_f32_to(b, x, u, v, false, 0, 0); + bi_texs_2d_f32_to(b, y, u, v, false, 1, 1); }); } TEST_F(DualTexture, FuseDualTexMaximumIndex) { - CASE(FRAGMENT, { + CASE( + FRAGMENT, + { bi_texs_2d_f32_to(b, x, u, v, false, 2, 2); bi_texs_2d_f32_to(b, y, u, v, false, 3, 3); - }, { - bi_texc_dual_to(b, x, y, bi_null(), u, v, bi_imm_u32(0xF9F003E6), false, 4, 4); - }); + }, + { + bi_texc_dual_to(b, x, y, bi_null(), u, v, bi_imm_u32(0xF9F003E6), + false, 4, 4); + }); } TEST_F(DualTexture, FuseDualTexMixedIndex) { - CASE(FRAGMENT, { + CASE( + FRAGMENT, + { bi_texs_2d_f32_to(b, x, u, v, false, 3, 2); bi_texs_2d_f32_to(b, y, u, v, false, 2, 3); - }, { - bi_texc_dual_to(b, x, y, bi_null(), u, v, bi_imm_u32(0xF9F003A7), false, 4, 4); - }); + }, + { + bi_texc_dual_to(b, x, y, bi_null(), u, v, bi_imm_u32(0xF9F003A7), + false, 4, 4); + }); } TEST_F(DualTexture, DontFuseDualTexOutOfBounds) { NEGCASE(FRAGMENT, { - bi_texs_2d_f32_to(b, x, u, v, false, 4, 0); - bi_texs_2d_f32_to(b, y, u, v, false, 1, 1); + bi_texs_2d_f32_to(b, x, u, v, false, 4, 0); + bi_texs_2d_f32_to(b, y, u, v, false, 1, 1); }); NEGCASE(FRAGMENT, { - bi_texs_2d_f32_to(b, x, u, v, false, 0, 4); - bi_texs_2d_f32_to(b, y, u, v, false, 1, 1); + bi_texs_2d_f32_to(b, x, u, v, false, 0, 4); + bi_texs_2d_f32_to(b, y, u, v, false, 1, 1); }); NEGCASE(FRAGMENT, { - bi_texs_2d_f32_to(b, x, u, v, false, 0, 0); - bi_texs_2d_f32_to(b, y, u, v, false, 4, 1); + 
bi_texs_2d_f32_to(b, x, u, v, false, 0, 0); + bi_texs_2d_f32_to(b, y, u, v, false, 4, 1); }); NEGCASE(FRAGMENT, { - bi_texs_2d_f32_to(b, x, u, v, false, 0, 0); - bi_texs_2d_f32_to(b, y, u, v, false, 1, 4); + bi_texs_2d_f32_to(b, x, u, v, false, 0, 0); + bi_texs_2d_f32_to(b, y, u, v, false, 1, 4); }); } TEST_F(DualTexture, FuseDualTexFP16) { - CASE(FRAGMENT, { + CASE( + FRAGMENT, + { bi_texs_2d_f16_to(b, x, u, v, false, 0, 0); bi_texs_2d_f16_to(b, y, u, v, false, 1, 1); - }, { - bi_texc_dual_to(b, x, y, bi_null(), u, v, bi_imm_u32(0xF1E00144), false, 2, 2); - }); + }, + { + bi_texc_dual_to(b, x, y, bi_null(), u, v, bi_imm_u32(0xF1E00144), + false, 2, 2); + }); } TEST_F(DualTexture, FuseDualTexMixedSize) { - CASE(FRAGMENT, { + CASE( + FRAGMENT, + { bi_texs_2d_f32_to(b, x, u, v, false, 0, 0); bi_texs_2d_f16_to(b, y, u, v, false, 1, 1); - }, { - bi_texc_dual_to(b, x, y, bi_null(), u, v, bi_imm_u32(0XF9E00144), false, 4, 2); - }); + }, + { + bi_texc_dual_to(b, x, y, bi_null(), u, v, bi_imm_u32(0XF9E00144), + false, 4, 2); + }); - CASE(FRAGMENT, { + CASE( + FRAGMENT, + { bi_texs_2d_f16_to(b, x, u, v, false, 0, 0); bi_texs_2d_f32_to(b, y, u, v, false, 1, 1); - }, { - bi_texc_dual_to(b, x, y, bi_null(), u, v, bi_imm_u32(0xF1F00144), false, 2, 4); - }); + }, + { + bi_texc_dual_to(b, x, y, bi_null(), u, v, bi_imm_u32(0xF1F00144), + false, 2, 4); + }); } TEST_F(DualTexture, DontFuseMixedCoordinates) { NEGCASE(FRAGMENT, { - bi_texs_2d_f32_to(b, x, bi_neg(u), v, false, 0, 0); - bi_texs_2d_f32_to(b, y, u, v, false, 1, 1); + bi_texs_2d_f32_to(b, x, bi_neg(u), v, false, 0, 0); + bi_texs_2d_f32_to(b, y, u, v, false, 1, 1); }); NEGCASE(FRAGMENT, { - bi_texs_2d_f32_to(b, x, u, v, false, 0, 0); - bi_texs_2d_f32_to(b, y, v, u, false, 1, 1); + bi_texs_2d_f32_to(b, x, u, v, false, 0, 0); + bi_texs_2d_f32_to(b, y, v, u, false, 1, 1); }); } diff --git a/src/panfrost/bifrost/test/test-lower-swizzle.cpp b/src/panfrost/bifrost/test/test-lower-swizzle.cpp index af36dfc1de0..a6a35554974 100644 --- a/src/panfrost/bifrost/test/test-lower-swizzle.cpp +++ b/src/panfrost/bifrost/test/test-lower-swizzle.cpp @@ -21,31 +21,34 @@ * SOFTWARE. 
*/ -#include "compiler.h" -#include "bi_test.h" #include "bi_builder.h" +#include "bi_test.h" +#include "compiler.h" #include -#define CASE(instr, expected) INSTRUCTION_CASE(instr, expected, bi_lower_swizzle) +#define CASE(instr, expected) \ + INSTRUCTION_CASE(instr, expected, bi_lower_swizzle) #define NEGCASE(instr) CASE(instr, instr) class LowerSwizzle : public testing::Test { -protected: - LowerSwizzle() { + protected: + LowerSwizzle() + { mem_ctx = ralloc_context(NULL); - reg = bi_register(0); - x = bi_register(1); - y = bi_register(2); - z = bi_register(3); - w = bi_register(4); + reg = bi_register(0); + x = bi_register(1); + y = bi_register(2); + z = bi_register(3); + w = bi_register(4); - x3210 = x; + x3210 = x; x3210.swizzle = BI_SWIZZLE_B3210; } - ~LowerSwizzle() { + ~LowerSwizzle() + { ralloc_free(mem_ctx); } @@ -58,7 +61,8 @@ protected: TEST_F(LowerSwizzle, Csel16) { CASE(bi_csel_v2f16_to(b, reg, bi_half(x, 0), y, z, w, BI_CMPF_NE), - bi_csel_v2f16_to(b, reg, bi_swz_v2i16(b, bi_half(x, 0)), y, z, w, BI_CMPF_NE)); + bi_csel_v2f16_to(b, reg, bi_swz_v2i16(b, bi_half(x, 0)), y, z, w, + BI_CMPF_NE)); } TEST_F(LowerSwizzle, Fma16) @@ -79,23 +83,22 @@ TEST_F(LowerSwizzle, ClzHadd8) TEST_F(LowerSwizzle, FirstShift8) { enum bi_opcode ops[] = { - BI_OPCODE_LSHIFT_AND_V4I8, - BI_OPCODE_LSHIFT_OR_V4I8, - BI_OPCODE_LSHIFT_XOR_V4I8, - BI_OPCODE_RSHIFT_AND_V4I8, - BI_OPCODE_RSHIFT_OR_V4I8, - BI_OPCODE_RSHIFT_XOR_V4I8, + BI_OPCODE_LSHIFT_AND_V4I8, BI_OPCODE_LSHIFT_OR_V4I8, + BI_OPCODE_LSHIFT_XOR_V4I8, BI_OPCODE_RSHIFT_AND_V4I8, + BI_OPCODE_RSHIFT_OR_V4I8, BI_OPCODE_RSHIFT_XOR_V4I8, }; for (unsigned i = 0; i < ARRAY_SIZE(ops); ++i) { - CASE({ + CASE( + { bi_instr *I = bi_lshift_and_v4i8_to(b, reg, x3210, y, z); I->op = ops[i]; - }, - { - bi_instr *I = bi_lshift_and_v4i8_to(b, reg, bi_swz_v4i8(b, x3210), y, z); + }, + { + bi_instr *I = + bi_lshift_and_v4i8_to(b, reg, bi_swz_v4i8(b, x3210), y, z); I->op = ops[i]; - }); + }); } } diff --git a/src/panfrost/bifrost/test/test-message-preload.cpp b/src/panfrost/bifrost/test/test-message-preload.cpp index f1f00413d00..d5e548d54f8 100644 --- a/src/panfrost/bifrost/test/test-message-preload.cpp +++ b/src/panfrost/bifrost/test/test-message-preload.cpp @@ -21,56 +21,58 @@ * SOFTWARE. 
*/ -#include "compiler.h" -#include "bi_test.h" #include "bi_builder.h" +#include "bi_test.h" +#include "compiler.h" #include -#define CASE(instr, expected) do { \ - bi_builder *A = bit_builder(mem_ctx); \ - bi_builder *B = bit_builder(mem_ctx); \ - A->shader->info.bifrost = rzalloc(mem_ctx, struct bifrost_shader_info); \ - B->shader->info.bifrost = rzalloc(mem_ctx, struct bifrost_shader_info); \ - { \ - bi_builder *b = A; \ - bi_index u = bi_temp(b->shader); \ - UNUSED bi_index v = bi_temp(b->shader); \ - UNUSED bi_index w = bi_temp(b->shader); \ - instr; \ - } \ - { \ - bi_builder *b = B; \ - bi_index u = bi_temp(b->shader); \ - UNUSED bi_index v = bi_temp(b->shader); \ - UNUSED bi_index w = bi_temp(b->shader); \ - expected; \ - } \ - bi_opt_message_preload(A->shader); \ - if (!bit_shader_equal(A->shader, B->shader)) { \ - ADD_FAILURE(); \ - fprintf(stderr, "Optimization produce unexpected result"); \ - fprintf(stderr, " Actual:\n"); \ - bi_print_shader(A->shader, stderr); \ - fprintf(stderr, "Expected:\n"); \ - bi_print_shader(B->shader, stderr); \ - fprintf(stderr, "\n"); \ - } \ -} while(0) +#define CASE(instr, expected) \ + do { \ + bi_builder *A = bit_builder(mem_ctx); \ + bi_builder *B = bit_builder(mem_ctx); \ + A->shader->info.bifrost = rzalloc(mem_ctx, struct bifrost_shader_info); \ + B->shader->info.bifrost = rzalloc(mem_ctx, struct bifrost_shader_info); \ + { \ + bi_builder *b = A; \ + bi_index u = bi_temp(b->shader); \ + UNUSED bi_index v = bi_temp(b->shader); \ + UNUSED bi_index w = bi_temp(b->shader); \ + instr; \ + } \ + { \ + bi_builder *b = B; \ + bi_index u = bi_temp(b->shader); \ + UNUSED bi_index v = bi_temp(b->shader); \ + UNUSED bi_index w = bi_temp(b->shader); \ + expected; \ + } \ + bi_opt_message_preload(A->shader); \ + if (!bit_shader_equal(A->shader, B->shader)) { \ + ADD_FAILURE(); \ + fprintf(stderr, "Optimization produce unexpected result"); \ + fprintf(stderr, " Actual:\n"); \ + bi_print_shader(A->shader, stderr); \ + fprintf(stderr, "Expected:\n"); \ + bi_print_shader(B->shader, stderr); \ + fprintf(stderr, "\n"); \ + } \ + } while (0) #define NEGCASE(instr) CASE(instr, instr) class MessagePreload : public testing::Test { -protected: - MessagePreload() { + protected: + MessagePreload() + { mem_ctx = ralloc_context(NULL); - x = bi_register(16); - y = bi_register(32); - + x = bi_register(16); + y = bi_register(32); } - ~MessagePreload() { + ~MessagePreload() + { ralloc_free(mem_ctx); } @@ -84,100 +86,117 @@ protected: b->cursor = bi_before_block(bi_start_block(&b->shader->blocks)); bi_foreach_src(I, i) - I->src[i] = bi_mov_i32(b, bi_register(idx*4 + i)); + I->src[i] = bi_mov_i32(b, bi_register(idx * 4 + i)); b->cursor = bi_after_instr(I); } }; - TEST_F(MessagePreload, PreloadLdVarSample) { - CASE({ + CASE( + { bi_ld_var_imm_to(b, u, bi_register(61), BI_REGISTER_FORMAT_F32, BI_SAMPLE_SAMPLE, BI_UPDATE_STORE, BI_VECSIZE_V4, 0); - }, { - preload_moves(b, u, 4, 0); - }); + }, + { preload_moves(b, u, 4, 0); }); } TEST_F(MessagePreload, PreloadLdVarLdVar) { - CASE({ + CASE( + { bi_ld_var_imm_to(b, u, bi_register(61), BI_REGISTER_FORMAT_F32, BI_SAMPLE_SAMPLE, BI_UPDATE_STORE, BI_VECSIZE_V4, 2); bi_ld_var_imm_to(b, v, bi_register(61), BI_REGISTER_FORMAT_F32, BI_SAMPLE_SAMPLE, BI_UPDATE_STORE, BI_VECSIZE_V4, 1); - }, { + }, + { preload_moves(b, u, 4, 0); preload_moves(b, v, 4, 1); - }); + }); } TEST_F(MessagePreload, MaxTwoMessages) { - CASE({ + CASE( + { bi_ld_var_imm_to(b, u, bi_register(61), BI_REGISTER_FORMAT_F32, BI_SAMPLE_SAMPLE, BI_UPDATE_STORE, BI_VECSIZE_V4, 
2); bi_ld_var_imm_to(b, v, bi_register(61), BI_REGISTER_FORMAT_F32, BI_SAMPLE_SAMPLE, BI_UPDATE_STORE, BI_VECSIZE_V4, 1); bi_ld_var_imm_to(b, w, bi_register(61), BI_REGISTER_FORMAT_F32, BI_SAMPLE_SAMPLE, BI_UPDATE_STORE, BI_VECSIZE_V4, 0); - }, - { + }, + { preload_moves(b, u, 4, 0); preload_moves(b, v, 4, 1); bi_ld_var_imm_to(b, w, bi_register(61), BI_REGISTER_FORMAT_F32, BI_SAMPLE_SAMPLE, BI_UPDATE_STORE, BI_VECSIZE_V4, 0); - }); + }); - CASE({ - bi_var_tex_f32_to(b, u, false, BI_SAMPLE_CENTER, BI_UPDATE_STORE, 0, 0); - bi_var_tex_f16_to(b, v, false, BI_SAMPLE_CENTER, BI_UPDATE_STORE, 1, 2); - bi_var_tex_f16_to(b, w, false, BI_SAMPLE_CENTER, BI_UPDATE_STORE, 3, 3); - }, { + CASE( + { + bi_var_tex_f32_to(b, u, false, BI_SAMPLE_CENTER, BI_UPDATE_STORE, 0, + 0); + bi_var_tex_f16_to(b, v, false, BI_SAMPLE_CENTER, BI_UPDATE_STORE, 1, + 2); + bi_var_tex_f16_to(b, w, false, BI_SAMPLE_CENTER, BI_UPDATE_STORE, 3, + 3); + }, + { preload_moves(b, u, 4, 0); preload_moves(b, v, 2, 1); - bi_var_tex_f16_to(b, w, false, BI_SAMPLE_CENTER, BI_UPDATE_STORE, 3, 3); - }); + bi_var_tex_f16_to(b, w, false, BI_SAMPLE_CENTER, BI_UPDATE_STORE, 3, + 3); + }); } TEST_F(MessagePreload, PreloadVartexF16) { - CASE({ - bi_var_tex_f16_to(b, u, false, BI_SAMPLE_CENTER, BI_UPDATE_STORE, 0, 0); - }, { - preload_moves(b, u, 2, 0); - }); + CASE( + { + bi_var_tex_f16_to(b, u, false, BI_SAMPLE_CENTER, BI_UPDATE_STORE, 0, + 0); + }, + { preload_moves(b, u, 2, 0); }); } TEST_F(MessagePreload, PreloadVartexF32) { - CASE({ - bi_var_tex_f32_to(b, u, false, BI_SAMPLE_CENTER, BI_UPDATE_STORE, 0, 0); - }, { - preload_moves(b, u, 4, 0); - }); + CASE( + { + bi_var_tex_f32_to(b, u, false, BI_SAMPLE_CENTER, BI_UPDATE_STORE, 0, + 0); + }, + { preload_moves(b, u, 4, 0); }); } TEST_F(MessagePreload, PreloadVartexF32VartexF16) { - CASE({ - bi_var_tex_f32_to(b, u, false, BI_SAMPLE_CENTER, BI_UPDATE_STORE, 0, 0); - bi_var_tex_f16_to(b, v, false, BI_SAMPLE_CENTER, BI_UPDATE_STORE, 1, 2); - }, { + CASE( + { + bi_var_tex_f32_to(b, u, false, BI_SAMPLE_CENTER, BI_UPDATE_STORE, 0, + 0); + bi_var_tex_f16_to(b, v, false, BI_SAMPLE_CENTER, BI_UPDATE_STORE, 1, + 2); + }, + { preload_moves(b, u, 4, 0); preload_moves(b, v, 2, 1); - }); + }); } TEST_F(MessagePreload, PreloadVartexLodModes) { - CASE({ + CASE( + { bi_var_tex_f32_to(b, u, true, BI_SAMPLE_CENTER, BI_UPDATE_STORE, 0, 0); - bi_var_tex_f32_to(b, v, false, BI_SAMPLE_CENTER, BI_UPDATE_STORE, 0, 0); - }, { + bi_var_tex_f32_to(b, v, false, BI_SAMPLE_CENTER, BI_UPDATE_STORE, 0, + 0); + }, + { preload_moves(b, u, 4, 0); preload_moves(b, v, 4, 1); - }); + }); } diff --git a/src/panfrost/bifrost/test/test-optimizer.cpp b/src/panfrost/bifrost/test/test-optimizer.cpp index 73be5367159..c10b9367e38 100644 --- a/src/panfrost/bifrost/test/test-optimizer.cpp +++ b/src/panfrost/bifrost/test/test-optimizer.cpp @@ -21,9 +21,9 @@ * SOFTWARE. */ -#include "compiler.h" -#include "bi_test.h" #include "bi_builder.h" +#include "bi_test.h" +#include "compiler.h" #include @@ -38,24 +38,35 @@ bi_optimizer(bi_context *ctx) /* Define reg first so it has a consistent variable index, and pass it to an * instruction that cannot be dead code eliminated so the program is nontrivial. 
*/ -#define CASE(instr, expected) INSTRUCTION_CASE(\ - { UNUSED bi_index reg = bi_temp(b->shader); instr; bi_kaboom(b, reg); }, \ - { UNUSED bi_index reg = bi_temp(b->shader); expected; bi_kaboom(b, reg); }, \ +#define CASE(instr, expected) \ + INSTRUCTION_CASE( \ + { \ + UNUSED bi_index reg = bi_temp(b->shader); \ + instr; \ + bi_kaboom(b, reg); \ + }, \ + { \ + UNUSED bi_index reg = bi_temp(b->shader); \ + expected; \ + bi_kaboom(b, reg); \ + }, \ bi_optimizer); #define NEGCASE(instr) CASE(instr, instr) class Optimizer : public testing::Test { -protected: - Optimizer() { + protected: + Optimizer() + { mem_ctx = ralloc_context(NULL); - x = bi_register(1); - y = bi_register(2); + x = bi_register(1); + y = bi_register(2); negabsx = bi_neg(bi_abs(x)); } - ~Optimizer() { + ~Optimizer() + { ralloc_free(mem_ctx); } @@ -95,91 +106,124 @@ TEST_F(Optimizer, FusedFABSNEGForFP16) TEST_F(Optimizer, FuseFADD_F32WithEqualSourcesAbsAbsAndClamp) { - CASE({ - bi_instr *I = bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_abs(x)), bi_abs(x)); + CASE( + { + bi_instr *I = + bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_abs(x)), bi_abs(x)); I->clamp = BI_CLAMP_CLAMP_0_1; - }, { + }, + { bi_instr *I = bi_fadd_f32_to(b, reg, bi_abs(x), bi_abs(x)); I->clamp = BI_CLAMP_CLAMP_0_1; - }); + }); - CASE({ - bi_instr *I = bi_fadd_f32_to(b, reg, bi_abs(x), bi_fabsneg_f32(b, bi_abs(x))); + CASE( + { + bi_instr *I = + bi_fadd_f32_to(b, reg, bi_abs(x), bi_fabsneg_f32(b, bi_abs(x))); I->clamp = BI_CLAMP_CLAMP_0_1; - }, { + }, + { bi_instr *I = bi_fadd_f32_to(b, reg, bi_abs(x), bi_abs(x)); I->clamp = BI_CLAMP_CLAMP_0_1; - }); + }); - CASE({ - bi_instr *I = bi_fclamp_f32_to(b, reg, bi_fadd_f32(b, bi_abs(x), bi_abs(x))); + CASE( + { + bi_instr *I = + bi_fclamp_f32_to(b, reg, bi_fadd_f32(b, bi_abs(x), bi_abs(x))); I->clamp = BI_CLAMP_CLAMP_0_INF; - }, { + }, + { bi_instr *I = bi_fadd_f32_to(b, reg, bi_abs(x), bi_abs(x)); I->clamp = BI_CLAMP_CLAMP_0_INF; - }); + }); } TEST_F(Optimizer, FuseFADD_V2F16WithDifferentSourcesAbsAbsAndClamp) { - CASE({ - bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_fabsneg_v2f16(b, bi_abs(x)), bi_abs(y)); + CASE( + { + bi_instr *I = + bi_fadd_v2f16_to(b, reg, bi_fabsneg_v2f16(b, bi_abs(x)), bi_abs(y)); I->clamp = BI_CLAMP_CLAMP_0_1; - }, { + }, + { bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_abs(y)); I->clamp = BI_CLAMP_CLAMP_0_1; - }); + }); - CASE({ - bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_fabsneg_v2f16(b, bi_abs(y))); + CASE( + { + bi_instr *I = + bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_fabsneg_v2f16(b, bi_abs(y))); I->clamp = BI_CLAMP_CLAMP_0_1; - }, { + }, + { bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_abs(y)); I->clamp = BI_CLAMP_CLAMP_0_1; - }); + }); - CASE({ - bi_instr *I = bi_fclamp_v2f16_to(b, reg, bi_fadd_v2f16(b, bi_abs(x), bi_abs(y))); + CASE( + { + bi_instr *I = + bi_fclamp_v2f16_to(b, reg, bi_fadd_v2f16(b, bi_abs(x), bi_abs(y))); I->clamp = BI_CLAMP_CLAMP_0_INF; - }, { + }, + { bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_abs(y)); I->clamp = BI_CLAMP_CLAMP_0_INF; - }); + }); } TEST_F(Optimizer, AvoidFADD_V2F16WithEqualSourcesAbsAbsAndClamp) { NEGCASE({ - bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_fabsneg_v2f16(b, bi_abs(x)), bi_abs(x)); - I->clamp = BI_CLAMP_CLAMP_0_1; + bi_instr *I = + bi_fadd_v2f16_to(b, reg, bi_fabsneg_v2f16(b, bi_abs(x)), bi_abs(x)); + I->clamp = BI_CLAMP_CLAMP_0_1; }); NEGCASE({ - bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_fabsneg_v2f16(b, bi_abs(x))); - I->clamp = BI_CLAMP_CLAMP_0_1; + bi_instr *I = + bi_fadd_v2f16_to(b, reg, 
bi_abs(x), bi_fabsneg_v2f16(b, bi_abs(x))); + I->clamp = BI_CLAMP_CLAMP_0_1; }); NEGCASE({ - bi_instr *I = bi_fclamp_v2f16_to(b, reg, bi_fadd_v2f16(b, bi_abs(x), bi_abs(x))); + bi_instr *I = + bi_fclamp_v2f16_to(b, reg, bi_fadd_v2f16(b, bi_abs(x), bi_abs(x))); I->clamp = BI_CLAMP_CLAMP_0_INF; }); } TEST_F(Optimizer, SwizzlesComposedForFP16) { - CASE(bi_fadd_v2f16_to(b, reg, bi_fabsneg_v2f16(b, bi_swz_16(negabsx, true, false)), y), + CASE(bi_fadd_v2f16_to( + b, reg, bi_fabsneg_v2f16(b, bi_swz_16(negabsx, true, false)), y), bi_fadd_v2f16_to(b, reg, bi_swz_16(negabsx, true, false), y)); - CASE(bi_fadd_v2f16_to(b, reg, bi_swz_16(bi_fabsneg_v2f16(b, negabsx), true, false), y), + CASE(bi_fadd_v2f16_to( + b, reg, bi_swz_16(bi_fabsneg_v2f16(b, negabsx), true, false), y), bi_fadd_v2f16_to(b, reg, bi_swz_16(negabsx, true, false), y)); - CASE(bi_fadd_v2f16_to(b, reg, bi_swz_16(bi_fabsneg_v2f16(b, bi_swz_16(negabsx, true, false)), true, false), y), + CASE(bi_fadd_v2f16_to( + b, reg, + bi_swz_16(bi_fabsneg_v2f16(b, bi_swz_16(negabsx, true, false)), true, + false), + y), bi_fadd_v2f16_to(b, reg, negabsx, y)); - CASE(bi_fadd_v2f16_to(b, reg, bi_swz_16(bi_fabsneg_v2f16(b, bi_half(negabsx, false)), true, false), y), + CASE(bi_fadd_v2f16_to( + b, reg, + bi_swz_16(bi_fabsneg_v2f16(b, bi_half(negabsx, false)), true, false), + y), bi_fadd_v2f16_to(b, reg, bi_half(negabsx, false), y)); - CASE(bi_fadd_v2f16_to(b, reg, bi_swz_16(bi_fabsneg_v2f16(b, bi_half(negabsx, true)), true, false), y), + CASE(bi_fadd_v2f16_to( + b, reg, + bi_swz_16(bi_fabsneg_v2f16(b, bi_half(negabsx, true)), true, false), + y), bi_fadd_v2f16_to(b, reg, bi_half(negabsx, true), y)); } @@ -192,7 +236,8 @@ TEST_F(Optimizer, PreserveWidens) CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_half(negabsx, true)), y), bi_fadd_f32_to(b, reg, bi_half(negabsx, true), y)); - CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_half(x, true)), bi_fabsneg_f32(b, bi_half(x, false))), + CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_half(x, true)), + bi_fabsneg_f32(b, bi_half(x, false))), bi_fadd_f32_to(b, reg, bi_half(x, true), bi_half(x, false))); } @@ -219,85 +264,100 @@ TEST_F(Optimizer, AvoidZeroAndFABSNEGFootguns) TEST_F(Optimizer, ClampsPropagated) { - CASE({ - bi_instr *I = bi_fclamp_f32_to(b, reg, bi_fadd_f32(b, x, y)); - I->clamp = BI_CLAMP_CLAMP_0_INF; - }, { - bi_instr *I = bi_fadd_f32_to(b, reg, x, y); - I->clamp = BI_CLAMP_CLAMP_0_INF; - }); + CASE( + { + bi_instr *I = bi_fclamp_f32_to(b, reg, bi_fadd_f32(b, x, y)); + I->clamp = BI_CLAMP_CLAMP_0_INF; + }, + { + bi_instr *I = bi_fadd_f32_to(b, reg, x, y); + I->clamp = BI_CLAMP_CLAMP_0_INF; + }); - CASE({ - bi_instr *I = bi_fclamp_v2f16_to(b, reg, bi_fadd_v2f16(b, x, y)); - I->clamp = BI_CLAMP_CLAMP_0_1; - }, { - bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y); - I->clamp = BI_CLAMP_CLAMP_0_1; - }); + CASE( + { + bi_instr *I = bi_fclamp_v2f16_to(b, reg, bi_fadd_v2f16(b, x, y)); + I->clamp = BI_CLAMP_CLAMP_0_1; + }, + { + bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y); + I->clamp = BI_CLAMP_CLAMP_0_1; + }); } - TEST_F(Optimizer, ClampsComposed) { - CASE({ - bi_instr *I = bi_fadd_f32_to(b, bi_temp(b->shader), x, y); - bi_instr *J = bi_fclamp_f32_to(b, reg, I->dest[0]); - I->clamp = BI_CLAMP_CLAMP_M1_1; - J->clamp = BI_CLAMP_CLAMP_0_INF; - }, { - bi_instr *I = bi_fadd_f32_to(b, reg, x, y); - I->clamp = BI_CLAMP_CLAMP_0_1; - }); + CASE( + { + bi_instr *I = bi_fadd_f32_to(b, bi_temp(b->shader), x, y); + bi_instr *J = bi_fclamp_f32_to(b, reg, I->dest[0]); + I->clamp = BI_CLAMP_CLAMP_M1_1; + J->clamp = 
BI_CLAMP_CLAMP_0_INF; + }, + { + bi_instr *I = bi_fadd_f32_to(b, reg, x, y); + I->clamp = BI_CLAMP_CLAMP_0_1; + }); - CASE({ - bi_instr *I = bi_fadd_f32_to(b, bi_temp(b->shader), x, y); - bi_instr *J = bi_fclamp_f32_to(b, reg, I->dest[0]); - I->clamp = BI_CLAMP_CLAMP_0_1; - J->clamp = BI_CLAMP_CLAMP_0_INF; - }, { - bi_instr *I = bi_fadd_f32_to(b, reg, x, y); - I->clamp = BI_CLAMP_CLAMP_0_1; - }); + CASE( + { + bi_instr *I = bi_fadd_f32_to(b, bi_temp(b->shader), x, y); + bi_instr *J = bi_fclamp_f32_to(b, reg, I->dest[0]); + I->clamp = BI_CLAMP_CLAMP_0_1; + J->clamp = BI_CLAMP_CLAMP_0_INF; + }, + { + bi_instr *I = bi_fadd_f32_to(b, reg, x, y); + I->clamp = BI_CLAMP_CLAMP_0_1; + }); - CASE({ - bi_instr *I = bi_fadd_f32_to(b, bi_temp(b->shader), x, y); - bi_instr *J = bi_fclamp_f32_to(b, reg, I->dest[0]); - I->clamp = BI_CLAMP_CLAMP_0_INF; - J->clamp = BI_CLAMP_CLAMP_0_INF; - }, { - bi_instr *I = bi_fadd_f32_to(b, reg, x, y); - I->clamp = BI_CLAMP_CLAMP_0_INF; - }); + CASE( + { + bi_instr *I = bi_fadd_f32_to(b, bi_temp(b->shader), x, y); + bi_instr *J = bi_fclamp_f32_to(b, reg, I->dest[0]); + I->clamp = BI_CLAMP_CLAMP_0_INF; + J->clamp = BI_CLAMP_CLAMP_0_INF; + }, + { + bi_instr *I = bi_fadd_f32_to(b, reg, x, y); + I->clamp = BI_CLAMP_CLAMP_0_INF; + }); - CASE({ - bi_instr *I = bi_fadd_v2f16_to(b, bi_temp(b->shader), x, y); - bi_instr *J = bi_fclamp_v2f16_to(b, reg, I->dest[0]); - I->clamp = BI_CLAMP_CLAMP_M1_1; - J->clamp = BI_CLAMP_CLAMP_0_INF; - }, { - bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y); - I->clamp = BI_CLAMP_CLAMP_0_1; - }); + CASE( + { + bi_instr *I = bi_fadd_v2f16_to(b, bi_temp(b->shader), x, y); + bi_instr *J = bi_fclamp_v2f16_to(b, reg, I->dest[0]); + I->clamp = BI_CLAMP_CLAMP_M1_1; + J->clamp = BI_CLAMP_CLAMP_0_INF; + }, + { + bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y); + I->clamp = BI_CLAMP_CLAMP_0_1; + }); - CASE({ - bi_instr *I = bi_fadd_v2f16_to(b, bi_temp(b->shader), x, y); - bi_instr *J = bi_fclamp_v2f16_to(b, reg, I->dest[0]); - I->clamp = BI_CLAMP_CLAMP_0_1; - J->clamp = BI_CLAMP_CLAMP_0_INF; - }, { - bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y); - I->clamp = BI_CLAMP_CLAMP_0_1; - }); + CASE( + { + bi_instr *I = bi_fadd_v2f16_to(b, bi_temp(b->shader), x, y); + bi_instr *J = bi_fclamp_v2f16_to(b, reg, I->dest[0]); + I->clamp = BI_CLAMP_CLAMP_0_1; + J->clamp = BI_CLAMP_CLAMP_0_INF; + }, + { + bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y); + I->clamp = BI_CLAMP_CLAMP_0_1; + }); - CASE({ - bi_instr *I = bi_fadd_v2f16_to(b, bi_temp(b->shader), x, y); - bi_instr *J = bi_fclamp_v2f16_to(b, reg, I->dest[0]); - I->clamp = BI_CLAMP_CLAMP_0_INF; - J->clamp = BI_CLAMP_CLAMP_0_INF; - }, { - bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y); - I->clamp = BI_CLAMP_CLAMP_0_INF; - }); + CASE( + { + bi_instr *I = bi_fadd_v2f16_to(b, bi_temp(b->shader), x, y); + bi_instr *J = bi_fclamp_v2f16_to(b, reg, I->dest[0]); + I->clamp = BI_CLAMP_CLAMP_0_INF; + J->clamp = BI_CLAMP_CLAMP_0_INF; + }, + { + bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y); + I->clamp = BI_CLAMP_CLAMP_0_INF; + }); } TEST_F(Optimizer, DoNotMixSizesWhenClamping) @@ -341,21 +401,29 @@ TEST_F(Optimizer, FuseComparisonsWithDISCARD) bi_discard_f32(b, x, y, BI_CMPF_EQ)); for (unsigned h = 0; h < 2; ++h) { - CASE(bi_discard_b32(b, bi_half(bi_fcmp_v2f16(b, x, y, BI_CMPF_LE, BI_RESULT_TYPE_F1), h)), + CASE(bi_discard_b32( + b, bi_half(bi_fcmp_v2f16(b, x, y, BI_CMPF_LE, BI_RESULT_TYPE_F1), + h)), bi_discard_f32(b, bi_half(x, h), bi_half(y, h), BI_CMPF_LE)); - CASE(bi_discard_b32(b, bi_half(bi_fcmp_v2f16(b, x, y, BI_CMPF_NE, 
BI_RESULT_TYPE_I1), h)), + CASE(bi_discard_b32( + b, bi_half(bi_fcmp_v2f16(b, x, y, BI_CMPF_NE, BI_RESULT_TYPE_I1), + h)), bi_discard_f32(b, bi_half(x, h), bi_half(y, h), BI_CMPF_NE)); - CASE(bi_discard_b32(b, bi_half(bi_fcmp_v2f16(b, x, y, BI_CMPF_EQ, BI_RESULT_TYPE_M1), h)), + CASE(bi_discard_b32( + b, bi_half(bi_fcmp_v2f16(b, x, y, BI_CMPF_EQ, BI_RESULT_TYPE_M1), + h)), bi_discard_f32(b, bi_half(x, h), bi_half(y, h), BI_CMPF_EQ)); } } TEST_F(Optimizer, DoNotFuseSpecialComparisons) { - NEGCASE(bi_discard_b32(b, bi_fcmp_f32(b, x, y, BI_CMPF_GTLT, BI_RESULT_TYPE_F1))); - NEGCASE(bi_discard_b32(b, bi_fcmp_f32(b, x, y, BI_CMPF_TOTAL, BI_RESULT_TYPE_F1))); + NEGCASE( + bi_discard_b32(b, bi_fcmp_f32(b, x, y, BI_CMPF_GTLT, BI_RESULT_TYPE_F1))); + NEGCASE(bi_discard_b32( + b, bi_fcmp_f32(b, x, y, BI_CMPF_TOTAL, BI_RESULT_TYPE_F1))); } TEST_F(Optimizer, FuseResultType) @@ -365,25 +433,33 @@ TEST_F(Optimizer, FuseResultType) BI_MUX_INT_ZERO), bi_fcmp_f32_to(b, reg, x, y, BI_CMPF_LE, BI_RESULT_TYPE_F1)); - CASE(bi_mux_i32_to(b, reg, bi_imm_f32(0.0), bi_imm_f32(1.0), - bi_fcmp_f32(b, bi_abs(x), bi_neg(y), BI_CMPF_LE, BI_RESULT_TYPE_M1), - BI_MUX_INT_ZERO), - bi_fcmp_f32_to(b, reg, bi_abs(x), bi_neg(y), BI_CMPF_LE, BI_RESULT_TYPE_F1)); + CASE(bi_mux_i32_to( + b, reg, bi_imm_f32(0.0), bi_imm_f32(1.0), + bi_fcmp_f32(b, bi_abs(x), bi_neg(y), BI_CMPF_LE, BI_RESULT_TYPE_M1), + BI_MUX_INT_ZERO), + bi_fcmp_f32_to(b, reg, bi_abs(x), bi_neg(y), BI_CMPF_LE, + BI_RESULT_TYPE_F1)); - CASE(bi_mux_i32_to(b, reg, bi_imm_u32(0), bi_imm_u32(1), - bi_fcmp_f32(b, bi_abs(x), bi_neg(y), BI_CMPF_LE, BI_RESULT_TYPE_M1), - BI_MUX_INT_ZERO), - bi_fcmp_f32_to(b, reg, bi_abs(x), bi_neg(y), BI_CMPF_LE, BI_RESULT_TYPE_I1)); + CASE(bi_mux_i32_to( + b, reg, bi_imm_u32(0), bi_imm_u32(1), + bi_fcmp_f32(b, bi_abs(x), bi_neg(y), BI_CMPF_LE, BI_RESULT_TYPE_M1), + BI_MUX_INT_ZERO), + bi_fcmp_f32_to(b, reg, bi_abs(x), bi_neg(y), BI_CMPF_LE, + BI_RESULT_TYPE_I1)); CASE(bi_mux_v2i16_to(b, reg, bi_imm_f16(0.0), bi_imm_f16(1.0), - bi_fcmp_v2f16(b, bi_abs(x), bi_neg(y), BI_CMPF_LE, BI_RESULT_TYPE_M1), - BI_MUX_INT_ZERO), - bi_fcmp_v2f16_to(b, reg, bi_abs(x), bi_neg(y), BI_CMPF_LE, BI_RESULT_TYPE_F1)); + bi_fcmp_v2f16(b, bi_abs(x), bi_neg(y), BI_CMPF_LE, + BI_RESULT_TYPE_M1), + BI_MUX_INT_ZERO), + bi_fcmp_v2f16_to(b, reg, bi_abs(x), bi_neg(y), BI_CMPF_LE, + BI_RESULT_TYPE_F1)); CASE(bi_mux_v2i16_to(b, reg, bi_imm_u16(0), bi_imm_u16(1), - bi_fcmp_v2f16(b, bi_abs(x), bi_neg(y), BI_CMPF_LE, BI_RESULT_TYPE_M1), - BI_MUX_INT_ZERO), - bi_fcmp_v2f16_to(b, reg, bi_abs(x), bi_neg(y), BI_CMPF_LE, BI_RESULT_TYPE_I1)); + bi_fcmp_v2f16(b, bi_abs(x), bi_neg(y), BI_CMPF_LE, + BI_RESULT_TYPE_M1), + BI_MUX_INT_ZERO), + bi_fcmp_v2f16_to(b, reg, bi_abs(x), bi_neg(y), BI_CMPF_LE, + BI_RESULT_TYPE_I1)); CASE(bi_mux_i32_to(b, reg, bi_imm_u32(0), bi_imm_u32(1), bi_icmp_u32(b, x, y, BI_CMPF_LE, BI_RESULT_TYPE_M1), @@ -391,13 +467,13 @@ TEST_F(Optimizer, FuseResultType) bi_icmp_u32_to(b, reg, x, y, BI_CMPF_LE, BI_RESULT_TYPE_I1)); CASE(bi_mux_v2i16_to(b, reg, bi_imm_u16(0), bi_imm_u16(1), - bi_icmp_v2u16(b, x, y, BI_CMPF_LE, BI_RESULT_TYPE_M1), - BI_MUX_INT_ZERO), + bi_icmp_v2u16(b, x, y, BI_CMPF_LE, BI_RESULT_TYPE_M1), + BI_MUX_INT_ZERO), bi_icmp_v2u16_to(b, reg, x, y, BI_CMPF_LE, BI_RESULT_TYPE_I1)); CASE(bi_mux_v4i8_to(b, reg, bi_imm_u8(0), bi_imm_u8(1), - bi_icmp_v4u8(b, x, y, BI_CMPF_LE, BI_RESULT_TYPE_M1), - BI_MUX_INT_ZERO), + bi_icmp_v4u8(b, x, y, BI_CMPF_LE, BI_RESULT_TYPE_M1), + BI_MUX_INT_ZERO), bi_icmp_v4u8_to(b, reg, x, y, BI_CMPF_LE, BI_RESULT_TYPE_I1)); 
CASE(bi_mux_i32_to(b, reg, bi_imm_u32(0), bi_imm_u32(1), @@ -406,31 +482,36 @@ TEST_F(Optimizer, FuseResultType) bi_icmp_s32_to(b, reg, x, y, BI_CMPF_LE, BI_RESULT_TYPE_I1)); CASE(bi_mux_v2i16_to(b, reg, bi_imm_u16(0), bi_imm_u16(1), - bi_icmp_v2s16(b, x, y, BI_CMPF_LE, BI_RESULT_TYPE_M1), - BI_MUX_INT_ZERO), + bi_icmp_v2s16(b, x, y, BI_CMPF_LE, BI_RESULT_TYPE_M1), + BI_MUX_INT_ZERO), bi_icmp_v2s16_to(b, reg, x, y, BI_CMPF_LE, BI_RESULT_TYPE_I1)); CASE(bi_mux_v4i8_to(b, reg, bi_imm_u8(0), bi_imm_u8(1), - bi_icmp_v4s8(b, x, y, BI_CMPF_LE, BI_RESULT_TYPE_M1), - BI_MUX_INT_ZERO), + bi_icmp_v4s8(b, x, y, BI_CMPF_LE, BI_RESULT_TYPE_M1), + BI_MUX_INT_ZERO), bi_icmp_v4s8_to(b, reg, x, y, BI_CMPF_LE, BI_RESULT_TYPE_I1)); } TEST_F(Optimizer, DoNotFuseMixedSizeResultType) { - NEGCASE(bi_mux_i32_to(b, reg, bi_imm_f32(0.0), bi_imm_f32(1.0), - bi_fcmp_v2f16(b, bi_abs(x), bi_neg(y), BI_CMPF_LE, BI_RESULT_TYPE_M1), - BI_MUX_INT_ZERO)); + NEGCASE(bi_mux_i32_to( + b, reg, bi_imm_f32(0.0), bi_imm_f32(1.0), + bi_fcmp_v2f16(b, bi_abs(x), bi_neg(y), BI_CMPF_LE, BI_RESULT_TYPE_M1), + BI_MUX_INT_ZERO)); - NEGCASE(bi_mux_v2i16_to(b, reg, bi_imm_f16(0.0), bi_imm_f16(1.0), - bi_fcmp_f32(b, bi_abs(x), bi_neg(y), BI_CMPF_LE, BI_RESULT_TYPE_M1), - BI_MUX_INT_ZERO)); + NEGCASE(bi_mux_v2i16_to( + b, reg, bi_imm_f16(0.0), bi_imm_f16(1.0), + bi_fcmp_f32(b, bi_abs(x), bi_neg(y), BI_CMPF_LE, BI_RESULT_TYPE_M1), + BI_MUX_INT_ZERO)); } TEST_F(Optimizer, VarTexCoord32) { - CASE({ - bi_index ld = bi_ld_var_imm(b, bi_null(), BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER, BI_UPDATE_STORE, BI_VECSIZE_V2, 0); + CASE( + { + bi_index ld = + bi_ld_var_imm(b, bi_null(), BI_REGISTER_FORMAT_F32, + BI_SAMPLE_CENTER, BI_UPDATE_STORE, BI_VECSIZE_V2, 0); bi_index x = bi_temp(b->shader); bi_index y = bi_temp(b->shader); @@ -439,9 +520,11 @@ TEST_F(Optimizer, VarTexCoord32) split->dest[1] = y; bi_texs_2d_f32_to(b, reg, x, y, false, 0, 0); - }, { - bi_var_tex_f32_to(b, reg, false, BI_SAMPLE_CENTER, BI_UPDATE_STORE, 0, 0); - }); + }, + { + bi_var_tex_f32_to(b, reg, false, BI_SAMPLE_CENTER, BI_UPDATE_STORE, 0, + 0); + }); } TEST_F(Optimizer, Int8ToFloat32) @@ -458,7 +541,6 @@ TEST_F(Optimizer, Int8ToFloat32) } } - TEST_F(Optimizer, Int16ToFloat32) { for (unsigned i = 0; i < 2; ++i) { diff --git a/src/panfrost/bifrost/test/test-pack-formats.cpp b/src/panfrost/bifrost/test/test-pack-formats.cpp index 91fd474655c..f75add21197 100644 --- a/src/panfrost/bifrost/test/test-pack-formats.cpp +++ b/src/panfrost/bifrost/test/test-pack-formats.cpp @@ -21,23 +21,27 @@ * SOFTWARE. 
*/ -#include "compiler.h" #include "bi_test.h" +#include "compiler.h" #include #include "mesa-gtest-extras.h" -class PackFormats : public testing::Test -{ -protected: - PackFormats() { +class PackFormats : public testing::Test { + protected: + PackFormats() + { util_dynarray_init(&result, NULL); } - ~PackFormats() { + ~PackFormats() + { util_dynarray_fini(&result); } - const uint64_t *result_as_u64_array() { return reinterpret_cast(result.data); } + const uint64_t *result_as_u64_array() + { + return reinterpret_cast(result.data); + } struct util_dynarray result; }; @@ -46,7 +50,7 @@ TEST_F(PackFormats, 1) { /* Test case from the blob */ struct bi_packed_tuple tuples[] = { - { 0x2380cb1c02200000, 0x10e0 }, + {0x2380cb1c02200000, 0x10e0}, }; uint64_t header = 0x021000011800; @@ -65,8 +69,8 @@ TEST_F(PackFormats, 1) TEST_F(PackFormats, 2) { struct bi_packed_tuple tuples[] = { - { 0x9380cb6044000044, 0xf65 }, - { 0xaf8721a05c000081, 0x1831 }, + {0x9380cb6044000044, 0xf65}, + {0xaf8721a05c000081, 0x1831}, }; bi_pack_format(&result, 0, tuples, 2, 0x52800011800, 0, 0, false); @@ -86,9 +90,9 @@ TEST_F(PackFormats, 2) TEST_F(PackFormats, 3) { struct bi_packed_tuple tuples[] = { - { 0x93805b8040000000, 0xf65 }, - { 0x93886db05c000000, 0xf65 }, - { 0xb380cb180c000080, 0x18b1 }, + {0x93805b8040000000, 0xf65}, + {0x93886db05c000000, 0xf65}, + {0xb380cb180c000080, 0x18b1}, }; bi_pack_format(&result, 0, tuples, 3, 0x3100000000, 0, 0, true); @@ -96,12 +100,8 @@ TEST_F(PackFormats, 3) bi_pack_format(&result, 4, tuples, 3, 0x3100000000, 0, 0, true); const uint64_t expected[] = { - 0x805b804000000029, - 0x0188000000076593, - 0x886db05c00000021, - 0x58c0600004076593, - 0x0000000000000044, - 0x60002c6ce0300000, + 0x805b804000000029, 0x0188000000076593, 0x886db05c00000021, + 0x58c0600004076593, 0x0000000000000044, 0x60002c6ce0300000, }; ASSERT_EQ(result.size, 48); @@ -111,10 +111,10 @@ TEST_F(PackFormats, 3) TEST_F(PackFormats, 4) { struct bi_packed_tuple tuples[] = { - { 0xad8c87004000005f, 0x2f18 }, - { 0xad8c87385c00004f, 0x2f18 }, - { 0xad8c87385c00006e, 0x2f18 }, - { 0xb380cb182c000080, 0x18b1 }, + {0xad8c87004000005f, 0x2f18}, + {0xad8c87385c00004f, 0x2f18}, + {0xad8c87385c00006e, 0x2f18}, + {0xb380cb182c000080, 0x18b1}, }; uint64_t EC0 = (0x10000001ff000000) >> 4; @@ -124,12 +124,8 @@ TEST_F(PackFormats, 4) bi_pack_format(&result, 6, tuples, 4, 0x3100000000, EC0, 0, false); const uint64_t expected[] = { - 0x8c87004000005f2d, - 0x01880000000718ad, - 0x8c87385c00004f25, - 0x39c2e000037718ad, - 0x80cb182c00008005, - 0xac01c62b6320b1b3, + 0x8c87004000005f2d, 0x01880000000718ad, 0x8c87385c00004f25, + 0x39c2e000037718ad, 0x80cb182c00008005, 0xac01c62b6320b1b3, }; ASSERT_EQ(result.size, 48); @@ -139,11 +135,9 @@ TEST_F(PackFormats, 4) TEST_F(PackFormats, 5) { struct bi_packed_tuple tuples[] = { - { 0x9380688040000000, 0xf65 }, - { 0xd4057300c000040, 0xf26 }, - { 0x1f80cb1858000000, 0x19ab }, - { 0x937401f85c000000, 0xf65 }, - { 0xb380cb180c000080, 0x18a1 }, + {0x9380688040000000, 0xf65}, {0xd4057300c000040, 0xf26}, + {0x1f80cb1858000000, 0x19ab}, {0x937401f85c000000, 0xf65}, + {0xb380cb180c000080, 0x18a1}, }; uint64_t EC0 = (0x183f800000) >> 4; @@ -154,14 +148,9 @@ TEST_F(PackFormats, 5) bi_pack_format(&result, 8, tuples, 5, 0x3100000000, EC0, 0, true); const uint64_t expected[] = { - 0x8068804000000029, - 0x0188000000076593, - 0x4057300c00004021, - 0x58c2c0000007260d, - 0x7401f85c0000008b, - 0x00006ac7e0376593, - 0x80cb180c00008053, - 0x000000183f80a1b3, + 0x8068804000000029, 0x0188000000076593, 
0x4057300c00004021, + 0x58c2c0000007260d, 0x7401f85c0000008b, 0x00006ac7e0376593, + 0x80cb180c00008053, 0x000000183f80a1b3, }; ASSERT_EQ(result.size, 64); @@ -171,12 +160,9 @@ TEST_F(PackFormats, 5) TEST_F(PackFormats, 6) { struct bi_packed_tuple tuples[] = { - { 0xad8c870068000048, 0x2f18 }, - { 0xad8c87385c000050, 0x2f18 }, - { 0xad8c87385c00006a, 0x2f18 }, - { 0xad8c87385c000074, 0x2f18 }, - { 0xad8c87385c000020, 0x2f18 }, - { 0xad8c87385c000030, 0x2f18 }, + {0xad8c870068000048, 0x2f18}, {0xad8c87385c000050, 0x2f18}, + {0xad8c87385c00006a, 0x2f18}, {0xad8c87385c000074, 0x2f18}, + {0xad8c87385c000020, 0x2f18}, {0xad8c87385c000030, 0x2f18}, }; uint64_t EC0 = (0x345678912345670) >> 4; @@ -188,15 +174,9 @@ TEST_F(PackFormats, 6) bi_pack_format(&result, 10, tuples, 6, 0x60000011800, EC0, 0, false); const uint64_t expected[] = { - 0x8c8700680000482d, - 0x30000008c00718ad, - 0x8c87385c00005025, - 0x39c2e000035718ad, - 0x8c87385c00007401, - 0xb401c62b632718ad, - 0x8c87385c00002065, - 0x39c2e000018718ad, - 0x3456789123456706, + 0x8c8700680000482d, 0x30000008c00718ad, 0x8c87385c00005025, + 0x39c2e000035718ad, 0x8c87385c00007401, 0xb401c62b632718ad, + 0x8c87385c00002065, 0x39c2e000018718ad, 0x3456789123456706, 0xa001c62b63200000, }; @@ -207,13 +187,10 @@ TEST_F(PackFormats, 6) TEST_F(PackFormats, 7) { struct bi_packed_tuple tuples[] = { - { 0x9020074040000083, 0xf65 }, - { 0x90000d4058100080, 0xf65 }, - { 0x90000a3058700082, 0xf65 }, - { 0x9020074008114581, 0xf65 }, - { 0x90000d0058000080, 0xf65 }, - { 0x9000083058700082, 0xf65 }, - { 0x2380cb199ac38400, 0x327a }, + {0x9020074040000083, 0xf65}, {0x90000d4058100080, 0xf65}, + {0x90000a3058700082, 0xf65}, {0x9020074008114581, 0xf65}, + {0x90000d0058000080, 0xf65}, {0x9000083058700082, 0xf65}, + {0x2380cb199ac38400, 0x327a}, }; bi_pack_format(&result, 0, tuples, 7, 0x3000100000, 0, 0, true); @@ -223,15 +200,9 @@ TEST_F(PackFormats, 7) bi_pack_format(&result, 11, tuples, 7, 0x3000100000, 0, 0, true); const uint64_t expected[] = { - 0x2007404000008329, - 0x0180008000076590, - 0x000d405810008021, - 0x5182c38004176590, - 0x2007400811458101, - 0x2401d96400076590, - 0x000d005800008061, - 0x4182c38004176590, - 0x80cb199ac3840047, + 0x2007404000008329, 0x0180008000076590, 0x000d405810008021, + 0x5182c38004176590, 0x2007400811458101, 0x2401d96400076590, + 0x000d005800008061, 0x4182c38004176590, 0x80cb199ac3840047, 0x3801d96400027a23, }; @@ -242,14 +213,10 @@ TEST_F(PackFormats, 7) TEST_F(PackFormats, 8) { struct bi_packed_tuple tuples[] = { - { 0x442087037a2f8643, 0x3021 }, - { 0x84008d0586100043, 0x200 }, - { 0x7c008d0028014543, 0x0 }, - { 0x1c00070058200081, 0x1980 }, - { 0x1600dd878320400, 0x200 }, - { 0x49709c1b08308900, 0x200 }, - { 0x6c2007807881ca00, 0x40 }, - { 0x8d70fc0d94900083, 0x800 }, + {0x442087037a2f8643, 0x3021}, {0x84008d0586100043, 0x200}, + {0x7c008d0028014543, 0x0}, {0x1c00070058200081, 0x1980}, + {0x1600dd878320400, 0x200}, {0x49709c1b08308900, 0x200}, + {0x6c2007807881ca00, 0x40}, {0x8d70fc0d94900083, 0x800}, }; uint64_t EC0 = (0x32e635d0) >> 4; @@ -262,18 +229,10 @@ TEST_F(PackFormats, 8) bi_pack_format(&result, 13, tuples, 8, 0x61001311800, EC0, 0, true); const uint64_t expected[] = { - 0x2087037a2f86432e, - 0x30800988c0002144, - 0x008d058610004320, - 0x6801400a2a1a0084, - 0x0007005820008101, - 0x0c00001f0021801c, - 0x600dd87832040060, - 0xe0d8418448020001, - 0x2007807881ca00c0, - 0xc6ba80125c20406c, - 0x70fc0d9490008359, - 0x0000000032e0008d, + 0x2087037a2f86432e, 0x30800988c0002144, 0x008d058610004320, + 0x6801400a2a1a0084, 
0x0007005820008101, 0x0c00001f0021801c, + 0x600dd87832040060, 0xe0d8418448020001, 0x2007807881ca00c0, + 0xc6ba80125c20406c, 0x70fc0d9490008359, 0x0000000032e0008d, }; ASSERT_EQ(result.size, 96); diff --git a/src/panfrost/bifrost/test/test-packing.cpp b/src/panfrost/bifrost/test/test-packing.cpp index e876368b997..27cbbab26b4 100644 --- a/src/panfrost/bifrost/test/test-packing.cpp +++ b/src/panfrost/bifrost/test/test-packing.cpp @@ -39,14 +39,9 @@ TEST(Packing, PackLiteral) TEST(Packing, PackUpper) { struct bi_packed_tuple tuples[] = { - { 0, 0x3 << (75 - 64) }, - { 0, 0x1 << (75 - 64) }, - { 0, 0x7 << (75 - 64) }, - { 0, 0x0 << (75 - 64) }, - { 0, 0x2 << (75 - 64) }, - { 0, 0x6 << (75 - 64) }, - { 0, 0x5 << (75 - 64) }, - { 0, 0x4 << (75 - 64) }, + {0, 0x3 << (75 - 64)}, {0, 0x1 << (75 - 64)}, {0, 0x7 << (75 - 64)}, + {0, 0x0 << (75 - 64)}, {0, 0x2 << (75 - 64)}, {0, 0x6 << (75 - 64)}, + {0, 0x5 << (75 - 64)}, {0, 0x4 << (75 - 64)}, }; EXPECT_EQ(bi_pack_upper(U(0), tuples, 8), 3); @@ -62,9 +57,9 @@ TEST(Packing, PackUpper) TEST(Packing, PackTupleBits) { struct bi_packed_tuple tuples[] = { - { 0x1234567801234567, 0x3A }, - { 0x9876543299999999, 0x1B }, - { 0xABCDEF0101234567, 0x7C }, + {0x1234567801234567, 0x3A}, + {0x9876543299999999, 0x1B}, + {0xABCDEF0101234567, 0x7C}, }; EXPECT_EQ(bi_pack_tuple_bits(T(0), tuples, 8, 0, 30), 0x01234567); @@ -75,19 +70,14 @@ TEST(Packing, PackTupleBits) TEST(Packing, PackSync) { struct bi_packed_tuple tuples[] = { - { 0, 0x3 << (75 - 64) }, - { 0, 0x5 << (75 - 64) }, - { 0, 0x7 << (75 - 64) }, - { 0, 0x0 << (75 - 64) }, - { 0, 0x2 << (75 - 64) }, - { 0, 0x6 << (75 - 64) }, - { 0, 0x5 << (75 - 64) }, - { 0, 0x4 << (75 - 64) }, + {0, 0x3 << (75 - 64)}, {0, 0x5 << (75 - 64)}, {0, 0x7 << (75 - 64)}, + {0, 0x0 << (75 - 64)}, {0, 0x2 << (75 - 64)}, {0, 0x6 << (75 - 64)}, + {0, 0x5 << (75 - 64)}, {0, 0x4 << (75 - 64)}, }; EXPECT_EQ(bi_pack_sync(L(3), L(1), L(7), tuples, 8, false), 0xCF); EXPECT_EQ(bi_pack_sync(L(3), L(1), U(7), tuples, 8, false), 0xCC); EXPECT_EQ(bi_pack_sync(L(3), U(1), U(7), tuples, 8, false), 0xEC); - EXPECT_EQ(bi_pack_sync(Z, U(1), U(7), tuples, 8, false), 0x2C); - EXPECT_EQ(bi_pack_sync(Z, U(1), U(7), tuples, 8, true) , 0x6C); + EXPECT_EQ(bi_pack_sync(Z, U(1), U(7), tuples, 8, false), 0x2C); + EXPECT_EQ(bi_pack_sync(Z, U(1), U(7), tuples, 8, true), 0x6C); } diff --git a/src/panfrost/bifrost/test/test-scheduler-predicates.cpp b/src/panfrost/bifrost/test/test-scheduler-predicates.cpp index 7b7e138ebff..bd7c0fd038c 100644 --- a/src/panfrost/bifrost/test/test-scheduler-predicates.cpp +++ b/src/panfrost/bifrost/test/test-scheduler-predicates.cpp @@ -21,23 +21,28 @@ * SOFTWARE. 
*/ -#include "compiler.h" -#include "bi_test.h" #include "bi_builder.h" +#include "bi_test.h" +#include "compiler.h" #include class SchedulerPredicates : public testing::Test { -protected: - SchedulerPredicates() { + protected: + SchedulerPredicates() + { mem_ctx = ralloc_context(NULL); b = bit_builder(mem_ctx); } - ~SchedulerPredicates() { + ~SchedulerPredicates() + { ralloc_free(mem_ctx); } - bi_index TMP() { return bi_temp(b->shader); } + bi_index TMP() + { + return bi_temp(b->shader); + } void *mem_ctx; bi_builder *b; diff --git a/src/panfrost/bifrost/valhall/disassemble.h b/src/panfrost/bifrost/valhall/disassemble.h index 1840268ba98..f23a416a0b3 100644 --- a/src/panfrost/bifrost/valhall/disassemble.h +++ b/src/panfrost/bifrost/valhall/disassemble.h @@ -1,21 +1,21 @@ #ifndef __DISASM_H #define __DISASM_H -#include -#include -#include #include +#include #include +#include +#include #include #include -#define BIT(b) (1ull << (b)) -#define MASK(count) ((1ull << (count)) - 1) +#define BIT(b) (1ull << (b)) +#define MASK(count) ((1ull << (count)) - 1) #define SEXT(b, count) ((b ^ BIT(count - 1)) - BIT(count - 1)) -#define UNUSED __attribute__((unused)) +#define UNUSED __attribute__((unused)) #define VA_SRC_UNIFORM_TYPE 0x2 -#define VA_SRC_IMM_TYPE 0x3 +#define VA_SRC_IMM_TYPE 0x3 static inline void va_print_dest(FILE *fp, uint8_t dest, bool can_mask) @@ -51,7 +51,7 @@ disassemble_valhall(FILE *fp, const uint64_t *code, unsigned size, bool verbose) if (verbose) { /* Print byte pattern */ for (unsigned j = 0; j < 8; ++j) - fprintf(fp, "%02x ", (uint8_t) (instr >> (j * 8))); + fprintf(fp, "%02x ", (uint8_t)(instr >> (j * 8))); fprintf(fp, " "); } else { diff --git a/src/panfrost/bifrost/valhall/test/test-add-imm.cpp b/src/panfrost/bifrost/valhall/test/test-add-imm.cpp index f5e121df1b9..f9e4adea2c3 100644 --- a/src/panfrost/bifrost/valhall/test/test-add-imm.cpp +++ b/src/panfrost/bifrost/valhall/test/test-add-imm.cpp @@ -21,10 +21,10 @@ * SOFTWARE. 
*/ -#include "va_compiler.h" -#include "bi_test.h" -#include "bi_builder.h" #include "util/u_cpu_detect.h" +#include "bi_builder.h" +#include "bi_test.h" +#include "va_compiler.h" #include @@ -37,102 +37,137 @@ add_imm(bi_context *ctx) } #define CASE(instr, expected) INSTRUCTION_CASE(instr, expected, add_imm) -#define NEGCASE(instr) CASE(instr, instr) +#define NEGCASE(instr) CASE(instr, instr) class AddImm : public testing::Test { -protected: - AddImm() { + protected: + AddImm() + { mem_ctx = ralloc_context(NULL); } - ~AddImm() { + ~AddImm() + { ralloc_free(mem_ctx); } void *mem_ctx; }; - -TEST_F(AddImm, Basic) { +TEST_F(AddImm, Basic) +{ CASE(bi_mov_i32_to(b, bi_register(63), bi_imm_u32(0xABAD1DEA)), bi_iadd_imm_i32_to(b, bi_register(63), bi_zero(), 0xABAD1DEA)); CASE(bi_fadd_f32_to(b, bi_register(1), bi_register(2), bi_imm_f32(42.0)), bi_fadd_imm_f32_to(b, bi_register(1), bi_register(2), fui(42.0))); - CASE(bi_fadd_f32_to(b, bi_register(1), bi_discard(bi_register(2)), bi_imm_f32(42.0)), - bi_fadd_imm_f32_to(b, bi_register(1), bi_discard(bi_register(2)), fui(42.0))); + CASE(bi_fadd_f32_to(b, bi_register(1), bi_discard(bi_register(2)), + bi_imm_f32(42.0)), + bi_fadd_imm_f32_to(b, bi_register(1), bi_discard(bi_register(2)), + fui(42.0))); - CASE(bi_fadd_f32_to(b, bi_register(1), bi_discard(bi_register(2)), bi_neg(bi_imm_f32(42.0))), - bi_fadd_imm_f32_to(b, bi_register(1), bi_discard(bi_register(2)), fui(-42.0))); + CASE(bi_fadd_f32_to(b, bi_register(1), bi_discard(bi_register(2)), + bi_neg(bi_imm_f32(42.0))), + bi_fadd_imm_f32_to(b, bi_register(1), bi_discard(bi_register(2)), + fui(-42.0))); } -TEST_F(AddImm, Commutativty) { +TEST_F(AddImm, Commutativty) +{ CASE(bi_fadd_f32_to(b, bi_register(1), bi_imm_f32(42.0), bi_register(2)), bi_fadd_imm_f32_to(b, bi_register(1), bi_register(2), fui(42.0))); } -TEST_F(AddImm, NoModifiers) { - NEGCASE(bi_fadd_f32_to(b, bi_register(1), bi_abs(bi_register(2)), bi_imm_f32(42.0))); - NEGCASE(bi_fadd_f32_to(b, bi_register(1), bi_neg(bi_register(2)), bi_imm_f32(42.0))); - NEGCASE(bi_fadd_f32_to(b, bi_register(1), bi_swz_16(bi_register(2), false, false), bi_imm_f32(42.0))); +TEST_F(AddImm, NoModifiers) +{ + NEGCASE(bi_fadd_f32_to(b, bi_register(1), bi_abs(bi_register(2)), + bi_imm_f32(42.0))); + NEGCASE(bi_fadd_f32_to(b, bi_register(1), bi_neg(bi_register(2)), + bi_imm_f32(42.0))); + NEGCASE(bi_fadd_f32_to(b, bi_register(1), + bi_swz_16(bi_register(2), false, false), + bi_imm_f32(42.0))); } -TEST_F(AddImm, NoClamp) { +TEST_F(AddImm, NoClamp) +{ NEGCASE({ - bi_instr *I = bi_fadd_f32_to(b, bi_register(1), bi_register(2), - bi_imm_f32(42.0)); + bi_instr *I = + bi_fadd_f32_to(b, bi_register(1), bi_register(2), bi_imm_f32(42.0)); I->clamp = BI_CLAMP_CLAMP_M1_1; }); } -TEST_F(AddImm, OtherTypes) { +TEST_F(AddImm, OtherTypes) +{ CASE(bi_fadd_v2f16_to(b, bi_register(1), bi_register(2), bi_imm_f16(42.0)), bi_fadd_imm_v2f16_to(b, bi_register(1), bi_register(2), 0x51405140)); - CASE(bi_iadd_u32_to(b, bi_register(1), bi_register(2), bi_imm_u32(0xDEADBEEF), false), + CASE(bi_iadd_u32_to(b, bi_register(1), bi_register(2), + bi_imm_u32(0xDEADBEEF), false), bi_iadd_imm_i32_to(b, bi_register(1), bi_register(2), 0xDEADBEEF)); - CASE(bi_iadd_v2u16_to(b, bi_register(1), bi_register(2), bi_imm_u32(0xDEADBEEF), false), + CASE(bi_iadd_v2u16_to(b, bi_register(1), bi_register(2), + bi_imm_u32(0xDEADBEEF), false), bi_iadd_imm_v2i16_to(b, bi_register(1), bi_register(2), 0xDEADBEEF)); - CASE(bi_iadd_v4u8_to(b, bi_register(1), bi_register(2), bi_imm_u32(0xDEADBEEF), false), + 
CASE(bi_iadd_v4u8_to(b, bi_register(1), bi_register(2), + bi_imm_u32(0xDEADBEEF), false), bi_iadd_imm_v4i8_to(b, bi_register(1), bi_register(2), 0xDEADBEEF)); - CASE(bi_iadd_s32_to(b, bi_register(1), bi_register(2), bi_imm_u32(0xDEADBEEF), false), + CASE(bi_iadd_s32_to(b, bi_register(1), bi_register(2), + bi_imm_u32(0xDEADBEEF), false), bi_iadd_imm_i32_to(b, bi_register(1), bi_register(2), 0xDEADBEEF)); - CASE(bi_iadd_v2s16_to(b, bi_register(1), bi_register(2), bi_imm_u32(0xDEADBEEF), false), + CASE(bi_iadd_v2s16_to(b, bi_register(1), bi_register(2), + bi_imm_u32(0xDEADBEEF), false), bi_iadd_imm_v2i16_to(b, bi_register(1), bi_register(2), 0xDEADBEEF)); - CASE(bi_iadd_v4s8_to(b, bi_register(1), bi_register(2), bi_imm_u32(0xDEADBEEF), false), + CASE(bi_iadd_v4s8_to(b, bi_register(1), bi_register(2), + bi_imm_u32(0xDEADBEEF), false), bi_iadd_imm_v4i8_to(b, bi_register(1), bi_register(2), 0xDEADBEEF)); - NEGCASE(bi_iadd_u32_to(b, bi_register(1), bi_swz_16(bi_register(2), false, false), bi_imm_u32(0xDEADBEEF), false)); - NEGCASE(bi_iadd_v2u16_to(b, bi_register(1), bi_swz_16(bi_register(2), false, false), bi_imm_u32(0xDEADBEEF), false)); - NEGCASE(bi_iadd_u32_to(b, bi_register(1), bi_register(2), bi_imm_u32(0xDEADBEEF), true)); - NEGCASE(bi_iadd_s32_to(b, bi_register(1), bi_swz_16(bi_register(2), false, false), bi_imm_u32(0xDEADBEEF), false)); - NEGCASE(bi_iadd_v2s16_to(b, bi_register(1), bi_swz_16(bi_register(2), false, false), bi_imm_u32(0xDEADBEEF), false)); + NEGCASE(bi_iadd_u32_to(b, bi_register(1), + bi_swz_16(bi_register(2), false, false), + bi_imm_u32(0xDEADBEEF), false)); + NEGCASE(bi_iadd_v2u16_to(b, bi_register(1), + bi_swz_16(bi_register(2), false, false), + bi_imm_u32(0xDEADBEEF), false)); + NEGCASE(bi_iadd_u32_to(b, bi_register(1), bi_register(2), + bi_imm_u32(0xDEADBEEF), true)); + NEGCASE(bi_iadd_s32_to(b, bi_register(1), + bi_swz_16(bi_register(2), false, false), + bi_imm_u32(0xDEADBEEF), false)); + NEGCASE(bi_iadd_v2s16_to(b, bi_register(1), + bi_swz_16(bi_register(2), false, false), + bi_imm_u32(0xDEADBEEF), false)); - NEGCASE(bi_iadd_s32_to(b, bi_register(1), bi_register(2), bi_imm_u32(0xDEADBEEF), true)); + NEGCASE(bi_iadd_s32_to(b, bi_register(1), bi_register(2), + bi_imm_u32(0xDEADBEEF), true)); } -TEST_F(AddImm, Int8) { +TEST_F(AddImm, Int8) +{ bi_index idx = bi_register(2); idx.swizzle = BI_SWIZZLE_B0000; - NEGCASE(bi_iadd_v4u8_to(b, bi_register(1), idx, bi_imm_u32(0xDEADBEEF), false)); - NEGCASE(bi_iadd_v4s8_to(b, bi_register(1), idx, bi_imm_u32(0xDEADBEEF), false)); + NEGCASE( + bi_iadd_v4u8_to(b, bi_register(1), idx, bi_imm_u32(0xDEADBEEF), false)); + NEGCASE( + bi_iadd_v4s8_to(b, bi_register(1), idx, bi_imm_u32(0xDEADBEEF), false)); } -TEST_F(AddImm, OnlyRTE) { +TEST_F(AddImm, OnlyRTE) +{ NEGCASE({ - bi_instr *I = bi_fadd_f32_to(b, bi_register(1), bi_register(2), bi_imm_f32(42.0)); - I->round = BI_ROUND_RTP; + bi_instr *I = + bi_fadd_f32_to(b, bi_register(1), bi_register(2), bi_imm_f32(42.0)); + I->round = BI_ROUND_RTP; }); NEGCASE({ - bi_instr *I = bi_fadd_v2f16_to(b, bi_register(1), bi_register(2), bi_imm_f16(42.0)); - I->round = BI_ROUND_RTZ; + bi_instr *I = + bi_fadd_v2f16_to(b, bi_register(1), bi_register(2), bi_imm_f16(42.0)); + I->round = BI_ROUND_RTZ; }); } - diff --git a/src/panfrost/bifrost/valhall/test/test-disassembler.c b/src/panfrost/bifrost/valhall/test/test-disassembler.c index 9be708e86c3..7b10bad38f0 100644 --- a/src/panfrost/bifrost/valhall/test/test-disassembler.c +++ b/src/panfrost/bifrost/valhall/test/test-disassembler.c @@ -21,8 +21,8 @@ * 
SOFTWARE. */ -#include #include +#include #include "disassemble.h" static inline uint8_t @@ -39,7 +39,7 @@ parse_hex(const char *in) for (unsigned i = 0; i < 8; ++i) { uint8_t byte = (parse_nibble(in[0]) << 4) | parse_nibble(in[1]); - v |= ((uint64_t) byte) << (8 * i); + v |= ((uint64_t)byte) << (8 * i); /* Skip the space after the byte */ in += 3; diff --git a/src/panfrost/bifrost/valhall/test/test-insert-flow.cpp b/src/panfrost/bifrost/valhall/test/test-insert-flow.cpp index 228eee34635..a9703c1c996 100644 --- a/src/panfrost/bifrost/valhall/test/test-insert-flow.cpp +++ b/src/panfrost/bifrost/valhall/test/test-insert-flow.cpp @@ -21,8 +21,8 @@ * SOFTWARE. */ -#include "bi_test.h" #include "bi_builder.h" +#include "bi_test.h" #include "va_compiler.h" #include "valhall_enums.h" @@ -37,177 +37,190 @@ strip_nops(bi_context *ctx) } } -#define CASE(shader_stage, test) do { \ - bi_builder *A = bit_builder(mem_ctx); \ - bi_builder *B = bit_builder(mem_ctx); \ - { \ - UNUSED bi_builder *b = A; \ - A->shader->stage = MESA_SHADER_ ## shader_stage; \ - test; \ - } \ - strip_nops(A->shader); \ - va_insert_flow_control_nops(A->shader); \ - { \ - UNUSED bi_builder *b = B; \ - B->shader->stage = MESA_SHADER_ ## shader_stage; \ - test; \ - } \ - ASSERT_SHADER_EQUAL(A->shader, B->shader); \ -} while(0) +#define CASE(shader_stage, test) \ + do { \ + bi_builder *A = bit_builder(mem_ctx); \ + bi_builder *B = bit_builder(mem_ctx); \ + { \ + UNUSED bi_builder *b = A; \ + A->shader->stage = MESA_SHADER_##shader_stage; \ + test; \ + } \ + strip_nops(A->shader); \ + va_insert_flow_control_nops(A->shader); \ + { \ + UNUSED bi_builder *b = B; \ + B->shader->stage = MESA_SHADER_##shader_stage; \ + test; \ + } \ + ASSERT_SHADER_EQUAL(A->shader, B->shader); \ + } while (0) -#define flow(f) bi_nop(b)->flow = VA_FLOW_ ## f +#define flow(f) bi_nop(b)->flow = VA_FLOW_##f class InsertFlow : public testing::Test { -protected: - InsertFlow() { + protected: + InsertFlow() + { mem_ctx = ralloc_context(NULL); } - ~InsertFlow() { + ~InsertFlow() + { ralloc_free(mem_ctx); } void *mem_ctx; }; -TEST_F(InsertFlow, PreserveEmptyShader) { +TEST_F(InsertFlow, PreserveEmptyShader) +{ CASE(FRAGMENT, {}); } -TEST_F(InsertFlow, TilebufferWait7) { +TEST_F(InsertFlow, TilebufferWait7) +{ CASE(FRAGMENT, { - flow(DISCARD); - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - flow(WAIT); - bi_blend_to(b, bi_register(0), bi_register(4), bi_register(5), - bi_register(6), bi_register(7), bi_register(8), - BI_REGISTER_FORMAT_AUTO, 4, 4); - flow(END); + flow(DISCARD); + bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); + flow(WAIT); + bi_blend_to(b, bi_register(0), bi_register(4), bi_register(5), + bi_register(6), bi_register(7), bi_register(8), + BI_REGISTER_FORMAT_AUTO, 4, 4); + flow(END); }); CASE(FRAGMENT, { - flow(DISCARD); - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - flow(WAIT); - bi_st_tile(b, bi_register(0), bi_register(4), bi_register(5), + flow(DISCARD); + bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); + flow(WAIT); + bi_st_tile(b, bi_register(0), bi_register(4), bi_register(5), + bi_register(6), BI_REGISTER_FORMAT_AUTO, BI_VECSIZE_V4); + flow(END); + }); + + CASE(FRAGMENT, { + flow(DISCARD); + bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); + flow(WAIT); + bi_ld_tile_to(b, bi_register(0), bi_register(4), bi_register(5), bi_register(6), BI_REGISTER_FORMAT_AUTO, BI_VECSIZE_V4); - flow(END); - }); - - CASE(FRAGMENT, { - flow(DISCARD); - 
bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - flow(WAIT); - bi_ld_tile_to(b, bi_register(0), bi_register(4), bi_register(5), - bi_register(6), BI_REGISTER_FORMAT_AUTO, BI_VECSIZE_V4); - flow(END); + flow(END); }); } -TEST_F(InsertFlow, AtestWait6AndWait0After) { +TEST_F(InsertFlow, AtestWait6AndWait0After) +{ CASE(FRAGMENT, { - flow(DISCARD); - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - flow(WAIT0126); - bi_atest_to(b, bi_register(0), bi_register(4), bi_register(5), - bi_fau(BIR_FAU_ATEST_PARAM, false)); - flow(WAIT0); - flow(END); + flow(DISCARD); + bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); + flow(WAIT0126); + bi_atest_to(b, bi_register(0), bi_register(4), bi_register(5), + bi_fau(BIR_FAU_ATEST_PARAM, false)); + flow(WAIT0); + flow(END); }); } -TEST_F(InsertFlow, ZSEmitWait6) { +TEST_F(InsertFlow, ZSEmitWait6) +{ CASE(FRAGMENT, { - flow(DISCARD); - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - flow(WAIT0126); - bi_zs_emit_to(b, bi_register(0), bi_register(4), bi_register(5), - bi_register(6), true, true); - flow(END); + flow(DISCARD); + bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); + flow(WAIT0126); + bi_zs_emit_to(b, bi_register(0), bi_register(4), bi_register(5), + bi_register(6), true, true); + flow(END); }); } -TEST_F(InsertFlow, LoadThenUnrelatedThenUse) { +TEST_F(InsertFlow, LoadThenUnrelatedThenUse) +{ CASE(VERTEX, { - bi_ld_attr_imm_to(b, bi_register(16), bi_register(60), bi_register(61), - BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4, 1); - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - flow(WAIT0); - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(19)); - flow(END); + bi_ld_attr_imm_to(b, bi_register(16), bi_register(60), bi_register(61), + BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4, 1); + bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); + flow(WAIT0); + bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(19)); + flow(END); }); } -TEST_F(InsertFlow, SingleLdVar) { +TEST_F(InsertFlow, SingleLdVar) +{ CASE(FRAGMENT, { - flow(DISCARD); - bi_ld_var_buf_imm_f16_to(b, bi_register(2), bi_register(61), - BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTER, - BI_SOURCE_FORMAT_F16, - BI_UPDATE_RETRIEVE, BI_VECSIZE_V4, 0); - flow(WAIT0); - flow(END); + flow(DISCARD); + bi_ld_var_buf_imm_f16_to(b, bi_register(2), bi_register(61), + BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTER, + BI_SOURCE_FORMAT_F16, BI_UPDATE_RETRIEVE, + BI_VECSIZE_V4, 0); + flow(WAIT0); + flow(END); }); } -TEST_F(InsertFlow, SerializeLdVars) { +TEST_F(InsertFlow, SerializeLdVars) +{ CASE(FRAGMENT, { - flow(DISCARD); - bi_ld_var_buf_imm_f16_to(b, bi_register(16), bi_register(61), - BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTER, - BI_SOURCE_FORMAT_F16, - BI_UPDATE_STORE, BI_VECSIZE_V4, 0); - bi_ld_var_buf_imm_f16_to(b, bi_register(2), bi_register(61), - BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTER, - BI_SOURCE_FORMAT_F16, - BI_UPDATE_RETRIEVE, BI_VECSIZE_V4, 0); - flow(WAIT0); - bi_ld_var_buf_imm_f16_to(b, bi_register(8), bi_register(61), - BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTER, - BI_SOURCE_FORMAT_F16, - BI_UPDATE_STORE, BI_VECSIZE_V4, 1); - flow(WAIT0); - flow(END); + flow(DISCARD); + bi_ld_var_buf_imm_f16_to(b, bi_register(16), bi_register(61), + BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTER, + BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE, + BI_VECSIZE_V4, 0); + bi_ld_var_buf_imm_f16_to(b, bi_register(2), bi_register(61), + BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTER, + 
BI_SOURCE_FORMAT_F16, BI_UPDATE_RETRIEVE, + BI_VECSIZE_V4, 0); + flow(WAIT0); + bi_ld_var_buf_imm_f16_to(b, bi_register(8), bi_register(61), + BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTER, + BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE, + BI_VECSIZE_V4, 1); + flow(WAIT0); + flow(END); }); } -TEST_F(InsertFlow, Clper) { +TEST_F(InsertFlow, Clper) +{ CASE(FRAGMENT, { - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - bi_clper_i32_to(b, bi_register(0), bi_register(4), bi_register(8), - BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, - BI_SUBGROUP_SUBGROUP4); - flow(DISCARD); - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - flow(END); + bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); + bi_clper_i32_to(b, bi_register(0), bi_register(4), bi_register(8), + BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, + BI_SUBGROUP_SUBGROUP4); + flow(DISCARD); + bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); + flow(END); }); } -TEST_F(InsertFlow, TextureImplicit) { +TEST_F(InsertFlow, TextureImplicit) +{ CASE(FRAGMENT, { - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - bi_tex_single_to(b, bi_register(0), bi_register(4), bi_register(8), - bi_register(12), false, BI_DIMENSION_2D, - BI_REGISTER_FORMAT_F32, false, false, - BI_VA_LOD_MODE_COMPUTED_LOD, BI_WRITE_MASK_RGBA, 4); - flow(DISCARD); - flow(WAIT0); - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - flow(END); + bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); + bi_tex_single_to(b, bi_register(0), bi_register(4), bi_register(8), + bi_register(12), false, BI_DIMENSION_2D, + BI_REGISTER_FORMAT_F32, false, false, + BI_VA_LOD_MODE_COMPUTED_LOD, BI_WRITE_MASK_RGBA, 4); + flow(DISCARD); + flow(WAIT0); + bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); + flow(END); }); } -TEST_F(InsertFlow, TextureExplicit) { +TEST_F(InsertFlow, TextureExplicit) +{ CASE(FRAGMENT, { - flow(DISCARD); - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - bi_tex_single_to(b, bi_register(0), bi_register(4), bi_register(8), - bi_register(12), false, BI_DIMENSION_2D, - BI_REGISTER_FORMAT_F32, false, false, - BI_VA_LOD_MODE_ZERO_LOD, BI_WRITE_MASK_RGBA, 4); - flow(WAIT0); - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - flow(END); + flow(DISCARD); + bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); + bi_tex_single_to(b, bi_register(0), bi_register(4), bi_register(8), + bi_register(12), false, BI_DIMENSION_2D, + BI_REGISTER_FORMAT_F32, false, false, + BI_VA_LOD_MODE_ZERO_LOD, BI_WRITE_MASK_RGBA, 4); + flow(WAIT0); + bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); + flow(END); }); } @@ -217,49 +230,52 @@ TEST_F(InsertFlow, TextureExplicit) { * \ / * D */ -TEST_F(InsertFlow, DiamondCFG) { +TEST_F(InsertFlow, DiamondCFG) +{ CASE(FRAGMENT, { - bi_block *A = bi_start_block(&b->shader->blocks); - bi_block *B = bit_block(b->shader); - bi_block *C = bit_block(b->shader); - bi_block *D = bit_block(b->shader); + bi_block *A = bi_start_block(&b->shader->blocks); + bi_block *B = bit_block(b->shader); + bi_block *C = bit_block(b->shader); + bi_block *D = bit_block(b->shader); - bi_block_add_successor(A, B); - bi_block_add_successor(A, C); + bi_block_add_successor(A, B); + bi_block_add_successor(A, C); - bi_block_add_successor(B, D); - bi_block_add_successor(C, D); + bi_block_add_successor(B, D); + bi_block_add_successor(C, D); - /* B uses helper invocations, no other block does. 
- * - * That means B and C need to discard helpers. - */ - b->cursor = bi_after_block(B); - bi_clper_i32_to(b, bi_register(0), bi_register(4), bi_register(8), - BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, - BI_SUBGROUP_SUBGROUP4); - flow(DISCARD); - flow(RECONVERGE); + /* B uses helper invocations, no other block does. + * + * That means B and C need to discard helpers. + */ + b->cursor = bi_after_block(B); + bi_clper_i32_to(b, bi_register(0), bi_register(4), bi_register(8), + BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, + BI_SUBGROUP_SUBGROUP4); + flow(DISCARD); + flow(RECONVERGE); - b->cursor = bi_after_block(C); - flow(DISCARD); - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - flow(RECONVERGE); + b->cursor = bi_after_block(C); + flow(DISCARD); + bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); + flow(RECONVERGE); - b->cursor = bi_after_block(D); - flow(END); + b->cursor = bi_after_block(D); + flow(END); }); } -TEST_F(InsertFlow, BarrierBug) { +TEST_F(InsertFlow, BarrierBug) +{ CASE(KERNEL, { - bi_instr *I = bi_store_i32(b, bi_register(0), bi_register(2), bi_register(4), BI_SEG_NONE, 0); - I->slot = 2; + bi_instr *I = bi_store_i32(b, bi_register(0), bi_register(2), + bi_register(4), BI_SEG_NONE, 0); + I->slot = 2; - bi_fadd_f32_to(b, bi_register(10), bi_register(10), bi_register(10)); - flow(WAIT2); - bi_barrier(b); - flow(WAIT); - flow(END); + bi_fadd_f32_to(b, bi_register(10), bi_register(10), bi_register(10)); + flow(WAIT2); + bi_barrier(b); + flow(WAIT); + flow(END); }); } diff --git a/src/panfrost/bifrost/valhall/test/test-lower-constants.cpp b/src/panfrost/bifrost/valhall/test/test-lower-constants.cpp index 2d98a8fab82..d58805392fd 100644 --- a/src/panfrost/bifrost/valhall/test/test-lower-constants.cpp +++ b/src/panfrost/bifrost/valhall/test/test-lower-constants.cpp @@ -21,9 +21,9 @@ * SOFTWARE. 
*/ -#include "va_compiler.h" -#include "bi_test.h" #include "bi_builder.h" +#include "bi_test.h" +#include "va_compiler.h" #include @@ -38,19 +38,22 @@ add_imm(bi_context *ctx) #define CASE(instr, expected) INSTRUCTION_CASE(instr, expected, add_imm) class LowerConstants : public testing::Test { -protected: - LowerConstants() { + protected: + LowerConstants() + { mem_ctx = ralloc_context(NULL); } - ~LowerConstants() { + ~LowerConstants() + { ralloc_free(mem_ctx); } void *mem_ctx; }; -TEST_F(LowerConstants, Float32) { +TEST_F(LowerConstants, Float32) +{ CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_imm_f32(0.0)), bi_fadd_f32_to(b, bi_register(0), bi_register(0), va_lut(0))); @@ -61,46 +64,59 @@ TEST_F(LowerConstants, Float32) { bi_fadd_f32_to(b, bi_register(0), bi_register(0), va_lut(17))); } -TEST_F(LowerConstants, WidenFloat16) { +TEST_F(LowerConstants, WidenFloat16) +{ CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_imm_f32(0.5)), - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_half(va_lut(26), 1))); + bi_fadd_f32_to(b, bi_register(0), bi_register(0), + bi_half(va_lut(26), 1))); CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_imm_f32(255.0)), - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_half(va_lut(23), 0))); + bi_fadd_f32_to(b, bi_register(0), bi_register(0), + bi_half(va_lut(23), 0))); CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_imm_f32(256.0)), - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_half(va_lut(23), 1))); + bi_fadd_f32_to(b, bi_register(0), bi_register(0), + bi_half(va_lut(23), 1))); CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_imm_f32(8.0)), - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_half(va_lut(30), 1))); + bi_fadd_f32_to(b, bi_register(0), bi_register(0), + bi_half(va_lut(30), 1))); } -TEST_F(LowerConstants, ReplicateFloat16) { +TEST_F(LowerConstants, ReplicateFloat16) +{ CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_register(0), bi_imm_f16(255.0)), - bi_fadd_v2f16_to(b, bi_register(0), bi_register(0), bi_half(va_lut(23), 0))); + bi_fadd_v2f16_to(b, bi_register(0), bi_register(0), + bi_half(va_lut(23), 0))); CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_register(0), bi_imm_f16(4.0)), - bi_fadd_v2f16_to(b, bi_register(0), bi_register(0), bi_half(va_lut(29), 1))); + bi_fadd_v2f16_to(b, bi_register(0), bi_register(0), + bi_half(va_lut(29), 1))); } -TEST_F(LowerConstants, NegateFloat32) { +TEST_F(LowerConstants, NegateFloat32) +{ CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_imm_f32(-1.0)), bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_neg(va_lut(16)))); CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_imm_f32(-255.0)), - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_neg(bi_half(va_lut(23), 0)))); + bi_fadd_f32_to(b, bi_register(0), bi_register(0), + bi_neg(bi_half(va_lut(23), 0)))); } TEST_F(LowerConstants, NegateReplicateFloat16) { CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_register(0), bi_imm_f16(-255.0)), - bi_fadd_v2f16_to(b, bi_register(0), bi_register(0), bi_neg(bi_half(va_lut(23), 0)))); + bi_fadd_v2f16_to(b, bi_register(0), bi_register(0), + bi_neg(bi_half(va_lut(23), 0)))); } TEST_F(LowerConstants, NegateVec2Float16) { - CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_register(0), bi_imm_u32(0xBC008000)), - bi_fadd_v2f16_to(b, bi_register(0), bi_register(0), bi_neg(va_lut(27)))); + CASE( + bi_fadd_v2f16_to(b, bi_register(0), bi_register(0), + bi_imm_u32(0xBC008000)), + bi_fadd_v2f16_to(b, bi_register(0), bi_register(0), bi_neg(va_lut(27)))); } 
TEST_F(LowerConstants, Int8InInt32) @@ -117,87 +133,105 @@ TEST_F(LowerConstants, ZeroExtendForUnsigned) CASE(bi_icmp_and_u32_to(b, bi_register(0), bi_register(0), bi_imm_u32(0xFF), bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1), bi_icmp_and_u32_to(b, bi_register(0), bi_register(0), - bi_byte(va_lut(1), 0), bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1)); + bi_byte(va_lut(1), 0), bi_register(0), BI_CMPF_LT, + BI_RESULT_TYPE_I1)); - CASE(bi_icmp_and_u32_to(b, bi_register(0), bi_register(0), - bi_imm_u32(0xFFFF), bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1), - bi_icmp_and_u32_to(b, bi_register(0), bi_register(0), - bi_half(va_lut(1), 0), bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1)); + CASE( + bi_icmp_and_u32_to(b, bi_register(0), bi_register(0), bi_imm_u32(0xFFFF), + bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1), + bi_icmp_and_u32_to(b, bi_register(0), bi_register(0), + bi_half(va_lut(1), 0), bi_register(0), BI_CMPF_LT, + BI_RESULT_TYPE_I1)); } TEST_F(LowerConstants, SignExtendPositiveForSigned) { - CASE(bi_icmp_and_s32_to(b, bi_register(0), bi_register(0), - bi_imm_u32(0x7F), bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1), + CASE(bi_icmp_and_s32_to(b, bi_register(0), bi_register(0), bi_imm_u32(0x7F), + bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1), bi_icmp_and_s32_to(b, bi_register(0), bi_register(0), - bi_byte(va_lut(2), 3), bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1)); + bi_byte(va_lut(2), 3), bi_register(0), BI_CMPF_LT, + BI_RESULT_TYPE_I1)); - CASE(bi_icmp_and_s32_to(b, bi_register(0), bi_register(0), - bi_imm_u32(0x7FFF), bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1), - bi_icmp_and_s32_to(b, bi_register(0), bi_register(0), - bi_half(va_lut(2), 1), bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1)); + CASE( + bi_icmp_and_s32_to(b, bi_register(0), bi_register(0), bi_imm_u32(0x7FFF), + bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1), + bi_icmp_and_s32_to(b, bi_register(0), bi_register(0), + bi_half(va_lut(2), 1), bi_register(0), BI_CMPF_LT, + BI_RESULT_TYPE_I1)); } TEST_F(LowerConstants, SignExtendNegativeForSigned) { CASE(bi_icmp_and_s32_to(b, bi_register(0), bi_register(0), - bi_imm_u32(0xFFFFFFF8), bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1), + bi_imm_u32(0xFFFFFFF8), bi_register(0), BI_CMPF_LT, + BI_RESULT_TYPE_I1), bi_icmp_and_s32_to(b, bi_register(0), bi_register(0), - bi_byte(va_lut(23), 0), bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1)); + bi_byte(va_lut(23), 0), bi_register(0), BI_CMPF_LT, + BI_RESULT_TYPE_I1)); CASE(bi_icmp_and_s32_to(b, bi_register(0), bi_register(0), - bi_imm_u32(0xFFFFFAFC), bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1), + bi_imm_u32(0xFFFFFAFC), bi_register(0), BI_CMPF_LT, + BI_RESULT_TYPE_I1), bi_icmp_and_s32_to(b, bi_register(0), bi_register(0), - bi_half(va_lut(3), 1), bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1)); + bi_half(va_lut(3), 1), bi_register(0), BI_CMPF_LT, + BI_RESULT_TYPE_I1)); } TEST_F(LowerConstants, DontZeroExtendForSigned) { - CASE(bi_icmp_and_s32_to(b, bi_register(0), bi_register(0), - bi_imm_u32(0xFF), bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1), + CASE(bi_icmp_and_s32_to(b, bi_register(0), bi_register(0), bi_imm_u32(0xFF), + bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1), bi_icmp_and_s32_to(b, bi_register(0), bi_register(0), - bi_iadd_imm_i32(b, va_lut(0), 0xFF), bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1)); - - CASE(bi_icmp_and_s32_to(b, bi_register(0), bi_register(0), - bi_imm_u32(0xFFFF), bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1), - bi_icmp_and_s32_to(b, bi_register(0), bi_register(0), - bi_iadd_imm_i32(b, 
va_lut(0), 0xFFFF), bi_register(0), + bi_iadd_imm_i32(b, va_lut(0), 0xFF), bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1)); + + CASE( + bi_icmp_and_s32_to(b, bi_register(0), bi_register(0), bi_imm_u32(0xFFFF), + bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1), + bi_icmp_and_s32_to(b, bi_register(0), bi_register(0), + bi_iadd_imm_i32(b, va_lut(0), 0xFFFF), bi_register(0), + BI_CMPF_LT, BI_RESULT_TYPE_I1)); } TEST_F(LowerConstants, DontZeroExtendNegative) { CASE(bi_icmp_and_u32_to(b, bi_register(0), bi_register(0), - bi_imm_u32(0xFFFFFFF8), bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1), + bi_imm_u32(0xFFFFFFF8), bi_register(0), BI_CMPF_LT, + BI_RESULT_TYPE_I1), bi_icmp_and_u32_to(b, bi_register(0), bi_register(0), - bi_iadd_imm_i32(b, va_lut(0), 0xFFFFFFF8), bi_register(0), - BI_CMPF_LT, BI_RESULT_TYPE_I1)); + bi_iadd_imm_i32(b, va_lut(0), 0xFFFFFFF8), + bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1)); CASE(bi_icmp_and_u32_to(b, bi_register(0), bi_register(0), - bi_imm_u32(0xFFFFFAFC), bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1), + bi_imm_u32(0xFFFFFAFC), bi_register(0), BI_CMPF_LT, + BI_RESULT_TYPE_I1), bi_icmp_and_u32_to(b, bi_register(0), bi_register(0), - bi_iadd_imm_i32(b, va_lut(0), 0xFFFFFAFC), bi_register(0), - BI_CMPF_LT, BI_RESULT_TYPE_I1)); + bi_iadd_imm_i32(b, va_lut(0), 0xFFFFFAFC), + bi_register(0), BI_CMPF_LT, BI_RESULT_TYPE_I1)); } TEST_F(LowerConstants, HandleTrickyNegativesFP16) { - CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_register(0), bi_imm_f16(-57216.0)), - bi_fadd_v2f16_to(b, bi_register(0), bi_register(0), bi_half(va_lut(3), 1))); + CASE( + bi_fadd_v2f16_to(b, bi_register(0), bi_register(0), bi_imm_f16(-57216.0)), + bi_fadd_v2f16_to(b, bi_register(0), bi_register(0), + bi_half(va_lut(3), 1))); - CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_register(0), bi_imm_f16(57216.0)), - bi_fadd_v2f16_to(b, bi_register(0), bi_register(0), bi_neg(bi_half(va_lut(3), 1)))); + CASE( + bi_fadd_v2f16_to(b, bi_register(0), bi_register(0), bi_imm_f16(57216.0)), + bi_fadd_v2f16_to(b, bi_register(0), bi_register(0), + bi_neg(bi_half(va_lut(3), 1)))); } TEST_F(LowerConstants, MaintainMkvecRestrictedSwizzles) { - CASE(bi_mkvec_v2i8_to(b, bi_register(0), bi_register(0), - bi_imm_u8(0), bi_imm_u32(0)), + CASE(bi_mkvec_v2i8_to(b, bi_register(0), bi_register(0), bi_imm_u8(0), + bi_imm_u32(0)), bi_mkvec_v2i8_to(b, bi_register(0), bi_register(0), bi_byte(va_lut(0), 0), va_lut(0))); - CASE(bi_mkvec_v2i8_to(b, bi_register(0), bi_register(0), - bi_imm_u8(14), bi_imm_u32(0)), + CASE(bi_mkvec_v2i8_to(b, bi_register(0), bi_register(0), bi_imm_u8(14), + bi_imm_u32(0)), bi_mkvec_v2i8_to(b, bi_register(0), bi_register(0), bi_byte(va_lut(11), 2), va_lut(0))); } diff --git a/src/panfrost/bifrost/valhall/test/test-lower-isel.cpp b/src/panfrost/bifrost/valhall/test/test-lower-isel.cpp index 994885b66db..df1947be323 100644 --- a/src/panfrost/bifrost/valhall/test/test-lower-isel.cpp +++ b/src/panfrost/bifrost/valhall/test/test-lower-isel.cpp @@ -21,18 +21,19 @@ * SOFTWARE. 
*/ -#include "va_compiler.h" -#include "bi_test.h" #include "bi_builder.h" +#include "bi_test.h" +#include "va_compiler.h" #include #define CASE(instr, expected) INSTRUCTION_CASE(instr, expected, va_lower_isel) -#define NEGCASE(instr) CASE(instr, instr) +#define NEGCASE(instr) CASE(instr, instr) class LowerIsel : public testing::Test { -protected: - LowerIsel() { + protected: + LowerIsel() + { mem_ctx = ralloc_context(NULL); reg = bi_register(1); x = bi_register(2); @@ -40,7 +41,8 @@ protected: z = bi_register(4); } - ~LowerIsel() { + ~LowerIsel() + { ralloc_free(mem_ctx); } @@ -48,14 +50,16 @@ protected: bi_index reg, x, y, z; }; -TEST_F(LowerIsel, 8BitSwizzles) { +TEST_F(LowerIsel, 8BitSwizzles) +{ for (unsigned i = 0; i < 4; ++i) { CASE(bi_swz_v4i8_to(b, reg, bi_byte(reg, i)), bi_iadd_v4u8_to(b, reg, bi_byte(reg, i), bi_zero(), false)); } } -TEST_F(LowerIsel, 16BitSwizzles) { +TEST_F(LowerIsel, 16BitSwizzles) +{ for (unsigned i = 0; i < 2; ++i) { for (unsigned j = 0; j < 2; ++j) { CASE(bi_swz_v2i16_to(b, reg, bi_swz_16(reg, i, j)), @@ -64,24 +68,30 @@ TEST_F(LowerIsel, 16BitSwizzles) { } } -TEST_F(LowerIsel, JumpsLoweredToBranches) { - bi_block block = { }; +TEST_F(LowerIsel, JumpsLoweredToBranches) +{ + bi_block block = {}; - CASE({ - bi_instr *I = bi_jump(b, bi_imm_u32(0xDEADBEEF)); - I->branch_target = █ - }, { - bi_instr *I = bi_branchz_i16(b, bi_zero(), bi_imm_u32(0xDEADBEEF), BI_CMPF_EQ); - I->branch_target = █ - }); + CASE( + { + bi_instr *I = bi_jump(b, bi_imm_u32(0xDEADBEEF)); + I->branch_target = █ + }, + { + bi_instr *I = + bi_branchz_i16(b, bi_zero(), bi_imm_u32(0xDEADBEEF), BI_CMPF_EQ); + I->branch_target = █ + }); } -TEST_F(LowerIsel, IndirectJumpsLoweredToBranches) { +TEST_F(LowerIsel, IndirectJumpsLoweredToBranches) +{ CASE(bi_jump(b, bi_register(17)), bi_branchzi(b, bi_zero(), bi_register(17), BI_CMPF_EQ)); } -TEST_F(LowerIsel, IntegerCSEL) { +TEST_F(LowerIsel, IntegerCSEL) +{ CASE(bi_csel_i32(b, reg, reg, reg, reg, BI_CMPF_EQ), bi_csel_u32(b, reg, reg, reg, reg, BI_CMPF_EQ)); @@ -89,7 +99,8 @@ TEST_F(LowerIsel, IntegerCSEL) { bi_csel_v2u16(b, reg, reg, reg, reg, BI_CMPF_EQ)); } -TEST_F(LowerIsel, AvoidSimpleMux) { +TEST_F(LowerIsel, AvoidSimpleMux) +{ CASE(bi_mux_i32(b, x, y, z, BI_MUX_INT_ZERO), bi_csel_u32(b, z, bi_zero(), x, y, BI_CMPF_EQ)); CASE(bi_mux_i32(b, x, y, z, BI_MUX_NEG), @@ -105,27 +116,32 @@ TEST_F(LowerIsel, AvoidSimpleMux) { bi_csel_v2f16(b, z, bi_zero(), x, y, BI_CMPF_EQ)); } -TEST_F(LowerIsel, BitwiseMux) { +TEST_F(LowerIsel, BitwiseMux) +{ NEGCASE(bi_mux_i32(b, x, y, z, BI_MUX_BIT)); NEGCASE(bi_mux_v2i16(b, x, y, z, BI_MUX_BIT)); NEGCASE(bi_mux_v4i8(b, x, y, z, BI_MUX_BIT)); } -TEST_F(LowerIsel, MuxInt8) { +TEST_F(LowerIsel, MuxInt8) +{ NEGCASE(bi_mux_v4i8(b, x, y, z, BI_MUX_INT_ZERO)); NEGCASE(bi_mux_v4i8(b, x, y, z, BI_MUX_NEG)); NEGCASE(bi_mux_v4i8(b, x, y, z, BI_MUX_FP_ZERO)); } -TEST_F(LowerIsel, FaddRscale) { - CASE(bi_fadd_rscale_f32_to(b, reg, x, y, z, BI_SPECIAL_NONE), - bi_fma_rscale_f32_to(b, reg, x, bi_imm_f32(1.0), y, z, BI_SPECIAL_NONE)); +TEST_F(LowerIsel, FaddRscale) +{ + CASE( + bi_fadd_rscale_f32_to(b, reg, x, y, z, BI_SPECIAL_NONE), + bi_fma_rscale_f32_to(b, reg, x, bi_imm_f32(1.0), y, z, BI_SPECIAL_NONE)); CASE(bi_fadd_rscale_f32_to(b, reg, x, y, z, BI_SPECIAL_N), bi_fma_rscale_f32_to(b, reg, x, bi_imm_f32(1.0), y, z, BI_SPECIAL_N)); } -TEST_F(LowerIsel, Smoke) { +TEST_F(LowerIsel, Smoke) +{ NEGCASE(bi_fadd_f32_to(b, reg, reg, reg)); NEGCASE(bi_csel_s32_to(b, reg, reg, reg, reg, reg, BI_CMPF_LT)); NEGCASE(bi_csel_u32_to(b, reg, 
reg, reg, reg, reg, BI_CMPF_LT)); diff --git a/src/panfrost/bifrost/valhall/test/test-mark-last.cpp b/src/panfrost/bifrost/valhall/test/test-mark-last.cpp index f79b9a73855..779a13c2b65 100644 --- a/src/panfrost/bifrost/valhall/test/test-mark-last.cpp +++ b/src/panfrost/bifrost/valhall/test/test-mark-last.cpp @@ -21,14 +21,14 @@ * SOFTWARE. */ -#include "bi_test.h" #include "bi_builder.h" +#include "bi_test.h" #include "va_compiler.h" #include "valhall_enums.h" #include -#define R(x) bi_register(x) +#define R(x) bi_register(x) #define DR(x) bi_discard(R(x)) static void @@ -40,105 +40,119 @@ strip_discard(bi_context *ctx) } } -#define CASE(test) do { \ - void *mem_ctx = ralloc_context(NULL); \ - bi_builder *A = bit_builder(mem_ctx); \ - bi_builder *B = bit_builder(mem_ctx); \ - { \ - UNUSED bi_builder *b = A; \ - test; \ - } \ - strip_discard(A->shader); \ - va_mark_last(A->shader); \ - { \ - UNUSED bi_builder *b = B; \ - test; \ - } \ - ASSERT_SHADER_EQUAL(A->shader, B->shader); \ - ralloc_free(mem_ctx); \ -} while(0) +#define CASE(test) \ + do { \ + void *mem_ctx = ralloc_context(NULL); \ + bi_builder *A = bit_builder(mem_ctx); \ + bi_builder *B = bit_builder(mem_ctx); \ + { \ + UNUSED bi_builder *b = A; \ + test; \ + } \ + strip_discard(A->shader); \ + va_mark_last(A->shader); \ + { \ + UNUSED bi_builder *b = B; \ + test; \ + } \ + ASSERT_SHADER_EQUAL(A->shader, B->shader); \ + ralloc_free(mem_ctx); \ + } while (0) -TEST(MarkLast, Simple) { +TEST(MarkLast, Simple) +{ CASE(bi_fadd_f32_to(b, R(0), DR(0), DR(1))); CASE({ - bi_fadd_f32_to(b, R(2), R(0), DR(1)); - bi_fadd_f32_to(b, R(0), DR(0), DR(2)); + bi_fadd_f32_to(b, R(2), R(0), DR(1)); + bi_fadd_f32_to(b, R(0), DR(0), DR(2)); }); } -TEST(MarkLast, SameSourceAndDestination) { +TEST(MarkLast, SameSourceAndDestination) +{ CASE({ - bi_fadd_f32_to(b, R(0), DR(0), DR(0)); - bi_fadd_f32_to(b, R(0), DR(0), DR(0)); - bi_fadd_f32_to(b, R(0), DR(0), DR(0)); + bi_fadd_f32_to(b, R(0), DR(0), DR(0)); + bi_fadd_f32_to(b, R(0), DR(0), DR(0)); + bi_fadd_f32_to(b, R(0), DR(0), DR(0)); }); } -TEST(MarkLast, StagingReadBefore) { +TEST(MarkLast, StagingReadBefore) +{ CASE({ - bi_fadd_f32_to(b, R(9), R(2), DR(7)); - bi_st_tile(b, R(0), DR(4), DR(5), DR(6), BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4); + bi_fadd_f32_to(b, R(9), R(2), DR(7)); + bi_st_tile(b, R(0), DR(4), DR(5), DR(6), BI_REGISTER_FORMAT_F32, + BI_VECSIZE_V4); }); } -TEST(MarkLast, StagingReadAfter) { +TEST(MarkLast, StagingReadAfter) +{ CASE({ - bi_st_tile(b, R(0), DR(4), DR(5), DR(6), BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4); - bi_fadd_f32_to(b, R(9), R(2), DR(7)); + bi_st_tile(b, R(0), DR(4), DR(5), DR(6), BI_REGISTER_FORMAT_F32, + BI_VECSIZE_V4); + bi_fadd_f32_to(b, R(9), R(2), DR(7)); }); } -TEST(MarkLast, NonstagingSourceToAsync) { +TEST(MarkLast, NonstagingSourceToAsync) +{ CASE({ - bi_st_tile(b, R(0), R(4), R(5), DR(6), BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4); - bi_fadd_f32_to(b, R(9), DR(4), DR(5)); + bi_st_tile(b, R(0), R(4), R(5), DR(6), BI_REGISTER_FORMAT_F32, + BI_VECSIZE_V4); + bi_fadd_f32_to(b, R(9), DR(4), DR(5)); }); } -TEST(MarkLast, Both64) { +TEST(MarkLast, Both64) +{ CASE(bi_load_i32_to(b, R(0), DR(8), DR(9), BI_SEG_NONE, 0)); } -TEST(MarkLast, Neither64ThenBoth) { +TEST(MarkLast, Neither64ThenBoth) +{ CASE({ - bi_load_i32_to(b, R(0), R(8), R(9), BI_SEG_NONE, 0); - bi_load_i32_to(b, R(1), DR(8), DR(9), BI_SEG_NONE, 8); + bi_load_i32_to(b, R(0), R(8), R(9), BI_SEG_NONE, 0); + bi_load_i32_to(b, R(1), DR(8), DR(9), BI_SEG_NONE, 8); }); } -TEST(MarkLast, Half64) { +TEST(MarkLast, 
Half64) +{ CASE({ - bi_load_i32_to(b, R(0), R(8), R(9), BI_SEG_NONE, 0); - bi_fadd_f32_to(b, R(8), DR(8), DR(8)); + bi_load_i32_to(b, R(0), R(8), R(9), BI_SEG_NONE, 0); + bi_fadd_f32_to(b, R(8), DR(8), DR(8)); }); CASE({ - bi_load_i32_to(b, R(0), R(8), R(9), BI_SEG_NONE, 0); - bi_fadd_f32_to(b, R(9), DR(9), DR(9)); + bi_load_i32_to(b, R(0), R(8), R(9), BI_SEG_NONE, 0); + bi_fadd_f32_to(b, R(9), DR(9), DR(9)); }); } -TEST(MarkLast, RegisterBlendDescriptor) { +TEST(MarkLast, RegisterBlendDescriptor) +{ CASE({ - bi_blend_to(b, R(48), R(0), DR(60), DR(4), DR(5), bi_null(), - BI_REGISTER_FORMAT_F32, 4, 0); + bi_blend_to(b, R(48), R(0), DR(60), DR(4), DR(5), bi_null(), + BI_REGISTER_FORMAT_F32, 4, 0); }); CASE({ - bi_blend_to(b, R(48), R(0), DR(60), R(4), R(5), bi_null(), - BI_REGISTER_FORMAT_F32, 4, 0); - bi_fadd_f32_to(b, R(4), DR(4), DR(7)); + bi_blend_to(b, R(48), R(0), DR(60), R(4), R(5), bi_null(), + BI_REGISTER_FORMAT_F32, 4, 0); + bi_fadd_f32_to(b, R(4), DR(4), DR(7)); }); CASE({ - bi_blend_to(b, R(48), R(0), DR(60), R(4), R(5), bi_null(), - BI_REGISTER_FORMAT_F32, 4, 0); - bi_fadd_f32_to(b, R(4), DR(5), DR(7)); + bi_blend_to(b, R(48), R(0), DR(60), R(4), R(5), bi_null(), + BI_REGISTER_FORMAT_F32, 4, 0); + bi_fadd_f32_to(b, R(4), DR(5), DR(7)); }); } -TEST(MarkLast, ControlFlowAllFeatures) { +TEST(MarkLast, ControlFlowAllFeatures) +{ /* A * / \ * B C @@ -153,9 +167,8 @@ TEST(MarkLast, ControlFlowAllFeatures) { b->cursor = bi_after_block(A); { - bi_instr *I = - bi_st_tile(b, R(10), DR(14), DR(15), DR(16), - BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4); + bi_instr *I = bi_st_tile(b, R(10), DR(14), DR(15), DR(16), + BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4); I->slot = 2; bi_load_i32_to(b, R(20), R(28), R(29), BI_SEG_NONE, 0); diff --git a/src/panfrost/bifrost/valhall/test/test-merge-flow.cpp b/src/panfrost/bifrost/valhall/test/test-merge-flow.cpp index a02600bb31d..36e8c1c5064 100644 --- a/src/panfrost/bifrost/valhall/test/test-merge-flow.cpp +++ b/src/panfrost/bifrost/valhall/test/test-merge-flow.cpp @@ -21,42 +21,45 @@ * SOFTWARE. 
*/ -#include "bi_test.h" #include "bi_builder.h" +#include "bi_test.h" #include "va_compiler.h" #include "valhall_enums.h" #include -#define CASE(test, expected) do { \ - bi_builder *A = bit_builder(mem_ctx); \ - bi_builder *B = bit_builder(mem_ctx); \ - { \ - bi_builder *b = A; \ - A->shader->stage = MESA_SHADER_FRAGMENT; \ - test; \ - } \ - va_merge_flow(A->shader); \ - { \ - bi_builder *b = B; \ - B->shader->stage = MESA_SHADER_FRAGMENT; \ - expected; \ - } \ - ASSERT_SHADER_EQUAL(A->shader, B->shader); \ -} while(0) +#define CASE(test, expected) \ + do { \ + bi_builder *A = bit_builder(mem_ctx); \ + bi_builder *B = bit_builder(mem_ctx); \ + { \ + bi_builder *b = A; \ + A->shader->stage = MESA_SHADER_FRAGMENT; \ + test; \ + } \ + va_merge_flow(A->shader); \ + { \ + bi_builder *b = B; \ + B->shader->stage = MESA_SHADER_FRAGMENT; \ + expected; \ + } \ + ASSERT_SHADER_EQUAL(A->shader, B->shader); \ + } while (0) #define NEGCASE(test) CASE(test, test) -#define flow(f) bi_nop(b)->flow = VA_FLOW_ ## f +#define flow(f) bi_nop(b)->flow = VA_FLOW_##f class MergeFlow : public testing::Test { -protected: - MergeFlow() { + protected: + MergeFlow() + { mem_ctx = ralloc_context(NULL); atest = bi_fau(BIR_FAU_ATEST_PARAM, false); } - ~MergeFlow() { + ~MergeFlow() + { ralloc_free(mem_ctx); } @@ -65,74 +68,84 @@ protected: bi_index atest; }; -TEST_F(MergeFlow, End) { - CASE({ - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - bi_blend_to(b, bi_register(0), bi_register(4), bi_register(5), - bi_register(6), bi_register(7), bi_register(8), - BI_REGISTER_FORMAT_AUTO, 4, 4); - flow(END); - }, - { - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - I = bi_blend_to(b, bi_register(0), bi_register(4), bi_register(5), - bi_register(6), bi_register(7), bi_register(8), - BI_REGISTER_FORMAT_AUTO, 4, 4); - I->flow = VA_FLOW_END; - }); +TEST_F(MergeFlow, End) +{ + CASE( + { + bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); + bi_blend_to(b, bi_register(0), bi_register(4), bi_register(5), + bi_register(6), bi_register(7), bi_register(8), + BI_REGISTER_FORMAT_AUTO, 4, 4); + flow(END); + }, + { + bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); + I = bi_blend_to(b, bi_register(0), bi_register(4), bi_register(5), + bi_register(6), bi_register(7), bi_register(8), + BI_REGISTER_FORMAT_AUTO, 4, 4); + I->flow = VA_FLOW_END; + }); } -TEST_F(MergeFlow, Reconverge) { - CASE({ - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - bi_blend_to(b, bi_register(0), bi_register(4), bi_register(5), - bi_register(6), bi_register(7), bi_register(8), - BI_REGISTER_FORMAT_AUTO, 4, 4); - flow(RECONVERGE); - }, - { - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - I = bi_blend_to(b, bi_register(0), bi_register(4), bi_register(5), - bi_register(6), bi_register(7), bi_register(8), - BI_REGISTER_FORMAT_AUTO, 4, 4); - I->flow = VA_FLOW_RECONVERGE; - }); +TEST_F(MergeFlow, Reconverge) +{ + CASE( + { + bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); + bi_blend_to(b, bi_register(0), bi_register(4), bi_register(5), + bi_register(6), bi_register(7), bi_register(8), + BI_REGISTER_FORMAT_AUTO, 4, 4); + flow(RECONVERGE); + }, + { + bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); + I = bi_blend_to(b, bi_register(0), bi_register(4), bi_register(5), + bi_register(6), bi_register(7), bi_register(8), + BI_REGISTER_FORMAT_AUTO, 4, 4); + I->flow = VA_FLOW_RECONVERGE; + }); } -TEST_F(MergeFlow, TrivialWait) { - 
CASE({ - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - flow(WAIT0126); - bi_atest_to(b, bi_register(0), bi_register(4), bi_register(5), atest); - }, - { - I = bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - I->flow = VA_FLOW_WAIT0126; - bi_atest_to(b, bi_register(0), bi_register(4), bi_register(5), atest); - }); +TEST_F(MergeFlow, TrivialWait) +{ + CASE( + { + bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); + flow(WAIT0126); + bi_atest_to(b, bi_register(0), bi_register(4), bi_register(5), atest); + }, + { + I = bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); + I->flow = VA_FLOW_WAIT0126; + bi_atest_to(b, bi_register(0), bi_register(4), bi_register(5), atest); + }); } -TEST_F(MergeFlow, LoadThenUnrelatedThenUse) { - CASE({ +TEST_F(MergeFlow, LoadThenUnrelatedThenUse) +{ + CASE( + { bi_ld_attr_imm_to(b, bi_register(16), bi_register(60), bi_register(61), BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4, 1); bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); flow(WAIT0); bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(19)); flow(END); - }, - { + }, + { bi_ld_attr_imm_to(b, bi_register(16), bi_register(60), bi_register(61), BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4, 1); I = bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); I->flow = VA_FLOW_WAIT0; I = bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(19)); I->flow = VA_FLOW_END; - }); + }); } -TEST_F(MergeFlow, TrivialDiscard) { - CASE({ +TEST_F(MergeFlow, TrivialDiscard) +{ + CASE( + { bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); bi_clper_i32_to(b, bi_register(0), bi_register(4), bi_register(8), BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, @@ -140,31 +153,35 @@ TEST_F(MergeFlow, TrivialDiscard) { flow(DISCARD); bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); flow(END); - }, - { + }, + { bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); I = bi_clper_i32_to(b, bi_register(0), bi_register(4), bi_register(8), - BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, - BI_SUBGROUP_SUBGROUP4); + BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, + BI_SUBGROUP_SUBGROUP4); I->flow = VA_FLOW_DISCARD; I = bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); I->flow = VA_FLOW_END; - }); + }); } -TEST_F(MergeFlow, TrivialDiscardAtTheStart) { - CASE({ +TEST_F(MergeFlow, TrivialDiscardAtTheStart) +{ + CASE( + { flow(DISCARD); bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - }, - { + }, + { I = bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); I->flow = VA_FLOW_DISCARD; - }); + }); } -TEST_F(MergeFlow, MoveDiscardPastWait) { - CASE({ +TEST_F(MergeFlow, MoveDiscardPastWait) +{ + CASE( + { bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); bi_clper_i32_to(b, bi_register(0), bi_register(4), bi_register(8), BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, @@ -172,20 +189,22 @@ TEST_F(MergeFlow, MoveDiscardPastWait) { flow(DISCARD); flow(WAIT0); bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - }, - { + }, + { bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); I = bi_clper_i32_to(b, bi_register(0), bi_register(4), bi_register(8), - BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, - BI_SUBGROUP_SUBGROUP4); + BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, + BI_SUBGROUP_SUBGROUP4); I->flow = VA_FLOW_WAIT0; I = bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); I->flow = VA_FLOW_DISCARD; - }); + }); } 
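Each of these cases follows the same pattern: build the input program, run va_merge_flow, and compare against a hand-written expectation. As a hedged illustration of how a further case could be added in the same style, the smallest useful one pairs a single ALU instruction with a trailing wait; judging by the LoadThenUnrelatedThenUse and WaitWithMessage cases in this file, the wait should fold onto the preceding instruction. The test name below is hypothetical and the case is illustrative only, not part of this change:

TEST_F(MergeFlow, SketchFoldTrailingWait)
{
   /* Sketch: a flow-only NOP carrying WAIT0 after a free ALU op is expected
    * to be deleted, with its wait moved onto that op. Uses the CASE and
    * flow() helpers defined at the top of this file.
    */
   CASE(
      {
         bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0));
         flow(WAIT0);
      },
      {
         I = bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0));
         I->flow = VA_FLOW_WAIT0;
      });
}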
-TEST_F(MergeFlow, OccludedWaitsAndDiscard) { - CASE({ +TEST_F(MergeFlow, OccludedWaitsAndDiscard) +{ + CASE( + { bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); bi_clper_i32_to(b, bi_register(0), bi_register(4), bi_register(8), BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, @@ -194,75 +213,84 @@ TEST_F(MergeFlow, OccludedWaitsAndDiscard) { flow(DISCARD); flow(WAIT2); bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - }, - { + }, + { bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); I = bi_clper_i32_to(b, bi_register(0), bi_register(4), bi_register(8), - BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, - BI_SUBGROUP_SUBGROUP4); + BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, + BI_SUBGROUP_SUBGROUP4); I->flow = VA_FLOW_WAIT02; I = bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); I->flow = VA_FLOW_DISCARD; - }); + }); } -TEST_F(MergeFlow, DeleteUselessWaits) { - CASE({ +TEST_F(MergeFlow, DeleteUselessWaits) +{ + CASE( + { bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); flow(WAIT0); flow(WAIT2); flow(END); - }, - { + }, + { bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); I = bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); I->flow = VA_FLOW_END; - }); + }); } -TEST_F(MergeFlow, BlockFullOfUselessWaits) { - CASE({ +TEST_F(MergeFlow, BlockFullOfUselessWaits) +{ + CASE( + { flow(WAIT0); flow(WAIT2); flow(DISCARD); flow(END); - }, - { - flow(END); - }); + }, + { flow(END); }); } -TEST_F(MergeFlow, WaitWithMessage) { - CASE({ +TEST_F(MergeFlow, WaitWithMessage) +{ + CASE( + { bi_ld_attr_imm_to(b, bi_register(16), bi_register(60), bi_register(61), BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4, 1); flow(WAIT0); - }, - { - I = bi_ld_attr_imm_to(b, bi_register(16), bi_register(60), bi_register(61), - BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4, 1); + }, + { + I = bi_ld_attr_imm_to(b, bi_register(16), bi_register(60), + bi_register(61), BI_REGISTER_FORMAT_F32, + BI_VECSIZE_V4, 1); I->flow = VA_FLOW_WAIT0; - }); + }); } -TEST_F(MergeFlow, CantMoveWaitPastMessage) { +TEST_F(MergeFlow, CantMoveWaitPastMessage) +{ NEGCASE({ - bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - I = bi_ld_attr_imm_to(b, bi_register(16), bi_register(60), bi_register(61), + bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); + I = + bi_ld_attr_imm_to(b, bi_register(16), bi_register(60), bi_register(61), BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4, 1); - /* Pretend it's blocked for some reason. This doesn't actually happen - * with the current algorithm, but it's good to handle the special - * cases correctly in case we change later on. - */ - I->flow = VA_FLOW_DISCARD; - flow(WAIT0); + /* Pretend it's blocked for some reason. This doesn't actually happen + * with the current algorithm, but it's good to handle the special + * cases correctly in case we change later on. 
+ */ + I->flow = VA_FLOW_DISCARD; + flow(WAIT0); }); } -TEST_F(MergeFlow, DeletePointlessDiscard) { - CASE({ +TEST_F(MergeFlow, DeletePointlessDiscard) +{ + CASE( + { bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); bi_tex_single_to(b, bi_register(0), bi_register(4), bi_register(8), bi_register(12), false, BI_DIMENSION_2D, @@ -277,31 +305,34 @@ TEST_F(MergeFlow, DeletePointlessDiscard) { bi_register(6), bi_register(7), bi_register(8), BI_REGISTER_FORMAT_AUTO, 4, 4); flow(END); - }, - { + }, + { bi_fadd_f32_to(b, bi_register(0), bi_register(0), bi_register(0)); - I = bi_tex_single_to(b, bi_register(0), bi_register(4), bi_register(8), - bi_register(12), false, BI_DIMENSION_2D, - BI_REGISTER_FORMAT_F32, false, false, - BI_VA_LOD_MODE_COMPUTED_LOD, BI_WRITE_MASK_RGBA, 4); + I = bi_tex_single_to( + b, bi_register(0), bi_register(4), bi_register(8), bi_register(12), + false, BI_DIMENSION_2D, BI_REGISTER_FORMAT_F32, false, false, + BI_VA_LOD_MODE_COMPUTED_LOD, BI_WRITE_MASK_RGBA, 4); I->flow = VA_FLOW_WAIT0126; - I = bi_atest_to(b, bi_register(0), bi_register(4), bi_register(5), atest); + I = bi_atest_to(b, bi_register(0), bi_register(4), bi_register(5), + atest); I->flow = VA_FLOW_WAIT; I = bi_blend_to(b, bi_register(0), bi_register(4), bi_register(5), bi_register(6), bi_register(7), bi_register(8), BI_REGISTER_FORMAT_AUTO, 4, 4); I->flow = VA_FLOW_END; - }); + }); } -TEST_F(MergeFlow, PreserveTerminalBarriers) { - CASE({ +TEST_F(MergeFlow, PreserveTerminalBarriers) +{ + CASE( + { bi_barrier(b); flow(WAIT); flow(END); - }, - { + }, + { bi_barrier(b)->flow = VA_FLOW_WAIT; flow(END); - }); + }); } diff --git a/src/panfrost/bifrost/valhall/test/test-packing.cpp b/src/panfrost/bifrost/valhall/test/test-packing.cpp index 5e8cd7c0f42..b7428497897 100644 --- a/src/panfrost/bifrost/valhall/test/test-packing.cpp +++ b/src/panfrost/bifrost/valhall/test/test-packing.cpp @@ -21,34 +21,38 @@ * SOFTWARE. 
*/ -#include "va_compiler.h" -#include "bi_test.h" #include "bi_builder.h" +#include "bi_test.h" +#include "va_compiler.h" #include -#define CASE(instr, expected) do { \ - uint64_t _value = va_pack_instr(instr); \ - if (_value != expected) { \ - fprintf(stderr, "Got %" PRIx64 ", expected %" PRIx64 "\n", _value, (uint64_t) expected); \ - bi_print_instr(instr, stderr); \ - fprintf(stderr, "\n"); \ - ADD_FAILURE(); \ - } \ -} while(0) +#define CASE(instr, expected) \ + do { \ + uint64_t _value = va_pack_instr(instr); \ + if (_value != expected) { \ + fprintf(stderr, "Got %" PRIx64 ", expected %" PRIx64 "\n", _value, \ + (uint64_t)expected); \ + bi_print_instr(instr, stderr); \ + fprintf(stderr, "\n"); \ + ADD_FAILURE(); \ + } \ + } while (0) class ValhallPacking : public testing::Test { -protected: - ValhallPacking() { + protected: + ValhallPacking() + { mem_ctx = ralloc_context(NULL); b = bit_builder(mem_ctx); - zero = bi_fau((enum bir_fau) (BIR_FAU_IMMEDIATE | 0), false); - one = bi_fau((enum bir_fau) (BIR_FAU_IMMEDIATE | 8), false); - n4567 = bi_fau((enum bir_fau) (BIR_FAU_IMMEDIATE | 4), true); + zero = bi_fau((enum bir_fau)(BIR_FAU_IMMEDIATE | 0), false); + one = bi_fau((enum bir_fau)(BIR_FAU_IMMEDIATE | 8), false); + n4567 = bi_fau((enum bir_fau)(BIR_FAU_IMMEDIATE | 4), true); } - ~ValhallPacking() { + ~ValhallPacking() + { ralloc_free(mem_ctx); } @@ -57,60 +61,67 @@ protected: bi_index zero, one, n4567; }; -TEST_F(ValhallPacking, Moves) { +TEST_F(ValhallPacking, Moves) +{ CASE(bi_mov_i32_to(b, bi_register(1), bi_register(2)), - 0x0091c10000000002ULL); - CASE(bi_mov_i32_to(b, bi_register(1), bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 5), false)), - 0x0091c1000000008aULL); + 0x0091c10000000002ULL); + CASE(bi_mov_i32_to(b, bi_register(1), + bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 5), false)), + 0x0091c1000000008aULL); } -TEST_F(ValhallPacking, Fadd) { +TEST_F(ValhallPacking, Fadd) +{ CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_register(2)), - 0x00a4c00000000201ULL); - CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_abs(bi_register(2))), - 0x00a4c02000000201ULL); - CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(bi_register(2))), - 0x00a4c01000000201ULL); + 0x00a4c00000000201ULL); + CASE( + bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_abs(bi_register(2))), + 0x00a4c02000000201ULL); + CASE( + bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(bi_register(2))), + 0x00a4c01000000201ULL); - CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_swz_16(bi_register(1), false, false), + CASE(bi_fadd_v2f16_to(b, bi_register(0), + bi_swz_16(bi_register(1), false, false), bi_swz_16(bi_register(0), true, true)), - 0x00a5c0000c000001ULL); + 0x00a5c0000c000001ULL); CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_register(1), bi_register(0)), - 0x00a5c00028000001ULL); + 0x00a5c00028000001ULL); CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_register(1), bi_swz_16(bi_register(0), true, false)), - 0x00a5c00024000001ULL); + 0x00a5c00024000001ULL); CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_discard(bi_abs(bi_register(0))), bi_neg(zero)), - 0x00a5c0902800c040ULL); + 0x00a5c0902800c040ULL); - CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), - zero), - 0x00a4c0000000c001ULL); + CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), zero), + 0x00a4c0000000c001ULL); - CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), - bi_neg(zero)), - 0x00a4c0100000c001ULL); + CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(zero)), + 0x00a4c0100000c001ULL); 
CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_half(bi_register(0), true)), - 0x00a4c00008000001ULL); + 0x00a4c00008000001ULL); CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_half(bi_register(0), false)), - 0x00a4c00004000001ULL); + 0x00a4c00004000001ULL); } -TEST_F(ValhallPacking, Clper) { +TEST_F(ValhallPacking, Clper) +{ CASE(bi_clper_i32_to(b, bi_register(0), bi_register(0), bi_byte(n4567, 0), - BI_INACTIVE_RESULT_F1, BI_LANE_OP_NONE, BI_SUBGROUP_SUBGROUP16), - 0x00a0c030128fc900); + BI_INACTIVE_RESULT_F1, BI_LANE_OP_NONE, + BI_SUBGROUP_SUBGROUP16), + 0x00a0c030128fc900); } -TEST_F(ValhallPacking, Clamps) { +TEST_F(ValhallPacking, Clamps) +{ bi_instr *I = bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(bi_abs(bi_register(2)))); CASE(I, 0x00a4c03000000201ULL); @@ -119,209 +130,243 @@ TEST_F(ValhallPacking, Clamps) { CASE(I, 0x00a4c03200000201ULL); } -TEST_F(ValhallPacking, Misc) { +TEST_F(ValhallPacking, Misc) +{ CASE(bi_fma_f32_to(b, bi_register(1), bi_discard(bi_register(1)), - bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 4), false), - bi_neg(zero)), - 0x00b2c10400c08841ULL); + bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 4), false), + bi_neg(zero)), + 0x00b2c10400c08841ULL); CASE(bi_fround_f32_to(b, bi_register(2), bi_discard(bi_neg(bi_register(2))), BI_ROUND_RTN), - 0x0090c240800d0042ULL); + 0x0090c240800d0042ULL); CASE(bi_fround_v2f16_to(b, bi_half(bi_register(0), false), bi_register(0), - BI_ROUND_RTN), - 0x00904000a00f0000ULL); + BI_ROUND_RTN), + 0x00904000a00f0000ULL); - CASE(bi_fround_v2f16_to(b, bi_half(bi_register(0), false), - bi_swz_16(bi_register(1), true, false), BI_ROUND_RTN), - 0x00904000900f0001ULL); + CASE( + bi_fround_v2f16_to(b, bi_half(bi_register(0), false), + bi_swz_16(bi_register(1), true, false), BI_ROUND_RTN), + 0x00904000900f0001ULL); } -TEST_F(ValhallPacking, FaddImm) { - CASE(bi_fadd_imm_f32_to(b, bi_register(2), bi_discard(bi_register(2)), 0x4847C6C0), - 0x0114C24847C6C042ULL); +TEST_F(ValhallPacking, FaddImm) +{ + CASE(bi_fadd_imm_f32_to(b, bi_register(2), bi_discard(bi_register(2)), + 0x4847C6C0), + 0x0114C24847C6C042ULL); - CASE(bi_fadd_imm_v2f16_to(b, bi_register(2), bi_discard(bi_register(2)), 0x70AC6784), - 0x0115C270AC678442ULL); + CASE(bi_fadd_imm_v2f16_to(b, bi_register(2), bi_discard(bi_register(2)), + 0x70AC6784), + 0x0115C270AC678442ULL); } -TEST_F(ValhallPacking, Comparions) { +TEST_F(ValhallPacking, Comparions) +{ CASE(bi_icmp_or_v2s16_to(b, bi_register(2), - bi_discard(bi_swz_16(bi_register(3), true, false)), - bi_discard(bi_swz_16(bi_register(2), true, false)), - zero, BI_CMPF_GT, BI_RESULT_TYPE_M1), + bi_discard(bi_swz_16(bi_register(3), true, false)), + bi_discard(bi_swz_16(bi_register(2), true, false)), + zero, BI_CMPF_GT, BI_RESULT_TYPE_M1), 0x00f9c21184c04243); CASE(bi_fcmp_or_v2f16_to(b, bi_register(2), - bi_discard(bi_swz_16(bi_register(3), true, false)), - bi_discard(bi_swz_16(bi_register(2), false, false)), - zero, BI_CMPF_GT, BI_RESULT_TYPE_M1), - 0x00f5c20190c04243); + bi_discard(bi_swz_16(bi_register(3), true, false)), + bi_discard(bi_swz_16(bi_register(2), false, false)), + zero, BI_CMPF_GT, BI_RESULT_TYPE_M1), + 0x00f5c20190c04243); } -TEST_F(ValhallPacking, Conversions) { +TEST_F(ValhallPacking, Conversions) +{ CASE(bi_v2s16_to_v2f16_to(b, bi_register(2), bi_discard(bi_register(2))), - 0x0090c22000070042); + 0x0090c22000070042); } -TEST_F(ValhallPacking, BranchzI16) { - bi_instr *I = bi_branchz_i16(b, bi_half(bi_register(2), false), bi_null(), BI_CMPF_EQ); +TEST_F(ValhallPacking, BranchzI16) +{ + 
bi_instr *I = + bi_branchz_i16(b, bi_half(bi_register(2), false), bi_null(), BI_CMPF_EQ); I->branch_offset = 1; CASE(I, 0x001fc03000000102); } -TEST_F(ValhallPacking, BranchzI16Backwards) { +TEST_F(ValhallPacking, BranchzI16Backwards) +{ bi_instr *I = bi_branchz_i16(b, zero, bi_null(), BI_CMPF_EQ); I->branch_offset = -8; CASE(I, 0x001fc017fffff8c0); } -TEST_F(ValhallPacking, Blend) { - CASE(bi_blend_to(b, bi_null(), bi_register(0), bi_register(60), - bi_fau(BIR_FAU_BLEND_0, false), - bi_fau(BIR_FAU_BLEND_0, true), - bi_null(), BI_REGISTER_FORMAT_F16, 2, 0), - 0x007f4004333c00f0); +TEST_F(ValhallPacking, Blend) +{ + CASE( + bi_blend_to(b, bi_null(), bi_register(0), bi_register(60), + bi_fau(BIR_FAU_BLEND_0, false), bi_fau(BIR_FAU_BLEND_0, true), + bi_null(), BI_REGISTER_FORMAT_F16, 2, 0), + 0x007f4004333c00f0); } -TEST_F(ValhallPacking, Mux) { +TEST_F(ValhallPacking, Mux) +{ CASE(bi_mux_i32_to(b, bi_register(0), bi_discard(bi_register(0)), bi_discard(bi_register(4)), - bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 0), false), BI_MUX_BIT), + bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 0), false), + BI_MUX_BIT), 0x00b8c00300804440ull); } -TEST_F(ValhallPacking, AtestFP16) { +TEST_F(ValhallPacking, AtestFP16) +{ CASE(bi_atest_to(b, bi_register(60), bi_register(60), bi_half(bi_register(1), true), bi_fau(BIR_FAU_ATEST_PARAM, false)), 0x007dbc0208ea013c); } -TEST_F(ValhallPacking, AtestFP32) { +TEST_F(ValhallPacking, AtestFP32) +{ CASE(bi_atest_to(b, bi_register(60), bi_register(60), one, bi_fau(BIR_FAU_ATEST_PARAM, false)), 0x007dbc0200ead03c); } -TEST_F(ValhallPacking, Transcendentals) { +TEST_F(ValhallPacking, Transcendentals) +{ CASE(bi_frexpm_f32_to(b, bi_register(1), bi_register(0), false, true), 0x0099c10001000000); - CASE(bi_frexpe_f32_to(b, bi_register(0), bi_discard(bi_register(0)), false, true), + CASE(bi_frexpe_f32_to(b, bi_register(0), bi_discard(bi_register(0)), false, + true), 0x0099c00001020040); - CASE(bi_frsq_f32_to(b, bi_register(2), bi_register(1)), - 0x009cc20000020001); + CASE(bi_frsq_f32_to(b, bi_register(2), bi_register(1)), 0x009cc20000020001); - CASE(bi_fma_rscale_f32_to(b, bi_register(0), bi_discard(bi_register(1)), bi_discard(bi_register(2)), bi_neg(zero), bi_discard(bi_register(0)), BI_SPECIAL_LEFT), + CASE(bi_fma_rscale_f32_to(b, bi_register(0), bi_discard(bi_register(1)), + bi_discard(bi_register(2)), bi_neg(zero), + bi_discard(bi_register(0)), BI_SPECIAL_LEFT), 0x0162c00440c04241); } -TEST_F(ValhallPacking, Csel) { +TEST_F(ValhallPacking, Csel) +{ CASE(bi_csel_u32_to(b, bi_register(1), bi_discard(bi_register(2)), bi_discard(bi_register(3)), - bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 2), false), - bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 2), true), + bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false), + bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true), BI_CMPF_EQ), 0x0150c10085844342); CASE(bi_csel_u32_to(b, bi_register(1), bi_discard(bi_register(2)), bi_discard(bi_register(3)), - bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 2), false), - bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 2), true), + bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false), + bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true), BI_CMPF_LT), 0x0150c10485844342); CASE(bi_csel_s32_to(b, bi_register(1), bi_discard(bi_register(2)), bi_discard(bi_register(3)), - bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 2), false), - bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 2), true), + bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false), + bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true), BI_CMPF_LT), 0x0158c10485844342); } 
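The packing CASE macro defined at the top of this file reduces each of these checks to one call into va_pack_instr plus a comparison against a hand-computed 64-bit word. Written out without the macro, and re-using the register-to-register MOV encoding from the Moves test earlier in this file, a single check looks roughly like this (a sketch under the same assumptions as the tests themselves: bit_builder, the bi_* builders, and va_pack_instr behave as used throughout this file):

   void *ctx = ralloc_context(NULL);
   bi_builder *b = bit_builder(ctx);

   /* Build MOV.i32 r1, r2 and pack it to its 64-bit Valhall encoding. */
   bi_instr *I = bi_mov_i32_to(b, bi_register(1), bi_register(2));
   uint64_t bits = va_pack_instr(I);

   /* Expected word taken from the Moves case above. */
   assert(bits == 0x0091c10000000002ULL);

   ralloc_free(ctx);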
-TEST_F(ValhallPacking, LdAttrImm) { - bi_instr *I = bi_ld_attr_imm_to(b, bi_register(0), - bi_discard(bi_register(60)), - bi_discard(bi_register(61)), - BI_REGISTER_FORMAT_F16, BI_VECSIZE_V4, 1); +TEST_F(ValhallPacking, LdAttrImm) +{ + bi_instr *I = bi_ld_attr_imm_to( + b, bi_register(0), bi_discard(bi_register(60)), + bi_discard(bi_register(61)), BI_REGISTER_FORMAT_F16, BI_VECSIZE_V4, 1); I->table = 1; CASE(I, 0x0066800433117d7c); } -TEST_F(ValhallPacking, LdVarBufImmF16) { +TEST_F(ValhallPacking, LdVarBufImmF16) +{ CASE(bi_ld_var_buf_imm_f16_to(b, bi_register(2), bi_register(61), BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTER, - BI_SOURCE_FORMAT_F16, - BI_UPDATE_RETRIEVE, BI_VECSIZE_V4, 0), + BI_SOURCE_FORMAT_F16, BI_UPDATE_RETRIEVE, + BI_VECSIZE_V4, 0), 0x005d82143300003d); CASE(bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61), BI_REGISTER_FORMAT_F16, BI_SAMPLE_SAMPLE, - BI_SOURCE_FORMAT_F16, - BI_UPDATE_STORE, BI_VECSIZE_V4, 0), - 0x005d80843300003d); + BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE, + BI_VECSIZE_V4, 0), + 0x005d80843300003d); CASE(bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61), BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTROID, - BI_SOURCE_FORMAT_F16, - BI_UPDATE_STORE, BI_VECSIZE_V4, 8), - 0x005d80443308003d); + BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE, + BI_VECSIZE_V4, 8), + 0x005d80443308003d); } -TEST_F(ValhallPacking, LeaBufImm) { +TEST_F(ValhallPacking, LeaBufImm) +{ CASE(bi_lea_buf_imm_to(b, bi_register(4), bi_discard(bi_register(59))), 0x005e840400000d7b); } -TEST_F(ValhallPacking, StoreSegment) { +TEST_F(ValhallPacking, StoreSegment) +{ CASE(bi_store_i96(b, bi_register(0), bi_discard(bi_register(4)), - bi_discard(bi_register(5)), BI_SEG_VARY, 0), + bi_discard(bi_register(5)), BI_SEG_VARY, 0), 0x0061400632000044); } -TEST_F(ValhallPacking, Convert16To32) { - CASE(bi_u16_to_u32_to(b, bi_register(2), bi_discard(bi_swz_16(bi_register(55), false, false))), - 0x0090c20000140077); +TEST_F(ValhallPacking, Convert16To32) +{ + CASE(bi_u16_to_u32_to(b, bi_register(2), + bi_discard(bi_swz_16(bi_register(55), false, false))), + 0x0090c20000140077); - CASE(bi_u16_to_u32_to(b, bi_register(2), bi_discard(bi_swz_16(bi_register(55), true, false))), - 0x0090c20010140077); + CASE(bi_u16_to_u32_to(b, bi_register(2), + bi_discard(bi_swz_16(bi_register(55), true, false))), + 0x0090c20010140077); - CASE(bi_u16_to_f32_to(b, bi_register(2), bi_discard(bi_swz_16(bi_register(55), false, false))), - 0x0090c20000150077); + CASE(bi_u16_to_f32_to(b, bi_register(2), + bi_discard(bi_swz_16(bi_register(55), false, false))), + 0x0090c20000150077); - CASE(bi_u16_to_f32_to(b, bi_register(2), bi_discard(bi_swz_16(bi_register(55), true, false))), - 0x0090c20010150077); + CASE(bi_u16_to_f32_to(b, bi_register(2), + bi_discard(bi_swz_16(bi_register(55), true, false))), + 0x0090c20010150077); - CASE(bi_s16_to_s32_to(b, bi_register(2), bi_discard(bi_swz_16(bi_register(55), false, false))), - 0x0090c20000040077); + CASE(bi_s16_to_s32_to(b, bi_register(2), + bi_discard(bi_swz_16(bi_register(55), false, false))), + 0x0090c20000040077); - CASE(bi_s16_to_s32_to(b, bi_register(2), bi_discard(bi_swz_16(bi_register(55), true, false))), - 0x0090c20010040077); + CASE(bi_s16_to_s32_to(b, bi_register(2), + bi_discard(bi_swz_16(bi_register(55), true, false))), + 0x0090c20010040077); } -TEST_F(ValhallPacking, Swizzle8) { - CASE(bi_icmp_or_v4u8_to(b, bi_register(1), bi_byte(bi_register(0), 0), - zero, zero, BI_CMPF_NE, BI_RESULT_TYPE_I1), +TEST_F(ValhallPacking, Swizzle8) +{ + CASE(bi_icmp_or_v4u8_to(b, 
bi_register(1), bi_byte(bi_register(0), 0), zero, + zero, BI_CMPF_NE, BI_RESULT_TYPE_I1), 0x00f2c14300c0c000); } -TEST_F(ValhallPacking, FauPage1) { - CASE(bi_mov_i32_to(b, bi_register(1), bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 32), false)), - 0x0291c10000000080ULL); +TEST_F(ValhallPacking, FauPage1) +{ + CASE(bi_mov_i32_to(b, bi_register(1), + bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 32), false)), + 0x0291c10000000080ULL); } -TEST_F(ValhallPacking, LdTileV3F16) { +TEST_F(ValhallPacking, LdTileV3F16) +{ CASE(bi_ld_tile_to(b, bi_register(4), bi_discard(bi_register(0)), - bi_register(60), bi_register(3), - BI_REGISTER_FORMAT_F16, BI_VECSIZE_V3), + bi_register(60), bi_register(3), BI_REGISTER_FORMAT_F16, + BI_VECSIZE_V3), 0x0078840423033c40); } -TEST_F(ValhallPacking, Rhadd8) { +TEST_F(ValhallPacking, Rhadd8) +{ CASE(bi_hadd_v4s8_to(b, bi_register(0), bi_discard(bi_register(1)), bi_discard(bi_register(0)), BI_ROUND_RTP), 0x00aac000400b4041); diff --git a/src/panfrost/bifrost/valhall/test/test-validate-fau.cpp b/src/panfrost/bifrost/valhall/test/test-validate-fau.cpp index a704d31af30..553a9ad343e 100644 --- a/src/panfrost/bifrost/valhall/test/test-validate-fau.cpp +++ b/src/panfrost/bifrost/valhall/test/test-validate-fau.cpp @@ -21,41 +21,44 @@ * SOFTWARE. */ -#include "va_compiler.h" -#include "bi_test.h" #include "bi_builder.h" +#include "bi_test.h" +#include "va_compiler.h" #include -#define CASE(instr, expected) do { \ - if (va_validate_fau(instr) != expected) { \ - fprintf(stderr, "Incorrect validation for:\n"); \ - bi_print_instr(instr, stderr); \ - fprintf(stderr, "\n"); \ - ADD_FAILURE(); \ - } \ -} while(0) +#define CASE(instr, expected) \ + do { \ + if (va_validate_fau(instr) != expected) { \ + fprintf(stderr, "Incorrect validation for:\n"); \ + bi_print_instr(instr, stderr); \ + fprintf(stderr, "\n"); \ + ADD_FAILURE(); \ + } \ + } while (0) -#define VALID(instr) CASE(instr, true) +#define VALID(instr) CASE(instr, true) #define INVALID(instr) CASE(instr, false) class ValidateFau : public testing::Test { -protected: - ValidateFau() { + protected: + ValidateFau() + { mem_ctx = ralloc_context(NULL); b = bit_builder(mem_ctx); - zero = bi_fau((enum bir_fau) (BIR_FAU_IMMEDIATE | 0), false); - imm1 = bi_fau((enum bir_fau) (BIR_FAU_IMMEDIATE | 1), false); - imm2 = bi_fau((enum bir_fau) (BIR_FAU_IMMEDIATE | 2), false); - unif = bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 5), false); - unif_hi = bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 5), true); - unif2 = bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 6), false); + zero = bi_fau((enum bir_fau)(BIR_FAU_IMMEDIATE | 0), false); + imm1 = bi_fau((enum bir_fau)(BIR_FAU_IMMEDIATE | 1), false); + imm2 = bi_fau((enum bir_fau)(BIR_FAU_IMMEDIATE | 2), false); + unif = bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 5), false); + unif_hi = bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 5), true); + unif2 = bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 6), false); core_id = bi_fau(BIR_FAU_CORE_ID, false); lane_id = bi_fau(BIR_FAU_LANE_ID, false); } - ~ValidateFau() { + ~ValidateFau() + { ralloc_free(mem_ctx); } @@ -66,8 +69,8 @@ protected: TEST_F(ValidateFau, One64BitUniformSlot) { - VALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), bi_register(3), - unif)); + VALID( + bi_fma_f32_to(b, bi_register(1), bi_register(2), bi_register(3), unif)); VALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), unif_hi, unif)); VALID(bi_fma_f32_to(b, bi_register(1), unif, unif, unif_hi)); INVALID(bi_fma_f32_to(b, bi_register(1), unif, unif2, bi_register(1))); @@ -77,8 +80,8 @@ 
TEST_F(ValidateFau, One64BitUniformSlot) * marked as valid in early versions of the validator. */ INVALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), - bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 0), false), - bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 1), true))); + bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 0), false), + bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 1), true))); } TEST_F(ValidateFau, Combined64BitUniformsConstants) @@ -99,17 +102,16 @@ TEST_F(ValidateFau, UniformsOnlyInDefaultMode) TEST_F(ValidateFau, SingleSpecialImmediate) { VALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), bi_register(2), - lane_id)); + lane_id)); VALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), bi_register(2), - core_id)); - INVALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), lane_id, - core_id)); + core_id)); + INVALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), lane_id, core_id)); } TEST_F(ValidateFau, SmokeTests) { VALID(bi_mov_i32_to(b, bi_register(1), bi_register(2))); VALID(bi_mov_i32_to(b, bi_register(1), unif)); - VALID(bi_fma_f32_to(b, bi_register(1), bi_discard(bi_register(1)), - unif, bi_neg(zero))); + VALID(bi_fma_f32_to(b, bi_register(1), bi_discard(bi_register(1)), unif, + bi_neg(zero))); } diff --git a/src/panfrost/bifrost/valhall/va_compiler.h b/src/panfrost/bifrost/valhall/va_compiler.h index 534f0a0ee91..24d71b3536b 100644 --- a/src/panfrost/bifrost/valhall/va_compiler.h +++ b/src/panfrost/bifrost/valhall/va_compiler.h @@ -79,7 +79,7 @@ va_select_fau_page(const bi_instr *I) { bi_foreach_src(I, s) { if (I->src[s].type == BI_INDEX_FAU) - return va_fau_page((enum bir_fau) I->src[s].value); + return va_fau_page((enum bir_fau)I->src[s].value); } return 0; @@ -91,8 +91,7 @@ struct va_stats { unsigned fma, cvt, sfu, v, ls, t; }; -void -va_count_instr_stats(bi_instr *I, struct va_stats *stats); +void va_count_instr_stats(bi_instr *I, struct va_stats *stats); #ifdef __cplusplus } /* extern C */ diff --git a/src/panfrost/bifrost/valhall/va_insert_flow.c b/src/panfrost/bifrost/valhall/va_insert_flow.c index 68eb808b45c..5cbe6a13ad6 100644 --- a/src/panfrost/bifrost/valhall/va_insert_flow.c +++ b/src/panfrost/bifrost/valhall/va_insert_flow.c @@ -21,9 +21,9 @@ * SOFTWARE. */ +#include "bi_builder.h" #include "va_compiler.h" #include "valhall_enums.h" -#include "bi_builder.h" /* * Insert flow control into a scheduled and register allocated shader. This @@ -176,7 +176,8 @@ bi_depend_on_writers(struct bi_scoreboard_state *st, uint64_t regmask) /* Sets the dependencies for a given clause, updating the model */ static void -bi_set_dependencies(bi_block *block, bi_instr *I, struct bi_scoreboard_state *st) +bi_set_dependencies(bi_block *block, bi_instr *I, + struct bi_scoreboard_state *st) { /* Depend on writers to handle read-after-write and write-after-write * dependencies. Write-after-read dependencies are handled in the hardware @@ -482,7 +483,8 @@ va_insert_flow_control_nops(bi_context *ctx) */ if (va_should_end(block) || block->needs_nop) { /* Don't bother adding a NOP into an unreachable block */ - if (block == bi_start_block(&ctx->blocks) || bi_num_predecessors(block)) + if (block == bi_start_block(&ctx->blocks) || + bi_num_predecessors(block)) bi_flow(ctx, bi_after_block(block), VA_FLOW_END); } else if (bi_reconverge_branches(block)) { /* TODO: Do we have ever need to reconverge from an empty block? 
*/ diff --git a/src/panfrost/bifrost/valhall/va_lower_constants.c b/src/panfrost/bifrost/valhall/va_lower_constants.c index e5a8fd7e224..be5a40586c4 100644 --- a/src/panfrost/bifrost/valhall/va_lower_constants.c +++ b/src/panfrost/bifrost/valhall/va_lower_constants.c @@ -21,9 +21,9 @@ * SOFTWARE. */ +#include "bi_builder.h" #include "va_compiler.h" #include "valhall.h" -#include "bi_builder.h" /* Only some special immediates are available, as specified in the Table of * Immediates in the specification. Other immediates must be lowered, either to @@ -51,7 +51,7 @@ va_lut_index_32(uint32_t imm) static bi_index va_lut_index_16(uint16_t imm) { - uint16_t *arr16 = (uint16_t *) valhall_immediates; + uint16_t *arr16 = (uint16_t *)valhall_immediates; for (unsigned i = 0; i < (2 * ARRAY_SIZE(valhall_immediates)); ++i) { if (arr16[i] == imm) @@ -64,7 +64,7 @@ va_lut_index_16(uint16_t imm) UNUSED static bi_index va_lut_index_8(uint8_t imm) { - uint8_t *arr8 = (uint8_t *) valhall_immediates; + uint8_t *arr8 = (uint8_t *)valhall_immediates; for (unsigned i = 0; i < (4 * ARRAY_SIZE(valhall_immediates)); ++i) { if (arr8[i] == imm) @@ -109,36 +109,43 @@ is_extension_of_16(uint32_t x, bool is_signed) } static bi_index -va_resolve_constant(bi_builder *b, uint32_t value, struct va_src_info info, bool is_signed, bool staging) +va_resolve_constant(bi_builder *b, uint32_t value, struct va_src_info info, + bool is_signed, bool staging) { /* Try the constant as-is */ if (!staging) { bi_index lut = va_lut_index_32(value); - if (!bi_is_null(lut)) return lut; + if (!bi_is_null(lut)) + return lut; /* ...or negated as a FP32 constant */ if (info.absneg && info.size == VA_SIZE_32) { lut = bi_neg(va_lut_index_32(fui(-uif(value)))); - if (!bi_is_null(lut)) return lut; + if (!bi_is_null(lut)) + return lut; } /* ...or negated as a FP16 constant */ if (info.absneg && info.size == VA_SIZE_16) { lut = bi_neg(va_lut_index_32(value ^ 0x80008000)); - if (!bi_is_null(lut)) return lut; + if (!bi_is_null(lut)) + return lut; } } /* Try using a single half of a FP16 constant */ bool replicated_halves = (value & 0xFFFF) == (value >> 16); - if (!staging && info.swizzle && info.size == VA_SIZE_16 && replicated_halves) { + if (!staging && info.swizzle && info.size == VA_SIZE_16 && + replicated_halves) { bi_index lut = va_lut_index_16(value & 0xFFFF); - if (!bi_is_null(lut)) return lut; + if (!bi_is_null(lut)) + return lut; /* ...possibly negated */ if (info.absneg) { lut = bi_neg(va_lut_index_16((value & 0xFFFF) ^ 0x8000)); - if (!bi_is_null(lut)) return lut; + if (!bi_is_null(lut)) + return lut; } } @@ -147,25 +154,28 @@ va_resolve_constant(bi_builder *b, uint32_t value, struct va_src_info info, bool is_extension_of_8(value, is_signed)) { bi_index lut = va_lut_index_8(value & 0xFF); - if (!bi_is_null(lut)) return lut; + if (!bi_is_null(lut)) + return lut; } /* Try extending a halfword */ - if (!staging && info.widen && - is_extension_of_16(value, is_signed)) { + if (!staging && info.widen && is_extension_of_16(value, is_signed)) { bi_index lut = va_lut_index_16(value & 0xFFFF); - if (!bi_is_null(lut)) return lut; + if (!bi_is_null(lut)) + return lut; } /* Try demoting the constant to FP16 */ if (!staging && info.swizzle && info.size == VA_SIZE_32) { bi_index lut = va_demote_constant_fp16(value); - if (!bi_is_null(lut)) return lut; + if (!bi_is_null(lut)) + return lut; if (info.absneg) { bi_index lut = bi_neg(va_demote_constant_fp16(fui(-uif(value)))); - if (!bi_is_null(lut)) return lut; + if (!bi_is_null(lut)) + return lut; } } @@ 
-218,7 +228,8 @@ va_lower_constants(bi_context *ctx, bi_instr *I) value = bi_apply_swizzle(value, swz); } - bi_index cons = va_resolve_constant(&b, value, info, is_signed, staging); + bi_index cons = + va_resolve_constant(&b, value, info, is_signed, staging); cons.neg ^= I->src[s].neg; I->src[s] = cons; diff --git a/src/panfrost/bifrost/valhall/va_lower_isel.c b/src/panfrost/bifrost/valhall/va_lower_isel.c index ec244d66524..284289573f6 100644 --- a/src/panfrost/bifrost/valhall/va_lower_isel.c +++ b/src/panfrost/bifrost/valhall/va_lower_isel.c @@ -21,9 +21,9 @@ * SOFTWARE. */ +#include "bi_builder.h" #include "va_compiler.h" #include "valhall.h" -#include "bi_builder.h" static bi_instr * lower(bi_builder *b, bi_instr *I) @@ -38,45 +38,56 @@ lower(bi_builder *b, bi_instr *I) return bi_iadd_v4u8_to(b, I->dest[0], I->src[0], bi_zero(), false); case BI_OPCODE_ICMP_I32: - return bi_icmp_or_u32_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), I->cmpf, I->result_type); + return bi_icmp_or_u32_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), + I->cmpf, I->result_type); case BI_OPCODE_ICMP_V2I16: - return bi_icmp_or_v2u16_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), I->cmpf, I->result_type); + return bi_icmp_or_v2u16_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), + I->cmpf, I->result_type); case BI_OPCODE_ICMP_V4I8: - return bi_icmp_or_v4u8_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), I->cmpf, I->result_type); + return bi_icmp_or_v4u8_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), + I->cmpf, I->result_type); case BI_OPCODE_ICMP_U32: - return bi_icmp_or_u32_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), I->cmpf, I->result_type); + return bi_icmp_or_u32_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), + I->cmpf, I->result_type); case BI_OPCODE_ICMP_V2U16: - return bi_icmp_or_v2u16_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), I->cmpf, I->result_type); + return bi_icmp_or_v2u16_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), + I->cmpf, I->result_type); case BI_OPCODE_ICMP_V4U8: - return bi_icmp_or_v4u8_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), I->cmpf, I->result_type); + return bi_icmp_or_v4u8_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), + I->cmpf, I->result_type); case BI_OPCODE_ICMP_S32: - return bi_icmp_or_s32_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), I->cmpf, I->result_type); + return bi_icmp_or_s32_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), + I->cmpf, I->result_type); case BI_OPCODE_ICMP_V2S16: - return bi_icmp_or_v2s16_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), I->cmpf, I->result_type); + return bi_icmp_or_v2s16_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), + I->cmpf, I->result_type); case BI_OPCODE_ICMP_V4S8: - return bi_icmp_or_v4s8_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), I->cmpf, I->result_type); + return bi_icmp_or_v4s8_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), + I->cmpf, I->result_type); case BI_OPCODE_FCMP_F32: - return bi_fcmp_or_f32_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), I->cmpf, I->result_type); + return bi_fcmp_or_f32_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), + I->cmpf, I->result_type); case BI_OPCODE_FCMP_V2F16: - return bi_fcmp_or_v2f16_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), I->cmpf, I->result_type); + return bi_fcmp_or_v2f16_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(), + I->cmpf, I->result_type); /* Integer CSEL must have a signedness */ case BI_OPCODE_CSEL_I32: case BI_OPCODE_CSEL_V2I16: assert(I->cmpf == BI_CMPF_EQ || I->cmpf == BI_CMPF_NE); 
- I->op = (I->op == BI_OPCODE_CSEL_I32) ? BI_OPCODE_CSEL_U32 : - BI_OPCODE_CSEL_V2U16; + I->op = (I->op == BI_OPCODE_CSEL_I32) ? BI_OPCODE_CSEL_U32 + : BI_OPCODE_CSEL_V2U16; return NULL; /* Jump -> conditional branch with condition tied to true. */ @@ -117,7 +128,7 @@ lower(bi_builder *b, bi_instr *I) case BI_OPCODE_FADD_RSCALE_F32: return bi_fma_rscale_f32_to(b, I->dest[0], I->src[0], bi_imm_f32(1.0), - I->src[1], I->src[2], I->special); + I->src[1], I->src[2], I->special); default: return NULL; diff --git a/src/panfrost/bifrost/valhall/va_lower_split_64bit.c b/src/panfrost/bifrost/valhall/va_lower_split_64bit.c index 3c67f3e8a46..947138511e4 100644 --- a/src/panfrost/bifrost/valhall/va_lower_split_64bit.c +++ b/src/panfrost/bifrost/valhall/va_lower_split_64bit.c @@ -21,8 +21,8 @@ * SOFTWARE. */ -#include "va_compiler.h" #include "bi_builder.h" +#include "va_compiler.h" /* * Bifrost uses split 64-bit addresses, specified as two consecutive sources. @@ -38,8 +38,7 @@ lower_split_src(bi_context *ctx, bi_instr *I, unsigned s) bi_index offset_fau = I->src[s]; offset_fau.offset++; - if (I->src[s].type == BI_INDEX_FAU && - I->src[s].offset == 0 && + if (I->src[s].type == BI_INDEX_FAU && I->src[s].offset == 0 && bi_is_value_equiv(offset_fau, I->src[s + 1])) { return; } diff --git a/src/panfrost/bifrost/valhall/va_mark_last.c b/src/panfrost/bifrost/valhall/va_mark_last.c index d17c0ec65c8..5c3f1ec3b78 100644 --- a/src/panfrost/bifrost/valhall/va_mark_last.c +++ b/src/panfrost/bifrost/valhall/va_mark_last.c @@ -97,7 +97,7 @@ scoreboard_update(struct bi_scoreboard_state *st, const bi_instr *I) /* Unmark registers after they are waited on */ for (unsigned i = 0; i < VA_NUM_GENERAL_SLOTS; ++i) { if (waits_on_slot(I->flow, i)) - st->read[i] = 0; + st->read[i] = 0; } } @@ -111,8 +111,8 @@ va_analyze_scoreboard_reads(bi_context *ctx) bi_worklist_push_tail(&worklist, block); /* Reset analysis from previous pass */ - block->scoreboard_in = (struct bi_scoreboard_state){ 0 }; - block->scoreboard_out = (struct bi_scoreboard_state){ 0 }; + block->scoreboard_in = (struct bi_scoreboard_state){0}; + block->scoreboard_out = (struct bi_scoreboard_state){0}; } /* Perform forward data flow analysis to calculate dependencies */ diff --git a/src/panfrost/bifrost/valhall/va_merge_flow.c b/src/panfrost/bifrost/valhall/va_merge_flow.c index 64f3c38c9ae..05de5109260 100644 --- a/src/panfrost/bifrost/valhall/va_merge_flow.c +++ b/src/panfrost/bifrost/valhall/va_merge_flow.c @@ -21,9 +21,9 @@ * SOFTWARE. */ +#include "bi_builder.h" #include "va_compiler.h" #include "valhall_enums.h" -#include "bi_builder.h" /* * Merge NOPs with flow control with nearby instructions to eliminate the NOPs, @@ -80,8 +80,10 @@ merge_end_reconverge(bi_block *block) bi_instr *last = list_last_entry(&block->instructions, bi_instr, link); bi_instr *penult = bi_prev_op(last); - if (last->op != BI_OPCODE_NOP) return; - if (last->flow != VA_FLOW_RECONVERGE && last->flow != VA_FLOW_END) return; + if (last->op != BI_OPCODE_NOP) + return; + if (last->flow != VA_FLOW_RECONVERGE && last->flow != VA_FLOW_END) + return; /* End implies all other flow control except for waiting on barriers (slot * #7, with VA_FLOW_WAIT), so remove blocking flow control. 
@@ -99,7 +101,8 @@ merge_end_reconverge(bi_block *block) } /* If there is blocking flow control, we can't merge */ - if (penult->flow != VA_FLOW_NONE) return; + if (penult->flow != VA_FLOW_NONE) + return; /* Else, merge */ penult->flow = last->flow; @@ -133,8 +136,8 @@ merge_waits(bi_block *block) bi_instr *last_free = NULL; bi_foreach_instr_in_block_safe(block, I) { - if (last_free != NULL && - I->op == BI_OPCODE_NOP && va_flow_is_wait_or_none(I->flow)) { + if (last_free != NULL && I->op == BI_OPCODE_NOP && + va_flow_is_wait_or_none(I->flow)) { /* Merge waits with compatible instructions */ last_free->flow = union_waits(last_free->flow, I->flow); @@ -212,8 +215,10 @@ va_merge_flow(bi_context *ctx) { bi_foreach_block(ctx, block) { /* If there are less than 2 instructions, there's nothing to merge */ - if (list_is_empty(&block->instructions)) continue; - if (list_is_singular(&block->instructions)) continue; + if (list_is_empty(&block->instructions)) + continue; + if (list_is_singular(&block->instructions)) + continue; merge_end_reconverge(block); merge_waits(block); diff --git a/src/panfrost/bifrost/valhall/va_optimize.c b/src/panfrost/bifrost/valhall/va_optimize.c index a50c4244952..46202e4d52d 100644 --- a/src/panfrost/bifrost/valhall/va_optimize.c +++ b/src/panfrost/bifrost/valhall/va_optimize.c @@ -29,15 +29,21 @@ static enum bi_opcode va_op_add_imm(enum bi_opcode op) { switch (op) { - case BI_OPCODE_FADD_F32: return BI_OPCODE_FADD_IMM_F32; - case BI_OPCODE_FADD_V2F16: return BI_OPCODE_FADD_IMM_V2F16; + case BI_OPCODE_FADD_F32: + return BI_OPCODE_FADD_IMM_F32; + case BI_OPCODE_FADD_V2F16: + return BI_OPCODE_FADD_IMM_V2F16; case BI_OPCODE_IADD_S32: - case BI_OPCODE_IADD_U32: return BI_OPCODE_IADD_IMM_I32; + case BI_OPCODE_IADD_U32: + return BI_OPCODE_IADD_IMM_I32; case BI_OPCODE_IADD_V2S16: - case BI_OPCODE_IADD_V2U16: return BI_OPCODE_IADD_IMM_V2I16; + case BI_OPCODE_IADD_V2U16: + return BI_OPCODE_IADD_IMM_V2I16; case BI_OPCODE_IADD_V4S8: - case BI_OPCODE_IADD_V4U8: return BI_OPCODE_IADD_IMM_V4I8; - default: return 0; + case BI_OPCODE_IADD_V4U8: + return BI_OPCODE_IADD_IMM_V4I8; + default: + return 0; } } @@ -46,8 +52,8 @@ va_is_add_imm(bi_instr *I, unsigned s) { assert(s < I->nr_srcs); - return I->src[s].swizzle == BI_SWIZZLE_H01 && - !I->src[s].abs && !I->src[s].neg && !I->clamp && !I->round; + return I->src[s].swizzle == BI_SWIZZLE_H01 && !I->src[s].abs && + !I->src[s].neg && !I->clamp && !I->round; } static unsigned @@ -83,11 +89,14 @@ va_fuse_add_imm(bi_instr *I) } enum bi_opcode op = va_op_add_imm(I->op); - if (!op) return; + if (!op) + return; unsigned s = va_choose_imm(I); - if (s > 1) return; - if (!va_is_add_imm(I, 1 - s)) return; + if (s > 1) + return; + if (!va_is_add_imm(I, 1 - s)) + return; I->op = op; I->index = bi_apply_swizzle(I->src[s].value, I->src[s].swizzle); diff --git a/src/panfrost/bifrost/valhall/va_pack.c b/src/panfrost/bifrost/valhall/va_pack.c index 33e7dcdf079..f6381c6f5f0 100644 --- a/src/panfrost/bifrost/valhall/va_pack.c +++ b/src/panfrost/bifrost/valhall/va_pack.c @@ -21,10 +21,10 @@ * SOFTWARE. */ +#include "bi_builder.h" #include "va_compiler.h" #include "valhall.h" #include "valhall_enums.h" -#include "bi_builder.h" /* This file contains the final passes of the compiler. Running after * scheduling and RA, the IR is now finalized, so we need to emit it to actual @@ -36,7 +36,7 @@ * Prints the (first) failing instruction to aid debugging. */ NORETURN static void PRINTFLIKE(2, 3) -invalid_instruction(const bi_instr *I, const char *cause, ...) 
+ invalid_instruction(const bi_instr *I, const char *cause, ...) { fputs("\nInvalid ", stderr); @@ -56,8 +56,9 @@ invalid_instruction(const bi_instr *I, const char *cause, ...) * Like assert, but prints the instruction if the assertion fails to aid * debugging invalid inputs to the packing module. */ -#define pack_assert(I, cond) \ - if (!(cond)) invalid_instruction(I, "invariant " #cond); +#define pack_assert(I, cond) \ + if (!(cond)) \ + invalid_instruction(I, "invariant " #cond); /* * Validate that two adjacent 32-bit sources form an aligned 64-bit register @@ -95,14 +96,20 @@ static unsigned va_pack_fau_special(const bi_instr *I, enum bir_fau fau) { switch (fau) { - case BIR_FAU_ATEST_PARAM: return VA_FAU_SPECIAL_PAGE_0_ATEST_DATUM; - case BIR_FAU_TLS_PTR: return VA_FAU_SPECIAL_PAGE_1_THREAD_LOCAL_POINTER; - case BIR_FAU_WLS_PTR: return VA_FAU_SPECIAL_PAGE_1_WORKGROUP_LOCAL_POINTER; - case BIR_FAU_LANE_ID: return VA_FAU_SPECIAL_PAGE_3_LANE_ID; - case BIR_FAU_PROGRAM_COUNTER: return VA_FAU_SPECIAL_PAGE_3_PROGRAM_COUNTER; - case BIR_FAU_SAMPLE_POS_ARRAY:return VA_FAU_SPECIAL_PAGE_0_SAMPLE; + case BIR_FAU_ATEST_PARAM: + return VA_FAU_SPECIAL_PAGE_0_ATEST_DATUM; + case BIR_FAU_TLS_PTR: + return VA_FAU_SPECIAL_PAGE_1_THREAD_LOCAL_POINTER; + case BIR_FAU_WLS_PTR: + return VA_FAU_SPECIAL_PAGE_1_WORKGROUP_LOCAL_POINTER; + case BIR_FAU_LANE_ID: + return VA_FAU_SPECIAL_PAGE_3_LANE_ID; + case BIR_FAU_PROGRAM_COUNTER: + return VA_FAU_SPECIAL_PAGE_3_PROGRAM_COUNTER; + case BIR_FAU_SAMPLE_POS_ARRAY: + return VA_FAU_SPECIAL_PAGE_0_SAMPLE; - case BIR_FAU_BLEND_0...(BIR_FAU_BLEND_0 + 7): + case BIR_FAU_BLEND_0 ...(BIR_FAU_BLEND_0 + 7): return VA_FAU_SPECIAL_PAGE_0_BLEND_DESCRIPTOR_0 + (fau - BIR_FAU_BLEND_0); default: @@ -136,7 +143,8 @@ va_pack_src(const bi_instr *I, unsigned s) if (idx.type == BI_INDEX_REGISTER) { unsigned value = va_pack_reg(I, idx); - if (idx.discard) value |= (1 << 6); + if (idx.discard) + value |= (1 << 6); return value; } else if (idx.type == BI_INDEX_FAU) { pack_assert(I, idx.offset <= 1); @@ -150,10 +158,14 @@ static unsigned va_pack_wrmask(const bi_instr *I) { switch (I->dest[0].swizzle) { - case BI_SWIZZLE_H00: return 0x1; - case BI_SWIZZLE_H11: return 0x2; - case BI_SWIZZLE_H01: return 0x3; - default: invalid_instruction(I, "write mask"); + case BI_SWIZZLE_H00: + return 0x1; + case BI_SWIZZLE_H11: + return 0x2; + case BI_SWIZZLE_H01: + return 0x3; + default: + invalid_instruction(I, "write mask"); } } @@ -161,17 +173,27 @@ static enum va_atomic_operation va_pack_atom_opc(const bi_instr *I) { switch (I->atom_opc) { - case BI_ATOM_OPC_AADD: return VA_ATOMIC_OPERATION_AADD; - case BI_ATOM_OPC_ASMIN: return VA_ATOMIC_OPERATION_ASMIN; - case BI_ATOM_OPC_ASMAX: return VA_ATOMIC_OPERATION_ASMAX; - case BI_ATOM_OPC_AUMIN: return VA_ATOMIC_OPERATION_AUMIN; - case BI_ATOM_OPC_AUMAX: return VA_ATOMIC_OPERATION_AUMAX; - case BI_ATOM_OPC_AAND: return VA_ATOMIC_OPERATION_AAND; - case BI_ATOM_OPC_AOR: return VA_ATOMIC_OPERATION_AOR; - case BI_ATOM_OPC_AXOR: return VA_ATOMIC_OPERATION_AXOR; + case BI_ATOM_OPC_AADD: + return VA_ATOMIC_OPERATION_AADD; + case BI_ATOM_OPC_ASMIN: + return VA_ATOMIC_OPERATION_ASMIN; + case BI_ATOM_OPC_ASMAX: + return VA_ATOMIC_OPERATION_ASMAX; + case BI_ATOM_OPC_AUMIN: + return VA_ATOMIC_OPERATION_AUMIN; + case BI_ATOM_OPC_AUMAX: + return VA_ATOMIC_OPERATION_AUMAX; + case BI_ATOM_OPC_AAND: + return VA_ATOMIC_OPERATION_AAND; + case BI_ATOM_OPC_AOR: + return VA_ATOMIC_OPERATION_AOR; + case BI_ATOM_OPC_AXOR: + return VA_ATOMIC_OPERATION_AXOR; case 
BI_ATOM_OPC_ACMPXCHG: - case BI_ATOM_OPC_AXCHG: return VA_ATOMIC_OPERATION_AXCHG; - default: invalid_instruction(I, "atomic opcode"); + case BI_ATOM_OPC_AXCHG: + return VA_ATOMIC_OPERATION_AXCHG; + default: + invalid_instruction(I, "atomic opcode"); } } @@ -179,12 +201,18 @@ static enum va_atomic_operation_with_1 va_pack_atom_opc_1(const bi_instr *I) { switch (I->atom_opc) { - case BI_ATOM_OPC_AINC: return VA_ATOMIC_OPERATION_WITH_1_AINC; - case BI_ATOM_OPC_ADEC: return VA_ATOMIC_OPERATION_WITH_1_ADEC; - case BI_ATOM_OPC_AUMAX1: return VA_ATOMIC_OPERATION_WITH_1_AUMAX1; - case BI_ATOM_OPC_ASMAX1: return VA_ATOMIC_OPERATION_WITH_1_ASMAX1; - case BI_ATOM_OPC_AOR1: return VA_ATOMIC_OPERATION_WITH_1_AOR1; - default: invalid_instruction(I, "atomic opcode with implicit 1"); + case BI_ATOM_OPC_AINC: + return VA_ATOMIC_OPERATION_WITH_1_AINC; + case BI_ATOM_OPC_ADEC: + return VA_ATOMIC_OPERATION_WITH_1_ADEC; + case BI_ATOM_OPC_AUMAX1: + return VA_ATOMIC_OPERATION_WITH_1_AUMAX1; + case BI_ATOM_OPC_ASMAX1: + return VA_ATOMIC_OPERATION_WITH_1_ASMAX1; + case BI_ATOM_OPC_AOR1: + return VA_ATOMIC_OPERATION_WITH_1_AOR1; + default: + invalid_instruction(I, "atomic opcode with implicit 1"); } } @@ -199,10 +227,14 @@ static enum va_widen va_pack_widen_f32(const bi_instr *I, enum bi_swizzle swz) { switch (swz) { - case BI_SWIZZLE_H01: return VA_WIDEN_NONE; - case BI_SWIZZLE_H00: return VA_WIDEN_H0; - case BI_SWIZZLE_H11: return VA_WIDEN_H1; - default: invalid_instruction(I, "widen"); + case BI_SWIZZLE_H01: + return VA_WIDEN_NONE; + case BI_SWIZZLE_H00: + return VA_WIDEN_H0; + case BI_SWIZZLE_H11: + return VA_WIDEN_H1; + default: + invalid_instruction(I, "widen"); } } @@ -210,11 +242,16 @@ static enum va_swizzles_16_bit va_pack_swizzle_f16(const bi_instr *I, enum bi_swizzle swz) { switch (swz) { - case BI_SWIZZLE_H00: return VA_SWIZZLES_16_BIT_H00; - case BI_SWIZZLE_H10: return VA_SWIZZLES_16_BIT_H10; - case BI_SWIZZLE_H01: return VA_SWIZZLES_16_BIT_H01; - case BI_SWIZZLE_H11: return VA_SWIZZLES_16_BIT_H11; - default: invalid_instruction(I, "16-bit swizzle"); + case BI_SWIZZLE_H00: + return VA_SWIZZLES_16_BIT_H00; + case BI_SWIZZLE_H10: + return VA_SWIZZLES_16_BIT_H10; + case BI_SWIZZLE_H01: + return VA_SWIZZLES_16_BIT_H01; + case BI_SWIZZLE_H11: + return VA_SWIZZLES_16_BIT_H11; + default: + invalid_instruction(I, "16-bit swizzle"); } } @@ -223,37 +260,62 @@ va_pack_widen(const bi_instr *I, enum bi_swizzle swz, enum va_size size) { if (size == VA_SIZE_8) { switch (swz) { - case BI_SWIZZLE_H01: return VA_SWIZZLES_8_BIT_B0123; - case BI_SWIZZLE_H00: return VA_SWIZZLES_8_BIT_B0101; - case BI_SWIZZLE_H11: return VA_SWIZZLES_8_BIT_B2323; - case BI_SWIZZLE_B0000: return VA_SWIZZLES_8_BIT_B0000; - case BI_SWIZZLE_B1111: return VA_SWIZZLES_8_BIT_B1111; - case BI_SWIZZLE_B2222: return VA_SWIZZLES_8_BIT_B2222; - case BI_SWIZZLE_B3333: return VA_SWIZZLES_8_BIT_B3333; - default: invalid_instruction(I, "8-bit widen"); + case BI_SWIZZLE_H01: + return VA_SWIZZLES_8_BIT_B0123; + case BI_SWIZZLE_H00: + return VA_SWIZZLES_8_BIT_B0101; + case BI_SWIZZLE_H11: + return VA_SWIZZLES_8_BIT_B2323; + case BI_SWIZZLE_B0000: + return VA_SWIZZLES_8_BIT_B0000; + case BI_SWIZZLE_B1111: + return VA_SWIZZLES_8_BIT_B1111; + case BI_SWIZZLE_B2222: + return VA_SWIZZLES_8_BIT_B2222; + case BI_SWIZZLE_B3333: + return VA_SWIZZLES_8_BIT_B3333; + default: + invalid_instruction(I, "8-bit widen"); } } else if (size == VA_SIZE_16) { switch (swz) { - case BI_SWIZZLE_H00: return VA_SWIZZLES_16_BIT_H00; - case BI_SWIZZLE_H10: return VA_SWIZZLES_16_BIT_H10; 
- case BI_SWIZZLE_H01: return VA_SWIZZLES_16_BIT_H01; - case BI_SWIZZLE_H11: return VA_SWIZZLES_16_BIT_H11; - case BI_SWIZZLE_B0000: return VA_SWIZZLES_16_BIT_B00; - case BI_SWIZZLE_B1111: return VA_SWIZZLES_16_BIT_B11; - case BI_SWIZZLE_B2222: return VA_SWIZZLES_16_BIT_B22; - case BI_SWIZZLE_B3333: return VA_SWIZZLES_16_BIT_B33; - default: invalid_instruction(I, "16-bit widen"); + case BI_SWIZZLE_H00: + return VA_SWIZZLES_16_BIT_H00; + case BI_SWIZZLE_H10: + return VA_SWIZZLES_16_BIT_H10; + case BI_SWIZZLE_H01: + return VA_SWIZZLES_16_BIT_H01; + case BI_SWIZZLE_H11: + return VA_SWIZZLES_16_BIT_H11; + case BI_SWIZZLE_B0000: + return VA_SWIZZLES_16_BIT_B00; + case BI_SWIZZLE_B1111: + return VA_SWIZZLES_16_BIT_B11; + case BI_SWIZZLE_B2222: + return VA_SWIZZLES_16_BIT_B22; + case BI_SWIZZLE_B3333: + return VA_SWIZZLES_16_BIT_B33; + default: + invalid_instruction(I, "16-bit widen"); } } else if (size == VA_SIZE_32) { switch (swz) { - case BI_SWIZZLE_H01: return VA_SWIZZLES_32_BIT_NONE; - case BI_SWIZZLE_H00: return VA_SWIZZLES_32_BIT_H0; - case BI_SWIZZLE_H11: return VA_SWIZZLES_32_BIT_H1; - case BI_SWIZZLE_B0000: return VA_SWIZZLES_32_BIT_B0; - case BI_SWIZZLE_B1111: return VA_SWIZZLES_32_BIT_B1; - case BI_SWIZZLE_B2222: return VA_SWIZZLES_32_BIT_B2; - case BI_SWIZZLE_B3333: return VA_SWIZZLES_32_BIT_B3; - default: invalid_instruction(I, "32-bit widen"); + case BI_SWIZZLE_H01: + return VA_SWIZZLES_32_BIT_NONE; + case BI_SWIZZLE_H00: + return VA_SWIZZLES_32_BIT_H0; + case BI_SWIZZLE_H11: + return VA_SWIZZLES_32_BIT_H1; + case BI_SWIZZLE_B0000: + return VA_SWIZZLES_32_BIT_B0; + case BI_SWIZZLE_B1111: + return VA_SWIZZLES_32_BIT_B1; + case BI_SWIZZLE_B2222: + return VA_SWIZZLES_32_BIT_B2; + case BI_SWIZZLE_B3333: + return VA_SWIZZLES_32_BIT_B3; + default: + invalid_instruction(I, "32-bit widen"); } } else { invalid_instruction(I, "type size for widen"); @@ -264,14 +326,22 @@ static enum va_half_swizzles_8_bit va_pack_halfswizzle(const bi_instr *I, enum bi_swizzle swz) { switch (swz) { - case BI_SWIZZLE_B0000: return VA_HALF_SWIZZLES_8_BIT_B00; - case BI_SWIZZLE_B1111: return VA_HALF_SWIZZLES_8_BIT_B11; - case BI_SWIZZLE_B2222: return VA_HALF_SWIZZLES_8_BIT_B22; - case BI_SWIZZLE_B3333: return VA_HALF_SWIZZLES_8_BIT_B33; - case BI_SWIZZLE_B0011: return VA_HALF_SWIZZLES_8_BIT_B01; - case BI_SWIZZLE_B2233: return VA_HALF_SWIZZLES_8_BIT_B23; - case BI_SWIZZLE_B0022: return VA_HALF_SWIZZLES_8_BIT_B02; - default: invalid_instruction(I, "v2u8 swizzle"); + case BI_SWIZZLE_B0000: + return VA_HALF_SWIZZLES_8_BIT_B00; + case BI_SWIZZLE_B1111: + return VA_HALF_SWIZZLES_8_BIT_B11; + case BI_SWIZZLE_B2222: + return VA_HALF_SWIZZLES_8_BIT_B22; + case BI_SWIZZLE_B3333: + return VA_HALF_SWIZZLES_8_BIT_B33; + case BI_SWIZZLE_B0011: + return VA_HALF_SWIZZLES_8_BIT_B01; + case BI_SWIZZLE_B2233: + return VA_HALF_SWIZZLES_8_BIT_B23; + case BI_SWIZZLE_B0022: + return VA_HALF_SWIZZLES_8_BIT_B02; + default: + invalid_instruction(I, "v2u8 swizzle"); } } @@ -279,12 +349,18 @@ static enum va_lanes_8_bit va_pack_shift_lanes(const bi_instr *I, enum bi_swizzle swz) { switch (swz) { - case BI_SWIZZLE_H01: return VA_LANES_8_BIT_B02; - case BI_SWIZZLE_B0000: return VA_LANES_8_BIT_B00; - case BI_SWIZZLE_B1111: return VA_LANES_8_BIT_B11; - case BI_SWIZZLE_B2222: return VA_LANES_8_BIT_B22; - case BI_SWIZZLE_B3333: return VA_LANES_8_BIT_B33; - default: invalid_instruction(I, "lane shift"); + case BI_SWIZZLE_H01: + return VA_LANES_8_BIT_B02; + case BI_SWIZZLE_B0000: + return VA_LANES_8_BIT_B00; + case BI_SWIZZLE_B1111: + return 
VA_LANES_8_BIT_B11; + case BI_SWIZZLE_B2222: + return VA_LANES_8_BIT_B22; + case BI_SWIZZLE_B3333: + return VA_LANES_8_BIT_B33; + default: + invalid_instruction(I, "lane shift"); } } @@ -292,10 +368,14 @@ static enum va_combine va_pack_combine(const bi_instr *I, enum bi_swizzle swz) { switch (swz) { - case BI_SWIZZLE_H01: return VA_COMBINE_NONE; - case BI_SWIZZLE_H00: return VA_COMBINE_H0; - case BI_SWIZZLE_H11: return VA_COMBINE_H1; - default: invalid_instruction(I, "branch lane"); + case BI_SWIZZLE_H01: + return VA_COMBINE_NONE; + case BI_SWIZZLE_H00: + return VA_COMBINE_H0; + case BI_SWIZZLE_H11: + return VA_COMBINE_H1; + default: + invalid_instruction(I, "branch lane"); } } @@ -303,10 +383,14 @@ static enum va_source_format va_pack_source_format(const bi_instr *I) { switch (I->source_format) { - case BI_SOURCE_FORMAT_FLAT32: return VA_SOURCE_FORMAT_SRC_FLAT32; - case BI_SOURCE_FORMAT_FLAT16: return VA_SOURCE_FORMAT_SRC_FLAT16; - case BI_SOURCE_FORMAT_F32: return VA_SOURCE_FORMAT_SRC_F32; - case BI_SOURCE_FORMAT_F16: return VA_SOURCE_FORMAT_SRC_F16; + case BI_SOURCE_FORMAT_FLAT32: + return VA_SOURCE_FORMAT_SRC_FLAT32; + case BI_SOURCE_FORMAT_FLAT16: + return VA_SOURCE_FORMAT_SRC_FLAT16; + case BI_SOURCE_FORMAT_F32: + return VA_SOURCE_FORMAT_SRC_F32; + case BI_SOURCE_FORMAT_F16: + return VA_SOURCE_FORMAT_SRC_F16; } invalid_instruction(I, "source format"); @@ -316,9 +400,12 @@ static uint64_t va_pack_rhadd(const bi_instr *I) { switch (I->round) { - case BI_ROUND_RTN: return 0; /* hadd */ - case BI_ROUND_RTP: return BITFIELD_BIT(30); /* rhadd */ - default: unreachable("Invalid round for HADD"); + case BI_ROUND_RTN: + return 0; /* hadd */ + case BI_ROUND_RTP: + return BITFIELD_BIT(30); /* rhadd */ + default: + unreachable("Invalid round for HADD"); } } @@ -334,15 +421,17 @@ va_pack_alu(const bi_instr *I) case BI_OPCODE_FREXPE_V2F16: case BI_OPCODE_FREXPM_F32: case BI_OPCODE_FREXPM_V2F16: - if (I->sqrt) hex |= 1ull << 24; - if (I->log) hex |= 1ull << 25; + if (I->sqrt) + hex |= 1ull << 24; + if (I->log) + hex |= 1ull << 25; break; /* Add mux type */ case BI_OPCODE_MUX_I32: case BI_OPCODE_MUX_V2I16: case BI_OPCODE_MUX_V4I8: - hex |= (uint64_t) I->mux << 32; + hex |= (uint64_t)I->mux << 32; break; /* Add .eq flag */ @@ -350,12 +439,13 @@ va_pack_alu(const bi_instr *I) case BI_OPCODE_BRANCHZI: pack_assert(I, I->cmpf == BI_CMPF_EQ || I->cmpf == BI_CMPF_NE); - if (I->cmpf == BI_CMPF_EQ) hex |= (1ull << 36); + if (I->cmpf == BI_CMPF_EQ) + hex |= (1ull << 36); if (I->op == BI_OPCODE_BRANCHZI) hex |= (0x1ull << 40); /* Absolute */ else - hex |= ((uint64_t) I->branch_offset & BITFIELD_MASK(27)) << 8; + hex |= ((uint64_t)I->branch_offset & BITFIELD_MASK(27)) << 8; break; @@ -369,7 +459,7 @@ va_pack_alu(const bi_instr *I) case BI_OPCODE_RSHIFT_XOR_I32: case BI_OPCODE_RSHIFT_XOR_V2I16: case BI_OPCODE_RSHIFT_XOR_V4I8: - hex |= (uint64_t) I->arithmetic << 34; + hex |= (uint64_t)I->arithmetic << 34; break; case BI_OPCODE_LEA_BUF_IMM: @@ -378,8 +468,8 @@ va_pack_alu(const bi_instr *I) break; case BI_OPCODE_LEA_ATTR_IMM: - hex |= ((uint64_t) I->table) << 16; - hex |= ((uint64_t) I->attribute_index) << 20; + hex |= ((uint64_t)I->table) << 16; + hex |= ((uint64_t)I->attribute_index) << 20; break; case BI_OPCODE_IADD_IMM_I32: @@ -387,13 +477,13 @@ va_pack_alu(const bi_instr *I) case BI_OPCODE_IADD_IMM_V4I8: case BI_OPCODE_FADD_IMM_F32: case BI_OPCODE_FADD_IMM_V2F16: - hex |= ((uint64_t) I->index) << 8; + hex |= ((uint64_t)I->index) << 8; break; case BI_OPCODE_CLPER_I32: - hex |= ((uint64_t) I->inactive_result) 
<< 22; - hex |= ((uint64_t) I->lane_op) << 32; - hex |= ((uint64_t) I->subgroup) << 36; + hex |= ((uint64_t)I->inactive_result) << 22; + hex |= ((uint64_t)I->lane_op) << 32; + hex |= ((uint64_t)I->subgroup) << 36; break; case BI_OPCODE_LD_VAR: @@ -406,35 +496,37 @@ va_pack_alu(const bi_instr *I) case BI_OPCODE_LD_VAR_BUF_IMM_F32: case BI_OPCODE_LD_VAR_SPECIAL: if (I->op == BI_OPCODE_LD_VAR_SPECIAL) - hex |= ((uint64_t) I->varying_name) << 12; /* instead of index */ + hex |= ((uint64_t)I->varying_name) << 12; /* instead of index */ else if (I->op == BI_OPCODE_LD_VAR_BUF_IMM_F16 || I->op == BI_OPCODE_LD_VAR_BUF_IMM_F32) { - hex |= ((uint64_t) I->index) << 16; + hex |= ((uint64_t)I->index) << 16; } else if (I->op == BI_OPCODE_LD_VAR_IMM || I->op == BI_OPCODE_LD_VAR_FLAT_IMM) { - hex |= ((uint64_t) I->table) << 8; - hex |= ((uint64_t) I->index) << 12; + hex |= ((uint64_t)I->table) << 8; + hex |= ((uint64_t)I->index) << 12; } - hex |= ((uint64_t) va_pack_source_format(I)) << 24; - hex |= ((uint64_t) I->update) << 36; - hex |= ((uint64_t) I->sample) << 38; + hex |= ((uint64_t)va_pack_source_format(I)) << 24; + hex |= ((uint64_t)I->update) << 36; + hex |= ((uint64_t)I->sample) << 38; break; case BI_OPCODE_LD_ATTR_IMM: - hex |= ((uint64_t) I->table) << 16; - hex |= ((uint64_t) I->attribute_index) << 20; + hex |= ((uint64_t)I->table) << 16; + hex |= ((uint64_t)I->attribute_index) << 20; break; case BI_OPCODE_LD_TEX_IMM: case BI_OPCODE_LEA_TEX_IMM: - hex |= ((uint64_t) I->table) << 16; - hex |= ((uint64_t) I->texture_index) << 20; + hex |= ((uint64_t)I->table) << 16; + hex |= ((uint64_t)I->texture_index) << 20; break; case BI_OPCODE_ZS_EMIT: - if (I->stencil) hex |= (1 << 24); - if (I->z) hex |= (1 << 25); + if (I->stencil) + hex |= (1 << 24); + if (I->z) + hex |= (1 << 25); break; default: @@ -444,14 +536,14 @@ va_pack_alu(const bi_instr *I) /* FMA_RSCALE.f32 special modes treated as extra opcodes */ if (I->op == BI_OPCODE_FMA_RSCALE_F32) { pack_assert(I, I->special < 4); - hex |= ((uint64_t) I->special) << 48; + hex |= ((uint64_t)I->special) << 48; } /* Add the normal destination or a placeholder. Staging destinations are * added elsewhere, as they require special handling for control fields. 
*/ if (info.has_dest && info.nr_staging_dests == 0) { - hex |= (uint64_t) va_pack_dest(I) << 40; + hex |= (uint64_t)va_pack_dest(I) << 40; } else if (info.nr_staging_dests == 0 && info.nr_staging_srcs == 0) { pack_assert(I, I->nr_dests == 0); hex |= 0xC0ull << 40; /* Placeholder */ @@ -469,19 +561,24 @@ va_pack_alu(const bi_instr *I) enum va_size size = src_info.size; bi_index src = I->src[logical_i + src_offset]; - hex |= (uint64_t) va_pack_src(I, logical_i + src_offset) << (8 * i); + hex |= (uint64_t)va_pack_src(I, logical_i + src_offset) << (8 * i); if (src_info.notted) { - if (src.neg) hex |= (1ull << 35); + if (src.neg) + hex |= (1ull << 35); } else if (src_info.absneg) { unsigned neg_offs = 32 + 2 + ((2 - i) * 2); unsigned abs_offs = 33 + 2 + ((2 - i) * 2); - if (src.neg) hex |= 1ull << neg_offs; - if (src.abs) hex |= 1ull << abs_offs; + if (src.neg) + hex |= 1ull << neg_offs; + if (src.abs) + hex |= 1ull << abs_offs; } else { - if (src.neg) invalid_instruction(I, "negate"); - if (src.abs) invalid_instruction(I, "absolute value"); + if (src.neg) + invalid_instruction(I, "negate"); + if (src.abs) + invalid_instruction(I, "absolute value"); } if (src_info.swizzle) { @@ -489,50 +586,56 @@ va_pack_alu(const bi_instr *I) unsigned S = src.swizzle; pack_assert(I, size == VA_SIZE_16 || size == VA_SIZE_32); - uint64_t v = (size == VA_SIZE_32 ? va_pack_widen_f32(I, S) : va_pack_swizzle_f16(I, S)); + uint64_t v = (size == VA_SIZE_32 ? va_pack_widen_f32(I, S) + : va_pack_swizzle_f16(I, S)); hex |= v << offs; } else if (src_info.widen) { unsigned offs = (i == 1) ? 26 : 36; - hex |= (uint64_t) va_pack_widen(I, src.swizzle, src_info.size) << offs; + hex |= (uint64_t)va_pack_widen(I, src.swizzle, src_info.size) << offs; } else if (src_info.lane) { - unsigned offs = (I->op == BI_OPCODE_MKVEC_V2I8) ? - ((i == 0) ? 38 : 36) : - 28; + unsigned offs = + (I->op == BI_OPCODE_MKVEC_V2I8) ? ((i == 0) ? 38 : 36) : 28; if (src_info.size == VA_SIZE_16) { hex |= (src.swizzle == BI_SWIZZLE_H11 ? 
1 : 0) << offs; } else if (I->op == BI_OPCODE_BRANCHZ_I16) { - hex |= ((uint64_t) va_pack_combine(I, src.swizzle) << 37); + hex |= ((uint64_t)va_pack_combine(I, src.swizzle) << 37); } else { pack_assert(I, src_info.size == VA_SIZE_8); unsigned comp = src.swizzle - BI_SWIZZLE_B0000; pack_assert(I, comp < 4); - hex |= (uint64_t) comp << offs; + hex |= (uint64_t)comp << offs; } } else if (src_info.lanes) { pack_assert(I, src_info.size == VA_SIZE_8); pack_assert(I, i == 1); - hex |= (uint64_t) va_pack_shift_lanes(I, src.swizzle) << 26; + hex |= (uint64_t)va_pack_shift_lanes(I, src.swizzle) << 26; } else if (src_info.combine) { /* Treat as swizzle, subgroup ops not yet supported */ pack_assert(I, src_info.size == VA_SIZE_32); pack_assert(I, i == 0); - hex |= (uint64_t) va_pack_widen_f32(I, src.swizzle) << 37; + hex |= (uint64_t)va_pack_widen_f32(I, src.swizzle) << 37; } else if (src_info.halfswizzle) { pack_assert(I, src_info.size == VA_SIZE_8); pack_assert(I, i == 0); - hex |= (uint64_t) va_pack_halfswizzle(I, src.swizzle) << 36; + hex |= (uint64_t)va_pack_halfswizzle(I, src.swizzle) << 36; } else if (src.swizzle != BI_SWIZZLE_H01) { invalid_instruction(I, "swizzle"); } } - if (info.saturate) hex |= (uint64_t) I->saturate << 30; - if (info.rhadd) hex |= va_pack_rhadd(I); - if (info.clamp) hex |= (uint64_t) I->clamp << 32; - if (info.round_mode) hex |= (uint64_t) I->round << 30; - if (info.condition) hex |= (uint64_t) I->cmpf << 32; - if (info.result_type) hex |= (uint64_t) I->result_type << 30; + if (info.saturate) + hex |= (uint64_t)I->saturate << 30; + if (info.rhadd) + hex |= va_pack_rhadd(I); + if (info.clamp) + hex |= (uint64_t)I->clamp << 32; + if (info.round_mode) + hex |= (uint64_t)I->round << 30; + if (info.condition) + hex |= (uint64_t)I->cmpf << 32; + if (info.result_type) + hex |= (uint64_t)I->result_type << 30; return hex; } @@ -541,37 +644,35 @@ static uint64_t va_pack_byte_offset(const bi_instr *I) { int16_t offset = I->byte_offset; - if (offset != I->byte_offset) invalid_instruction(I, "byte offset"); + if (offset != I->byte_offset) + invalid_instruction(I, "byte offset"); uint16_t offset_as_u16 = offset; - return ((uint64_t) offset_as_u16) << 8; + return ((uint64_t)offset_as_u16) << 8; } static uint64_t va_pack_byte_offset_8(const bi_instr *I) { uint8_t offset = I->byte_offset; - if (offset != I->byte_offset) invalid_instruction(I, "byte offset"); + if (offset != I->byte_offset) + invalid_instruction(I, "byte offset"); - return ((uint64_t) offset) << 8; + return ((uint64_t)offset) << 8; } static uint64_t va_pack_load(const bi_instr *I, bool buffer_descriptor) { const uint8_t load_lane_identity[8] = { - VA_LOAD_LANE_8_BIT_B0, - VA_LOAD_LANE_16_BIT_H0, - VA_LOAD_LANE_24_BIT_IDENTITY, - VA_LOAD_LANE_32_BIT_W0, - VA_LOAD_LANE_48_BIT_IDENTITY, - VA_LOAD_LANE_64_BIT_IDENTITY, - VA_LOAD_LANE_96_BIT_IDENTITY, - VA_LOAD_LANE_128_BIT_IDENTITY, + VA_LOAD_LANE_8_BIT_B0, VA_LOAD_LANE_16_BIT_H0, + VA_LOAD_LANE_24_BIT_IDENTITY, VA_LOAD_LANE_32_BIT_W0, + VA_LOAD_LANE_48_BIT_IDENTITY, VA_LOAD_LANE_64_BIT_IDENTITY, + VA_LOAD_LANE_96_BIT_IDENTITY, VA_LOAD_LANE_128_BIT_IDENTITY, }; unsigned memory_size = (valhall_opcodes[I->op].exact >> 27) & 0x7; - uint64_t hex = (uint64_t) load_lane_identity[memory_size] << 36; + uint64_t hex = (uint64_t)load_lane_identity[memory_size] << 36; // unsigned hex |= (1ull << 39); @@ -579,10 +680,10 @@ va_pack_load(const bi_instr *I, bool buffer_descriptor) if (!buffer_descriptor) hex |= va_pack_byte_offset(I); - hex |= (uint64_t) va_pack_src(I, 0) << 0; + hex |= 
(uint64_t)va_pack_src(I, 0) << 0; if (buffer_descriptor) - hex |= (uint64_t) va_pack_src(I, 1) << 8; + hex |= (uint64_t)va_pack_src(I, 1) << 8; return hex; } @@ -591,10 +692,14 @@ static uint64_t va_pack_memory_access(const bi_instr *I) { switch (I->seg) { - case BI_SEG_TL: return VA_MEMORY_ACCESS_FORCE; - case BI_SEG_POS: return VA_MEMORY_ACCESS_ISTREAM; - case BI_SEG_VARY: return VA_MEMORY_ACCESS_ESTREAM; - default: return VA_MEMORY_ACCESS_NONE; + case BI_SEG_TL: + return VA_MEMORY_ACCESS_FORCE; + case BI_SEG_POS: + return VA_MEMORY_ACCESS_ISTREAM; + case BI_SEG_VARY: + return VA_MEMORY_ACCESS_ESTREAM; + default: + return VA_MEMORY_ACCESS_NONE; } } @@ -604,7 +709,7 @@ va_pack_store(const bi_instr *I) uint64_t hex = va_pack_memory_access(I) << 24; va_validate_register_pair(I, 1); - hex |= (uint64_t) va_pack_src(I, 1) << 0; + hex |= (uint64_t)va_pack_src(I, 1) << 0; hex |= va_pack_byte_offset(I); @@ -615,11 +720,16 @@ static enum va_lod_mode va_pack_lod_mode(const bi_instr *I) { switch (I->va_lod_mode) { - case BI_VA_LOD_MODE_ZERO_LOD: return VA_LOD_MODE_ZERO; - case BI_VA_LOD_MODE_COMPUTED_LOD: return VA_LOD_MODE_COMPUTED; - case BI_VA_LOD_MODE_EXPLICIT: return VA_LOD_MODE_EXPLICIT; - case BI_VA_LOD_MODE_COMPUTED_BIAS: return VA_LOD_MODE_COMPUTED_BIAS; - case BI_VA_LOD_MODE_GRDESC: return VA_LOD_MODE_GRDESC; + case BI_VA_LOD_MODE_ZERO_LOD: + return VA_LOD_MODE_ZERO; + case BI_VA_LOD_MODE_COMPUTED_LOD: + return VA_LOD_MODE_COMPUTED; + case BI_VA_LOD_MODE_EXPLICIT: + return VA_LOD_MODE_EXPLICIT; + case BI_VA_LOD_MODE_COMPUTED_BIAS: + return VA_LOD_MODE_COMPUTED_BIAS; + case BI_VA_LOD_MODE_GRDESC: + return VA_LOD_MODE_GRDESC; } invalid_instruction(I, "LOD mode"); @@ -650,14 +760,22 @@ static enum va_register_format va_pack_register_format(const bi_instr *I) { switch (I->register_format) { - case BI_REGISTER_FORMAT_AUTO: return VA_REGISTER_FORMAT_AUTO; - case BI_REGISTER_FORMAT_F32: return VA_REGISTER_FORMAT_F32; - case BI_REGISTER_FORMAT_F16: return VA_REGISTER_FORMAT_F16; - case BI_REGISTER_FORMAT_S32: return VA_REGISTER_FORMAT_S32; - case BI_REGISTER_FORMAT_S16: return VA_REGISTER_FORMAT_S16; - case BI_REGISTER_FORMAT_U32: return VA_REGISTER_FORMAT_U32; - case BI_REGISTER_FORMAT_U16: return VA_REGISTER_FORMAT_U16; - default: invalid_instruction(I, "register format"); + case BI_REGISTER_FORMAT_AUTO: + return VA_REGISTER_FORMAT_AUTO; + case BI_REGISTER_FORMAT_F32: + return VA_REGISTER_FORMAT_F32; + case BI_REGISTER_FORMAT_F16: + return VA_REGISTER_FORMAT_F16; + case BI_REGISTER_FORMAT_S32: + return VA_REGISTER_FORMAT_S32; + case BI_REGISTER_FORMAT_S16: + return VA_REGISTER_FORMAT_S16; + case BI_REGISTER_FORMAT_U32: + return VA_REGISTER_FORMAT_U32; + case BI_REGISTER_FORMAT_U16: + return VA_REGISTER_FORMAT_U16; + default: + invalid_instruction(I, "register format"); } } @@ -666,35 +784,34 @@ va_pack_instr(const bi_instr *I) { struct va_opcode_info info = valhall_opcodes[I->op]; - uint64_t hex = info.exact | (((uint64_t) I->flow) << 59); - hex |= ((uint64_t) va_select_fau_page(I)) << 57; + uint64_t hex = info.exact | (((uint64_t)I->flow) << 59); + hex |= ((uint64_t)va_select_fau_page(I)) << 57; if (info.slot) - hex |= ((uint64_t) I->slot << 30); + hex |= ((uint64_t)I->slot << 30); if (info.sr_count) { bool read = bi_opcode_props[I->op].sr_read; bi_index sr = read ? I->src[0] : I->dest[0]; - unsigned count = read ? - bi_count_read_registers(I, 0) : - bi_count_write_registers(I, 0); + unsigned count = + read ? 
bi_count_read_registers(I, 0) : bi_count_write_registers(I, 0); - hex |= ((uint64_t) count << 33); - hex |= (uint64_t) va_pack_reg(I, sr) << 40; - hex |= ((uint64_t) info.sr_control << 46); + hex |= ((uint64_t)count << 33); + hex |= (uint64_t)va_pack_reg(I, sr) << 40; + hex |= ((uint64_t)info.sr_control << 46); } if (info.sr_write_count) { - hex |= ((uint64_t) bi_count_write_registers(I, 0) - 1) << 36; - hex |= ((uint64_t) va_pack_reg(I, I->dest[0])) << 16; + hex |= ((uint64_t)bi_count_write_registers(I, 0) - 1) << 36; + hex |= ((uint64_t)va_pack_reg(I, I->dest[0])) << 16; } if (info.vecsize) - hex |= ((uint64_t) I->vecsize << 28); + hex |= ((uint64_t)I->vecsize << 28); if (info.register_format) - hex |= ((uint64_t) va_pack_register_format(I)) << 24; + hex |= ((uint64_t)va_pack_register_format(I)) << 24; switch (I->op) { case BI_OPCODE_LOAD_I8: @@ -738,18 +855,18 @@ va_pack_instr(const bi_instr *I) /* 64-bit source */ va_validate_register_pair(I, 0); - hex |= (uint64_t) va_pack_src(I, 0) << 0; + hex |= (uint64_t)va_pack_src(I, 0) << 0; hex |= va_pack_byte_offset_8(I); - hex |= ((uint64_t) va_pack_atom_opc_1(I)) << 22; + hex |= ((uint64_t)va_pack_atom_opc_1(I)) << 22; break; case BI_OPCODE_ATOM_I32: case BI_OPCODE_ATOM_RETURN_I32: /* 64-bit source */ va_validate_register_pair(I, 1); - hex |= (uint64_t) va_pack_src(I, 1) << 0; + hex |= (uint64_t)va_pack_src(I, 1) << 0; hex |= va_pack_byte_offset_8(I); - hex |= ((uint64_t) va_pack_atom_opc(I)) << 22; + hex |= ((uint64_t)va_pack_atom_opc(I)) << 22; if (I->op == BI_OPCODE_ATOM_RETURN_I32) hex |= (0xc0ull << 40); // flags @@ -764,56 +881,61 @@ va_pack_instr(const bi_instr *I) hex |= va_pack_store(I); /* Conversion descriptor */ - hex |= (uint64_t) va_pack_src(I, 3) << 16; + hex |= (uint64_t)va_pack_src(I, 3) << 16; break; - case BI_OPCODE_BLEND: - { + case BI_OPCODE_BLEND: { /* Source 0 - Blend descriptor (64-bit) */ - hex |= ((uint64_t) va_pack_src(I, 2)) << 0; + hex |= ((uint64_t)va_pack_src(I, 2)) << 0; va_validate_register_pair(I, 2); /* Target */ - if (I->branch_offset & 0x7) invalid_instruction(I, "unaligned branch"); + if (I->branch_offset & 0x7) + invalid_instruction(I, "unaligned branch"); hex |= ((I->branch_offset >> 3) << 8); /* Source 2 - coverage mask */ - hex |= ((uint64_t) va_pack_reg(I, I->src[1])) << 16; + hex |= ((uint64_t)va_pack_reg(I, I->src[1])) << 16; /* Vector size */ unsigned vecsize = 4; - hex |= ((uint64_t) (vecsize - 1) << 28); + hex |= ((uint64_t)(vecsize - 1) << 28); break; } case BI_OPCODE_TEX_SINGLE: case BI_OPCODE_TEX_FETCH: - case BI_OPCODE_TEX_GATHER: - { + case BI_OPCODE_TEX_GATHER: { /* Image to read from */ - hex |= ((uint64_t) va_pack_src(I, 1)) << 0; + hex |= ((uint64_t)va_pack_src(I, 1)) << 0; if (I->op == BI_OPCODE_TEX_FETCH && I->shadow) invalid_instruction(I, "TEX_FETCH does not support .shadow"); - if (I->array_enable) hex |= (1ull << 10); - if (I->texel_offset) hex |= (1ull << 11); - if (I->shadow) hex |= (1ull << 12); - if (I->skip) hex |= (1ull << 39); - if (!bi_is_regfmt_16(I->register_format)) hex |= (1ull << 46); + if (I->array_enable) + hex |= (1ull << 10); + if (I->texel_offset) + hex |= (1ull << 11); + if (I->shadow) + hex |= (1ull << 12); + if (I->skip) + hex |= (1ull << 39); + if (!bi_is_regfmt_16(I->register_format)) + hex |= (1ull << 46); if (I->op == BI_OPCODE_TEX_SINGLE) - hex |= ((uint64_t) va_pack_lod_mode(I)) << 13; + hex |= ((uint64_t)va_pack_lod_mode(I)) << 13; if (I->op == BI_OPCODE_TEX_GATHER) { - if (I->integer_coordinates) hex |= (1 << 13); - hex |= ((uint64_t) 
I->fetch_component) << 14; + if (I->integer_coordinates) + hex |= (1 << 13); + hex |= ((uint64_t)I->fetch_component) << 14; } hex |= (I->write_mask << 22); - hex |= ((uint64_t) va_pack_register_type(I)) << 26; - hex |= ((uint64_t) I->dimension) << 28; + hex |= ((uint64_t)va_pack_register_type(I)) << 26; + hex |= ((uint64_t)I->dimension) << 28; break; } diff --git a/src/panfrost/bifrost/valhall/va_perf.c b/src/panfrost/bifrost/valhall/va_perf.c index 7175302bf25..29a1424162f 100644 --- a/src/panfrost/bifrost/valhall/va_perf.c +++ b/src/panfrost/bifrost/valhall/va_perf.c @@ -22,9 +22,9 @@ * SOFTWARE. */ +#include "bi_builder.h" #include "va_compiler.h" #include "valhall.h" -#include "bi_builder.h" void va_count_instr_stats(bi_instr *I, struct va_stats *stats) @@ -48,8 +48,8 @@ va_count_instr_stats(bi_instr *I, struct va_stats *stats) /* Varying is scaled by 16-bit components interpolated */ case VA_UNIT_V: - stats->v += (I->vecsize + 1) * - (bi_is_regfmt_16(I->register_format) ? 1 : 2); + stats->v += + (I->vecsize + 1) * (bi_is_regfmt_16(I->register_format) ? 1 : 2); return; /* We just count load/store and texturing for now */ diff --git a/src/panfrost/bifrost/valhall/va_validate.c b/src/panfrost/bifrost/valhall/va_validate.c index 847a92a0867..0afa3dfccac 100644 --- a/src/panfrost/bifrost/valhall/va_validate.c +++ b/src/panfrost/bifrost/valhall/va_validate.c @@ -21,15 +21,16 @@ * SOFTWARE. */ +#include "bi_builder.h" #include "va_compiler.h" #include "valhall.h" -#include "bi_builder.h" /* Valhall has limits on access to fast-access uniforms: * * An instruction may access no more than a single 64-bit uniform slot. - * An instruction may access no more than 64-bits of combined uniforms and constants. - * An instruction may access no more than a single special immediate (e.g. lane_id). + * An instruction may access no more than 64-bits of combined uniforms and + * constants. An instruction may access no more than a single special immediate + * (e.g. lane_id). * * We validate these constraints. 
* @@ -114,7 +115,7 @@ bool va_validate_fau(bi_instr *I) { bool valid = true; - struct fau_state fau = { .uniform_slot = -1 }; + struct fau_state fau = {.uniform_slot = -1}; unsigned fau_page = va_select_fau_page(I); bi_foreach_src(I, s) { @@ -127,7 +128,7 @@ va_validate_fau(bi_instr *I) void va_repair_fau(bi_builder *b, bi_instr *I) { - struct fau_state fau = { .uniform_slot = -1 }; + struct fau_state fau = {.uniform_slot = -1}; unsigned fau_page = va_select_fau_page(I); bi_foreach_src(I, s) { diff --git a/src/panfrost/bifrost/valhall/valhall.h b/src/panfrost/bifrost/valhall/valhall.h index f3fcc1ce435..14442946664 100644 --- a/src/panfrost/bifrost/valhall/valhall.h +++ b/src/panfrost/bifrost/valhall/valhall.h @@ -73,43 +73,42 @@ enum va_unit { }; struct va_src_info { - bool absneg : 1; - bool swizzle : 1; - bool notted : 1; - bool lane : 1; - bool lanes : 1; - bool halfswizzle : 1; - bool widen : 1; - bool combine : 1; + bool absneg : 1; + bool swizzle : 1; + bool notted : 1; + bool lane : 1; + bool lanes : 1; + bool halfswizzle : 1; + bool widen : 1; + bool combine : 1; enum va_size size : 2; } __attribute__((packed)); struct va_opcode_info { uint64_t exact; struct va_src_info srcs[4]; - uint8_t type_size : 8; - enum va_unit unit : 3; - unsigned nr_srcs : 3; - unsigned nr_staging_srcs : 2; + uint8_t type_size : 8; + enum va_unit unit : 3; + unsigned nr_srcs : 3; + unsigned nr_staging_srcs : 2; unsigned nr_staging_dests : 2; - bool has_dest : 1; - bool is_signed : 1; - bool clamp : 1; - bool saturate : 1; - bool rhadd : 1; - bool round_mode : 1; - bool condition : 1; - bool result_type : 1; - bool vecsize : 1; - bool register_format : 1; - bool slot : 1; - bool sr_count : 1; - bool sr_write_count : 1; - unsigned sr_control : 2; + bool has_dest : 1; + bool is_signed : 1; + bool clamp : 1; + bool saturate : 1; + bool rhadd : 1; + bool round_mode : 1; + bool condition : 1; + bool result_type : 1; + bool vecsize : 1; + bool register_format : 1; + bool slot : 1; + bool sr_count : 1; + bool sr_write_count : 1; + unsigned sr_control : 2; }; -extern const struct va_opcode_info -valhall_opcodes[BI_NUM_OPCODES]; +extern const struct va_opcode_info valhall_opcodes[BI_NUM_OPCODES]; /* Bifrost specifies the source of bitwise operations as (A, B, shift), but * Valhall specifies (A, shift, B). We follow Bifrost conventions in the diff --git a/src/panfrost/drm-shim/panfrost_noop.c b/src/panfrost/drm-shim/panfrost_noop.c index 1b3d50c5e5d..bf3e97d17fb 100644 --- a/src/panfrost/drm-shim/panfrost_noop.c +++ b/src/panfrost/drm-shim/panfrost_noop.c @@ -47,8 +47,7 @@ pan_ioctl_get_param(int fd, unsigned long request, void *arg) struct drm_panfrost_get_param *gp = arg; switch (gp->param) { - case DRM_PANFROST_PARAM_GPU_PROD_ID: - { + case DRM_PANFROST_PARAM_GPU_PROD_ID: { char *override_version = getenv("PAN_GPU_ID"); if (override_version) diff --git a/src/panfrost/ds/pan_pps_driver.h b/src/panfrost/ds/pan_pps_driver.h index f6476d9dee3..9392b9a5673 100644 --- a/src/panfrost/ds/pan_pps_driver.h +++ b/src/panfrost/ds/pan_pps_driver.h @@ -13,22 +13,21 @@ #include "pan_pps_perf.h" -namespace pps -{ +namespace pps { /// @brief Panfrost implementation of PPS driver. -/// This driver queries the GPU through `drm/panfrost_drm.h`, using performance counters ioctls, -/// which can be enabled by setting a kernel parameter: `modprobe panfrost unstable_ioctls=1`. -/// The ioctl needs a buffer to copy data from kernel to user space. 
-class PanfrostDriver : public Driver -{ - public: +/// This driver queries the GPU through `drm/panfrost_drm.h`, using performance +/// counters ioctls, which can be enabled by setting a kernel parameter: +/// `modprobe panfrost unstable_ioctls=1`. The ioctl needs a buffer to copy data +/// from kernel to user space. +class PanfrostDriver : public Driver { + public: static inline PanfrostDriver &into(Driver &dri); static inline const PanfrostDriver &into(const Driver &dri); /// @param A list of mali counter names /// @return A pair with two lists: counter groups and available counters - static std::pair<std::vector<CounterGroup>, std::vector<Counter>> create_available_counters( - const PanfrostPerf& perf); + static std::pair<std::vector<CounterGroup>, std::vector<Counter>> + create_available_counters(const PanfrostPerf &perf); PanfrostDriver(); ~PanfrostDriver(); @@ -50,12 +49,14 @@ class PanfrostDriver : public Driver std::unique_ptr<PanfrostPerf> perf = nullptr; }; -PanfrostDriver &PanfrostDriver::into(Driver &dri) +PanfrostDriver & +PanfrostDriver::into(Driver &dri) { return reinterpret_cast<PanfrostDriver &>(dri); } -const PanfrostDriver &PanfrostDriver::into(const Driver &dri) +const PanfrostDriver & +PanfrostDriver::into(const Driver &dri) { return reinterpret_cast<const PanfrostDriver &>(dri); } diff --git a/src/panfrost/ds/pan_pps_perf.h b/src/panfrost/ds/pan_pps_perf.h index 48ae2f58e53..c046e09b0df 100644 --- a/src/panfrost/ds/pan_pps_perf.h +++ b/src/panfrost/ds/pan_pps_perf.h @@ -10,35 +10,32 @@ struct panfrost_device; struct panfrost_perf; -namespace pps -{ -class PanfrostDevice -{ - public: +namespace pps { +class PanfrostDevice { + public: PanfrostDevice(int fd); ~PanfrostDevice(); PanfrostDevice(const PanfrostDevice &) = delete; PanfrostDevice &operator=(const PanfrostDevice &) = delete; - PanfrostDevice(PanfrostDevice&&); - PanfrostDevice& operator=(PanfrostDevice&&); + PanfrostDevice(PanfrostDevice &&); + PanfrostDevice &operator=(PanfrostDevice &&); void *ctx = nullptr; - struct panfrost_device* dev = nullptr; + struct panfrost_device *dev = nullptr; }; -class PanfrostPerf -{ - public: - PanfrostPerf(const PanfrostDevice& dev); +class PanfrostPerf { + public: + PanfrostPerf(const PanfrostDevice &dev); ~PanfrostPerf(); PanfrostPerf(const PanfrostPerf &) = delete; PanfrostPerf &operator=(const PanfrostPerf &) = delete; - PanfrostPerf(PanfrostPerf&&); - PanfrostPerf& operator=(PanfrostPerf&&); + PanfrostPerf(PanfrostPerf &&); + PanfrostPerf &operator=(PanfrostPerf &&); int enable() const; void disable() const; diff --git a/src/panfrost/include/panfrost-job.h b/src/panfrost/include/panfrost-job.h index 19e537fcebf..fe09389f630 100644 --- a/src/panfrost/include/panfrost-job.h +++ b/src/panfrost/include/panfrost-job.h @@ -28,11 +28,11 @@ #ifndef __PANFROST_JOB_H__ #define __PANFROST_JOB_H__ -#include -#include #include +#include +#include -typedef uint8_t u8; +typedef uint8_t u8; typedef uint16_t u16; typedef uint32_t u32; typedef uint64_t u64; @@ -68,13 +68,13 @@ typedef uint64_t mali_ptr; /* These formats seem to largely duplicate the others. They're used at least * for Bifrost framebuffer output. */ -#define MALI_FORMAT_SPECIAL2 (7 << 5) -#define MALI_EXTRACT_TYPE(fmt) ((fmt) & 0xe0) +#define MALI_FORMAT_SPECIAL2 (7 << 5) +#define MALI_EXTRACT_TYPE(fmt) ((fmt)&0xe0) /* If the high 3 bits are 3 to 6 these two bits say how many components * there are.
*/ -#define MALI_NR_CHANNELS(n) ((n - 1) << 3) +#define MALI_NR_CHANNELS(n) ((n - 1) << 3) #define MALI_EXTRACT_CHANNELS(fmt) ((((fmt) >> 3) & 3) + 1) /* If the high 3 bits are 3 to 6, then the low 3 bits say how big each @@ -93,7 +93,7 @@ typedef uint64_t mali_ptr; /* For MALI_FORMAT_SINT it means a half-float (e.g. RG16F). For * MALI_FORMAT_UNORM, it means a 32-bit float. */ -#define MALI_CHANNEL_FLOAT 7 +#define MALI_CHANNEL_FLOAT 7 #define MALI_EXTRACT_BITS(fmt) (fmt & 0x7) #define MALI_EXTRACT_INDEX(pixfmt) (((pixfmt) >> 12) & 0xFF) @@ -241,18 +241,18 @@ typedef uint64_t mali_ptr; /* Used for lod encoding. Thanks @urjaman for pointing out these routines can * be cleaned up a lot. */ -#define DECODE_FIXED_16(x) ((float) (x / 256.0)) +#define DECODE_FIXED_16(x) ((float)(x / 256.0)) static inline int16_t FIXED_16(float x, bool allow_negative) { - /* Clamp inputs, accounting for float error */ - float max_lod = (32.0 - (1.0 / 512.0)); - float min_lod = allow_negative ? -max_lod : 0.0; + /* Clamp inputs, accounting for float error */ + float max_lod = (32.0 - (1.0 / 512.0)); + float min_lod = allow_negative ? -max_lod : 0.0; - x = ((x > max_lod) ? max_lod : ((x < min_lod) ? min_lod : x)); + x = ((x > max_lod) ? max_lod : ((x < min_lod) ? min_lod : x)); - return (int) (x * 256.0); + return (int)(x * 256.0); } #endif /* __PANFROST_JOB_H__ */ diff --git a/src/panfrost/lib/genxml/decode.c b/src/panfrost/lib/genxml/decode.c index 9f942505e04..aa92af16d2e 100644 --- a/src/panfrost/lib/genxml/decode.c +++ b/src/panfrost/lib/genxml/decode.c @@ -23,73 +23,77 @@ * SOFTWARE. */ -#include +#include "decode.h" +#include +#include +#include +#include +#include #include #include -#include -#include -#include -#include -#include -#include "decode.h" +#include -#include "util/set.h" -#include "midgard/disassemble.h" #include "bifrost/disassemble.h" #include "bifrost/valhall/disassemble.h" +#include "midgard/disassemble.h" +#include "util/set.h" -#define DUMP_UNPACKED(T, var, ...) { \ - pandecode_log(__VA_ARGS__); \ - pan_print(pandecode_dump_stream, T, var, (pandecode_indent + 1) * 2); \ -} +#define DUMP_UNPACKED(T, var, ...) \ + { \ + pandecode_log(__VA_ARGS__); \ + pan_print(pandecode_dump_stream, T, var, (pandecode_indent + 1) * 2); \ + } -#define DUMP_CL(T, cl, ...) {\ - pan_unpack(cl, T, temp); \ - DUMP_UNPACKED(T, temp, __VA_ARGS__); \ -} +#define DUMP_CL(T, cl, ...) \ + { \ + pan_unpack(cl, T, temp); \ + DUMP_UNPACKED(T, temp, __VA_ARGS__); \ + } -#define DUMP_SECTION(A, S, cl, ...) { \ - pan_section_unpack(cl, A, S, temp); \ - pandecode_log(__VA_ARGS__); \ - pan_section_print(pandecode_dump_stream, A, S, temp, (pandecode_indent + 1) * 2); \ -} +#define DUMP_SECTION(A, S, cl, ...) \ + { \ + pan_section_unpack(cl, A, S, temp); \ + pandecode_log(__VA_ARGS__); \ + pan_section_print(pandecode_dump_stream, A, S, temp, \ + (pandecode_indent + 1) * 2); \ + } -#define MAP_ADDR(T, addr, cl) \ - const uint8_t *cl = pandecode_fetch_gpu_mem(addr, pan_size(T)); +#define MAP_ADDR(T, addr, cl) \ + const uint8_t *cl = pandecode_fetch_gpu_mem(addr, pan_size(T)); -#define DUMP_ADDR(T, addr, ...) {\ - MAP_ADDR(T, addr, cl) \ - DUMP_CL(T, cl, __VA_ARGS__); \ -} +#define DUMP_ADDR(T, addr, ...) 
\ + { \ + MAP_ADDR(T, addr, cl) \ + DUMP_CL(T, cl, __VA_ARGS__); \ + } static unsigned pandecode_indent = 0; static void pandecode_make_indent(void) { - for (unsigned i = 0; i < pandecode_indent; ++i) - fprintf(pandecode_dump_stream, " "); + for (unsigned i = 0; i < pandecode_indent; ++i) + fprintf(pandecode_dump_stream, " "); } -static void PRINTFLIKE(1, 2) -pandecode_log(const char *format, ...) +static void PRINTFLIKE(1, 2) pandecode_log(const char *format, ...) { - va_list ap; + va_list ap; - pandecode_make_indent(); - va_start(ap, format); - vfprintf(pandecode_dump_stream, format, ap); - va_end(ap); + pandecode_make_indent(); + va_start(ap, format); + vfprintf(pandecode_dump_stream, format, ap); + va_end(ap); } static void pandecode_log_cont(const char *format, ...) { - va_list ap; + va_list ap; - va_start(ap, format); - vfprintf(pandecode_dump_stream, format, ap); - va_end(ap); + va_start(ap, format); + vfprintf(pandecode_dump_stream, format, ap); + va_end(ap); } /* To check for memory safety issues, validates that the given pointer in GPU @@ -101,33 +105,33 @@ pandecode_log_cont(const char *format, ...) static void pandecode_validate_buffer(mali_ptr addr, size_t sz) { - if (!addr) { - pandecode_log("// XXX: null pointer deref\n"); - return; - } + if (!addr) { + pandecode_log("// XXX: null pointer deref\n"); + return; + } - /* Find a BO */ + /* Find a BO */ - struct pandecode_mapped_memory *bo = - pandecode_find_mapped_gpu_mem_containing(addr); + struct pandecode_mapped_memory *bo = + pandecode_find_mapped_gpu_mem_containing(addr); - if (!bo) { - pandecode_log("// XXX: invalid memory dereference\n"); - return; - } + if (!bo) { + pandecode_log("// XXX: invalid memory dereference\n"); + return; + } - /* Bounds check */ + /* Bounds check */ - unsigned offset = addr - bo->gpu_va; - unsigned total = offset + sz; + unsigned offset = addr - bo->gpu_va; + unsigned total = offset + sz; - if (total > bo->length) { - pandecode_log("// XXX: buffer overrun. " - "Chunk of size %zu at offset %d in buffer of size %zu. " - "Overrun by %zu bytes. \n", - sz, offset, bo->length, total - bo->length); - return; - } + if (total > bo->length) { + pandecode_log("// XXX: buffer overrun. " + "Chunk of size %zu at offset %d in buffer of size %zu. " + "Overrun by %zu bytes. 
\n", + sz, offset, bo->length, total - bo->length); + return; + } } #if PAN_ARCH <= 5 @@ -135,28 +139,27 @@ pandecode_validate_buffer(mali_ptr addr, size_t sz) * larger FBD */ static void -pandecode_midgard_tiler_descriptor( - const struct mali_tiler_context_packed *tp, - const struct mali_tiler_weights_packed *wp) +pandecode_midgard_tiler_descriptor(const struct mali_tiler_context_packed *tp, + const struct mali_tiler_weights_packed *wp) { - pan_unpack(tp, TILER_CONTEXT, t); - DUMP_UNPACKED(TILER_CONTEXT, t, "Tiler:\n"); + pan_unpack(tp, TILER_CONTEXT, t); + DUMP_UNPACKED(TILER_CONTEXT, t, "Tiler:\n"); - /* We've never seen weights used in practice, but they exist */ - pan_unpack(wp, TILER_WEIGHTS, w); - bool nonzero_weights = false; + /* We've never seen weights used in practice, but they exist */ + pan_unpack(wp, TILER_WEIGHTS, w); + bool nonzero_weights = false; - nonzero_weights |= w.weight0 != 0x0; - nonzero_weights |= w.weight1 != 0x0; - nonzero_weights |= w.weight2 != 0x0; - nonzero_weights |= w.weight3 != 0x0; - nonzero_weights |= w.weight4 != 0x0; - nonzero_weights |= w.weight5 != 0x0; - nonzero_weights |= w.weight6 != 0x0; - nonzero_weights |= w.weight7 != 0x0; + nonzero_weights |= w.weight0 != 0x0; + nonzero_weights |= w.weight1 != 0x0; + nonzero_weights |= w.weight2 != 0x0; + nonzero_weights |= w.weight3 != 0x0; + nonzero_weights |= w.weight4 != 0x0; + nonzero_weights |= w.weight5 != 0x0; + nonzero_weights |= w.weight6 != 0x0; + nonzero_weights |= w.weight7 != 0x0; - if (nonzero_weights) - DUMP_UNPACKED(TILER_WEIGHTS, w, "Tiler Weights:\n"); + if (nonzero_weights) + DUMP_UNPACKED(TILER_WEIGHTS, w, "Tiler Weights:\n"); } #endif @@ -164,25 +167,27 @@ pandecode_midgard_tiler_descriptor( static void pandecode_local_storage(uint64_t gpu_va) { - const struct mali_local_storage_packed *PANDECODE_PTR_VAR(s, (mali_ptr) gpu_va); - DUMP_CL(LOCAL_STORAGE, s, "Local Storage:\n"); + const struct mali_local_storage_packed *PANDECODE_PTR_VAR(s, + (mali_ptr)gpu_va); + DUMP_CL(LOCAL_STORAGE, s, "Local Storage:\n"); } static void pandecode_render_target(uint64_t gpu_va, unsigned gpu_id, const struct MALI_FRAMEBUFFER_PARAMETERS *fb) { - pandecode_log("Color Render Targets:\n"); - pandecode_indent++; + pandecode_log("Color Render Targets:\n"); + pandecode_indent++; - for (int i = 0; i < (fb->render_target_count); i++) { - mali_ptr rt_va = gpu_va + i * pan_size(RENDER_TARGET); - const struct mali_render_target_packed *PANDECODE_PTR_VAR(rtp, (mali_ptr) rt_va); - DUMP_CL(RENDER_TARGET, rtp, "Color Render Target %d:\n", i); - } + for (int i = 0; i < (fb->render_target_count); i++) { + mali_ptr rt_va = gpu_va + i * pan_size(RENDER_TARGET); + const struct mali_render_target_packed *PANDECODE_PTR_VAR( + rtp, (mali_ptr)rt_va); + DUMP_CL(RENDER_TARGET, rtp, "Color Render Target %d:\n", i); + } - pandecode_indent--; - pandecode_log("\n"); + pandecode_indent--; + pandecode_log("\n"); } #endif @@ -190,156 +195,158 @@ pandecode_render_target(uint64_t gpu_va, unsigned gpu_id, static void pandecode_sample_locations(const void *fb) { - pan_section_unpack(fb, FRAMEBUFFER, PARAMETERS, params); + pan_section_unpack(fb, FRAMEBUFFER, PARAMETERS, params); - const u16 *PANDECODE_PTR_VAR(samples, params.sample_locations); + const u16 *PANDECODE_PTR_VAR(samples, params.sample_locations); - pandecode_log("Sample locations:\n"); - for (int i = 0; i < 33; i++) { - pandecode_log(" (%d, %d),\n", - samples[2 * i] - 128, - samples[2 * i + 1] - 128); - } + pandecode_log("Sample locations:\n"); + for (int i = 0; i < 33; i++) { + 
pandecode_log(" (%d, %d),\n", samples[2 * i] - 128, + samples[2 * i + 1] - 128); + } } #endif -static void -pandecode_dcd(const struct MALI_DRAW *p, enum mali_job_type job_type, - unsigned gpu_id); +static void pandecode_dcd(const struct MALI_DRAW *p, + enum mali_job_type job_type, unsigned gpu_id); /* Information about the framebuffer passed back for additional analysis */ struct pandecode_fbd { - unsigned rt_count; - bool has_extra; + unsigned rt_count; + bool has_extra; }; static struct pandecode_fbd pandecode_fbd(uint64_t gpu_va, bool is_fragment, unsigned gpu_id) { #if PAN_ARCH >= 5 - /* We only see MFBDs on architectures that support them */ - assert(gpu_va & MALI_FBD_TAG_IS_MFBD); - gpu_va &= ~MALI_FBD_TAG_MASK; + /* We only see MFBDs on architectures that support them */ + assert(gpu_va & MALI_FBD_TAG_IS_MFBD); + gpu_va &= ~MALI_FBD_TAG_MASK; #endif - const void *PANDECODE_PTR_VAR(fb, (mali_ptr) gpu_va); - pan_section_unpack(fb, FRAMEBUFFER, PARAMETERS, params); + const void *PANDECODE_PTR_VAR(fb, (mali_ptr)gpu_va); + pan_section_unpack(fb, FRAMEBUFFER, PARAMETERS, params); #if PAN_ARCH >= 6 - pandecode_sample_locations(fb); + pandecode_sample_locations(fb); - unsigned dcd_size = pan_size(DRAW); + unsigned dcd_size = pan_size(DRAW); - if (params.pre_frame_0 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { - const void *PANDECODE_PTR_VAR(dcd, params.frame_shader_dcds + (0 * dcd_size)); - pan_unpack(dcd, DRAW, draw); - pandecode_log("Pre frame 0:\n"); - pandecode_dcd(&draw, MALI_JOB_TYPE_FRAGMENT, gpu_id); - } + if (params.pre_frame_0 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { + const void *PANDECODE_PTR_VAR(dcd, + params.frame_shader_dcds + (0 * dcd_size)); + pan_unpack(dcd, DRAW, draw); + pandecode_log("Pre frame 0:\n"); + pandecode_dcd(&draw, MALI_JOB_TYPE_FRAGMENT, gpu_id); + } - if (params.pre_frame_1 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { - const void *PANDECODE_PTR_VAR(dcd, params.frame_shader_dcds + (1 * dcd_size)); - pan_unpack(dcd, DRAW, draw); - pandecode_log("Pre frame 1:\n"); - pandecode_dcd(&draw, MALI_JOB_TYPE_FRAGMENT, gpu_id); - } + if (params.pre_frame_1 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { + const void *PANDECODE_PTR_VAR(dcd, + params.frame_shader_dcds + (1 * dcd_size)); + pan_unpack(dcd, DRAW, draw); + pandecode_log("Pre frame 1:\n"); + pandecode_dcd(&draw, MALI_JOB_TYPE_FRAGMENT, gpu_id); + } - if (params.post_frame != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { - const void *PANDECODE_PTR_VAR(dcd, params.frame_shader_dcds + (2 * dcd_size)); - pan_unpack(dcd, DRAW, draw); - pandecode_log("Post frame:\n"); - pandecode_dcd(&draw, MALI_JOB_TYPE_FRAGMENT, gpu_id); - } + if (params.post_frame != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) { + const void *PANDECODE_PTR_VAR(dcd, + params.frame_shader_dcds + (2 * dcd_size)); + pan_unpack(dcd, DRAW, draw); + pandecode_log("Post frame:\n"); + pandecode_dcd(&draw, MALI_JOB_TYPE_FRAGMENT, gpu_id); + } #else - DUMP_SECTION(FRAMEBUFFER, LOCAL_STORAGE, fb, "Local Storage:\n"); + DUMP_SECTION(FRAMEBUFFER, LOCAL_STORAGE, fb, "Local Storage:\n"); - const void *t = pan_section_ptr(fb, FRAMEBUFFER, TILER); - const void *w = pan_section_ptr(fb, FRAMEBUFFER, TILER_WEIGHTS); - pandecode_midgard_tiler_descriptor(t, w); + const void *t = pan_section_ptr(fb, FRAMEBUFFER, TILER); + const void *w = pan_section_ptr(fb, FRAMEBUFFER, TILER_WEIGHTS); + pandecode_midgard_tiler_descriptor(t, w); #endif - pandecode_log("Framebuffer:\n"); - pandecode_indent++; + pandecode_log("Framebuffer:\n"); + pandecode_indent++; - 
DUMP_UNPACKED(FRAMEBUFFER_PARAMETERS, params, "Parameters:\n"); + DUMP_UNPACKED(FRAMEBUFFER_PARAMETERS, params, "Parameters:\n"); - pandecode_indent--; - pandecode_log("\n"); + pandecode_indent--; + pandecode_log("\n"); #if PAN_ARCH >= 5 - gpu_va += pan_size(FRAMEBUFFER); + gpu_va += pan_size(FRAMEBUFFER); - if (params.has_zs_crc_extension) { - const struct mali_zs_crc_extension_packed *PANDECODE_PTR_VAR(zs_crc, (mali_ptr)gpu_va); - DUMP_CL(ZS_CRC_EXTENSION, zs_crc, "ZS CRC Extension:\n"); - pandecode_log("\n"); + if (params.has_zs_crc_extension) { + const struct mali_zs_crc_extension_packed *PANDECODE_PTR_VAR( + zs_crc, (mali_ptr)gpu_va); + DUMP_CL(ZS_CRC_EXTENSION, zs_crc, "ZS CRC Extension:\n"); + pandecode_log("\n"); - gpu_va += pan_size(ZS_CRC_EXTENSION); - } + gpu_va += pan_size(ZS_CRC_EXTENSION); + } - if (is_fragment) - pandecode_render_target(gpu_va, gpu_id, &params); + if (is_fragment) + pandecode_render_target(gpu_va, gpu_id, &params); - return (struct pandecode_fbd) { - .rt_count = params.render_target_count, - .has_extra = params.has_zs_crc_extension, - }; + return (struct pandecode_fbd){ + .rt_count = params.render_target_count, + .has_extra = params.has_zs_crc_extension, + }; #else - /* Dummy unpack of the padding section to make sure all words are 0. - * No need to call print here since the section is supposed to be empty. - */ - pan_section_unpack(fb, FRAMEBUFFER, PADDING_1, padding1); - pan_section_unpack(fb, FRAMEBUFFER, PADDING_2, padding2); + /* Dummy unpack of the padding section to make sure all words are 0. + * No need to call print here since the section is supposed to be empty. + */ + pan_section_unpack(fb, FRAMEBUFFER, PADDING_1, padding1); + pan_section_unpack(fb, FRAMEBUFFER, PADDING_2, padding2); - return (struct pandecode_fbd) { - .rt_count = 1, - }; + return (struct pandecode_fbd){ + .rt_count = 1, + }; #endif } #if PAN_ARCH <= 7 static void -pandecode_attributes(mali_ptr addr, int count, - bool varying, enum mali_job_type job_type) +pandecode_attributes(mali_ptr addr, int count, bool varying, + enum mali_job_type job_type) { - char *prefix = varying ?
"Varying" : "Attribute"; + assert(addr); - if (!count) { - pandecode_log("// warn: No %s records\n", prefix); - return; - } + if (!count) { + pandecode_log("// warn: No %s records\n", prefix); + return; + } - MAP_ADDR(ATTRIBUTE_BUFFER, addr, cl); + MAP_ADDR(ATTRIBUTE_BUFFER, addr, cl); - for (int i = 0; i < count; ++i) { - pan_unpack(cl + i * pan_size(ATTRIBUTE_BUFFER), ATTRIBUTE_BUFFER, temp); - DUMP_UNPACKED(ATTRIBUTE_BUFFER, temp, "%s:\n", prefix); + for (int i = 0; i < count; ++i) { + pan_unpack(cl + i * pan_size(ATTRIBUTE_BUFFER), ATTRIBUTE_BUFFER, temp); + DUMP_UNPACKED(ATTRIBUTE_BUFFER, temp, "%s:\n", prefix); - switch (temp.type) { - case MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR_WRITE_REDUCTION: - case MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR: { - pan_unpack(cl + (i + 1) * pan_size(ATTRIBUTE_BUFFER), - ATTRIBUTE_BUFFER_CONTINUATION_NPOT, temp2); - pan_print(pandecode_dump_stream, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, - temp2, (pandecode_indent + 1) * 2); - i++; - break; - } - case MALI_ATTRIBUTE_TYPE_3D_LINEAR: - case MALI_ATTRIBUTE_TYPE_3D_INTERLEAVED: { - pan_unpack(cl + (i + 1) * pan_size(ATTRIBUTE_BUFFER_CONTINUATION_3D), - ATTRIBUTE_BUFFER_CONTINUATION_3D, temp2); - pan_print(pandecode_dump_stream, ATTRIBUTE_BUFFER_CONTINUATION_3D, - temp2, (pandecode_indent + 1) * 2); - i++; - break; - } - default: - break; - } - } - pandecode_log("\n"); + switch (temp.type) { + case MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR_WRITE_REDUCTION: + case MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR: { + pan_unpack(cl + (i + 1) * pan_size(ATTRIBUTE_BUFFER), + ATTRIBUTE_BUFFER_CONTINUATION_NPOT, temp2); + pan_print(pandecode_dump_stream, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, + temp2, (pandecode_indent + 1) * 2); + i++; + break; + } + case MALI_ATTRIBUTE_TYPE_3D_LINEAR: + case MALI_ATTRIBUTE_TYPE_3D_INTERLEAVED: { + pan_unpack(cl + (i + 1) * pan_size(ATTRIBUTE_BUFFER_CONTINUATION_3D), + ATTRIBUTE_BUFFER_CONTINUATION_3D, temp2); + pan_print(pandecode_dump_stream, ATTRIBUTE_BUFFER_CONTINUATION_3D, + temp2, (pandecode_indent + 1) * 2); + i++; + break; + } + default: + break; + } + } + pandecode_log("\n"); } #endif @@ -347,15 +354,15 @@ pandecode_attributes(mali_ptr addr, int count, static mali_ptr pandecode_blend(void *descs, int rt_no, mali_ptr frag_shader) { - pan_unpack(descs + (rt_no * pan_size(BLEND)), BLEND, b); - DUMP_UNPACKED(BLEND, b, "Blend RT %d:\n", rt_no); + pan_unpack(descs + (rt_no * pan_size(BLEND)), BLEND, b); + DUMP_UNPACKED(BLEND, b, "Blend RT %d:\n", rt_no); #if PAN_ARCH >= 6 - if (b.internal.mode != MALI_BLEND_MODE_SHADER) - return 0; + if (b.internal.mode != MALI_BLEND_MODE_SHADER) + return 0; - return (frag_shader & 0xFFFFFFFF00000000ULL) | b.internal.shader.pc; + return (frag_shader & 0xFFFFFFFF00000000ULL) | b.internal.shader.pc; #else - return b.blend_shader ? (b.shader_pc & ~0xf) : 0; + return b.blend_shader ? (b.shader_pc & ~0xf) : 0; #endif } #endif @@ -364,207 +371,219 @@ pandecode_blend(void *descs, int rt_no, mali_ptr frag_shader) static unsigned pandecode_attribute_meta(int count, mali_ptr attribute, bool varying) { - unsigned max = 0; + unsigned max = 0; - for (int i = 0; i < count; ++i, attribute += pan_size(ATTRIBUTE)) { - MAP_ADDR(ATTRIBUTE, attribute, cl); - pan_unpack(cl, ATTRIBUTE, a); - DUMP_UNPACKED(ATTRIBUTE, a, "%s:\n", varying ? "Varying" : "Attribute"); - max = MAX2(max, a.buffer_index); - } + for (int i = 0; i < count; ++i, attribute += pan_size(ATTRIBUTE)) { + MAP_ADDR(ATTRIBUTE, attribute, cl); + pan_unpack(cl, ATTRIBUTE, a); + DUMP_UNPACKED(ATTRIBUTE, a, "%s:\n", varying ? 
"Varying" : "Attribute"); + max = MAX2(max, a.buffer_index); + } - pandecode_log("\n"); - return MIN2(max + 1, 256); + pandecode_log("\n"); + return MIN2(max + 1, 256); } /* return bits [lo, hi) of word */ static u32 bits(u32 word, u32 lo, u32 hi) { - if (hi - lo >= 32) - return word; // avoid undefined behavior with the shift + if (hi - lo >= 32) + return word; // avoid undefined behavior with the shift - if (lo >= 32) - return 0; + if (lo >= 32) + return 0; - return (word >> lo) & ((1 << (hi - lo)) - 1); + return (word >> lo) & ((1 << (hi - lo)) - 1); } static void pandecode_invocation(const void *i) { - /* Decode invocation_count. See the comment before the definition of - * invocation_count for an explanation. - */ - pan_unpack(i, INVOCATION, invocation); + /* Decode invocation_count. See the comment before the definition of + * invocation_count for an explanation. + */ + pan_unpack(i, INVOCATION, invocation); - unsigned size_x = bits(invocation.invocations, 0, invocation.size_y_shift) + 1; - unsigned size_y = bits(invocation.invocations, invocation.size_y_shift, invocation.size_z_shift) + 1; - unsigned size_z = bits(invocation.invocations, invocation.size_z_shift, invocation.workgroups_x_shift) + 1; + unsigned size_x = + bits(invocation.invocations, 0, invocation.size_y_shift) + 1; + unsigned size_y = bits(invocation.invocations, invocation.size_y_shift, + invocation.size_z_shift) + + 1; + unsigned size_z = bits(invocation.invocations, invocation.size_z_shift, + invocation.workgroups_x_shift) + + 1; - unsigned groups_x = bits(invocation.invocations, invocation.workgroups_x_shift, invocation.workgroups_y_shift) + 1; - unsigned groups_y = bits(invocation.invocations, invocation.workgroups_y_shift, invocation.workgroups_z_shift) + 1; - unsigned groups_z = bits(invocation.invocations, invocation.workgroups_z_shift, 32) + 1; + unsigned groups_x = + bits(invocation.invocations, invocation.workgroups_x_shift, + invocation.workgroups_y_shift) + + 1; + unsigned groups_y = + bits(invocation.invocations, invocation.workgroups_y_shift, + invocation.workgroups_z_shift) + + 1; + unsigned groups_z = + bits(invocation.invocations, invocation.workgroups_z_shift, 32) + 1; - pandecode_log("Invocation (%d, %d, %d) x (%d, %d, %d)\n", - size_x, size_y, size_z, - groups_x, groups_y, groups_z); + pandecode_log("Invocation (%d, %d, %d) x (%d, %d, %d)\n", size_x, size_y, + size_z, groups_x, groups_y, groups_z); - DUMP_UNPACKED(INVOCATION, invocation, "Invocation:\n") + DUMP_UNPACKED(INVOCATION, invocation, "Invocation:\n") } #endif static void pandecode_primitive(const void *p) { - pan_unpack(p, PRIMITIVE, primitive); - DUMP_UNPACKED(PRIMITIVE, primitive, "Primitive:\n"); + pan_unpack(p, PRIMITIVE, primitive); + DUMP_UNPACKED(PRIMITIVE, primitive, "Primitive:\n"); #if PAN_ARCH <= 7 - /* Validate an index buffer is present if we need one. TODO: verify - * relationship between invocation_count and index_count */ + /* Validate an index buffer is present if we need one. TODO: verify + * relationship between invocation_count and index_count */ - if (primitive.indices) { - /* Grab the size */ - unsigned size = (primitive.index_type == MALI_INDEX_TYPE_UINT32) ? - sizeof(uint32_t) : primitive.index_type; + if (primitive.indices) { + /* Grab the size */ + unsigned size = (primitive.index_type == MALI_INDEX_TYPE_UINT32) + ? 
sizeof(uint32_t) + : primitive.index_type; - /* Ensure we got a size, and if so, validate the index buffer - * is large enough to hold a full set of indices of the given - * size */ + /* Ensure we got a size, and if so, validate the index buffer + * is large enough to hold a full set of indices of the given + * size */ - if (!size) - pandecode_log("// XXX: index size missing\n"); - else - pandecode_validate_buffer(primitive.indices, primitive.index_count * size); - } else if (primitive.index_type) - pandecode_log("// XXX: unexpected index size\n"); + if (!size) + pandecode_log("// XXX: index size missing\n"); + else + pandecode_validate_buffer(primitive.indices, + primitive.index_count * size); + } else if (primitive.index_type) + pandecode_log("// XXX: unexpected index size\n"); #endif } static void pandecode_primitive_size(const void *s, bool constant) { - pan_unpack(s, PRIMITIVE_SIZE, ps); - if (ps.size_array == 0x0) - return; + pan_unpack(s, PRIMITIVE_SIZE, ps); + if (ps.size_array == 0x0) + return; - DUMP_UNPACKED(PRIMITIVE_SIZE, ps, "Primitive Size:\n") + DUMP_UNPACKED(PRIMITIVE_SIZE, ps, "Primitive Size:\n") } #if PAN_ARCH <= 7 static void pandecode_uniform_buffers(mali_ptr pubufs, int ubufs_count) { - uint64_t *PANDECODE_PTR_VAR(ubufs, pubufs); + uint64_t *PANDECODE_PTR_VAR(ubufs, pubufs); - for (int i = 0; i < ubufs_count; i++) { - mali_ptr addr = (ubufs[i] >> 10) << 2; - unsigned size = addr ? (((ubufs[i] & ((1 << 10) - 1)) + 1) * 16) : 0; + for (int i = 0; i < ubufs_count; i++) { + mali_ptr addr = (ubufs[i] >> 10) << 2; + unsigned size = addr ? (((ubufs[i] & ((1 << 10) - 1)) + 1) * 16) : 0; - pandecode_validate_buffer(addr, size); + pandecode_validate_buffer(addr, size); - char *ptr = pointer_as_memory_reference(addr); - pandecode_log("ubuf_%d[%u] = %s;\n", i, size, ptr); - free(ptr); - } + char *ptr = pointer_as_memory_reference(addr); + pandecode_log("ubuf_%d[%u] = %s;\n", i, size, ptr); + free(ptr); + } - pandecode_log("\n"); + pandecode_log("\n"); } static void pandecode_uniforms(mali_ptr uniforms, unsigned uniform_count) { - pandecode_validate_buffer(uniforms, uniform_count * 16); + pandecode_validate_buffer(uniforms, uniform_count * 16); - char *ptr = pointer_as_memory_reference(uniforms); - pandecode_log("vec4 uniforms[%u] = %s;\n", uniform_count, ptr); - free(ptr); - pandecode_log("\n"); + char *ptr = pointer_as_memory_reference(uniforms); + pandecode_log("vec4 uniforms[%u] = %s;\n", uniform_count, ptr); + free(ptr); + pandecode_log("\n"); } #endif static void pandecode_shader_disassemble(mali_ptr shader_ptr, int type, unsigned gpu_id) { - uint8_t *PANDECODE_PTR_VAR(code, shader_ptr); + uint8_t *PANDECODE_PTR_VAR(code, shader_ptr); - /* Compute maximum possible size */ - struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(shader_ptr); - size_t sz = mem->length - (shader_ptr - mem->gpu_va); + /* Compute maximum possible size */ + struct pandecode_mapped_memory *mem = + pandecode_find_mapped_gpu_mem_containing(shader_ptr); + size_t sz = mem->length - (shader_ptr - mem->gpu_va); - /* Print some boilerplate to clearly denote the assembly (which doesn't - * obey indentation rules), and actually do the disassembly! */ + /* Print some boilerplate to clearly denote the assembly (which doesn't + * obey indentation rules), and actually do the disassembly! 
*/ - pandecode_log_cont("\n\n"); + pandecode_log_cont("\n\n"); #if PAN_ARCH >= 9 - disassemble_valhall(pandecode_dump_stream, (const uint64_t *) code, sz, true); + disassemble_valhall(pandecode_dump_stream, (const uint64_t *)code, sz, true); #elif PAN_ARCH >= 6 && PAN_ARCH <= 7 - disassemble_bifrost(pandecode_dump_stream, code, sz, false); + disassemble_bifrost(pandecode_dump_stream, code, sz, false); #else - disassemble_midgard(pandecode_dump_stream, code, sz, gpu_id, true); + disassemble_midgard(pandecode_dump_stream, code, sz, gpu_id, true); #endif - pandecode_log_cont("\n\n"); + pandecode_log_cont("\n\n"); } #if PAN_ARCH <= 7 static void -pandecode_texture_payload(mali_ptr payload, - enum mali_texture_dimension dim, - enum mali_texture_layout layout, - bool manual_stride, - uint8_t levels, - uint16_t nr_samples, +pandecode_texture_payload(mali_ptr payload, enum mali_texture_dimension dim, + enum mali_texture_layout layout, bool manual_stride, + uint8_t levels, uint16_t nr_samples, uint16_t array_size) { - pandecode_log(".payload = {\n"); - pandecode_indent++; + pandecode_log(".payload = {\n"); + pandecode_indent++; - /* A bunch of bitmap pointers follow. - * We work out the correct number, - * based on the mipmap/cubemap - * properties, but dump extra - * possibilities to futureproof */ + /* A bunch of bitmap pointers follow. + * We work out the correct number, + * based on the mipmap/cubemap + * properties, but dump extra + * possibilities to futureproof */ - int bitmap_count = levels; + int bitmap_count = levels; - /* Miptree for each face */ - if (dim == MALI_TEXTURE_DIMENSION_CUBE) - bitmap_count *= 6; + /* Miptree for each face */ + if (dim == MALI_TEXTURE_DIMENSION_CUBE) + bitmap_count *= 6; - /* Array of layers */ - bitmap_count *= nr_samples; + /* Array of layers */ + bitmap_count *= nr_samples; - /* Array of textures */ - bitmap_count *= array_size; + /* Array of textures */ + bitmap_count *= array_size; - /* Stride for each element */ - if (manual_stride) - bitmap_count *= 2; + /* Stride for each element */ + if (manual_stride) + bitmap_count *= 2; - mali_ptr *pointers_and_strides = pandecode_fetch_gpu_mem(payload, - sizeof(mali_ptr) * bitmap_count); - for (int i = 0; i < bitmap_count; ++i) { - /* How we dump depends if this is a stride or a pointer */ + mali_ptr *pointers_and_strides = + pandecode_fetch_gpu_mem(payload, sizeof(mali_ptr) * bitmap_count); + for (int i = 0; i < bitmap_count; ++i) { + /* How we dump depends if this is a stride or a pointer */ - if (manual_stride && (i & 1)) { - /* signed 32-bit snuck in as a 64-bit pointer */ - uint64_t stride_set = pointers_and_strides[i]; - int32_t row_stride = stride_set; - int32_t surface_stride = stride_set >> 32; - pandecode_log("(mali_ptr) %d /* surface stride */ %d /* row stride */, \n", - surface_stride, row_stride); - } else { - char *a = pointer_as_memory_reference(pointers_and_strides[i]); - pandecode_log("%s, \n", a); - free(a); - } - } + if (manual_stride && (i & 1)) { + /* signed 32-bit snuck in as a 64-bit pointer */ + uint64_t stride_set = pointers_and_strides[i]; + int32_t row_stride = stride_set; + int32_t surface_stride = stride_set >> 32; + pandecode_log( + "(mali_ptr) %d /* surface stride */ %d /* row stride */, \n", + surface_stride, row_stride); + } else { + char *a = pointer_as_memory_reference(pointers_and_strides[i]); + pandecode_log("%s, \n", a); + free(a); + } + } - pandecode_indent--; - pandecode_log("},\n"); + pandecode_indent--; + pandecode_log("},\n"); } #endif @@ -572,45 +591,45 @@ 
pandecode_texture_payload(mali_ptr payload, static void pandecode_texture(mali_ptr u, unsigned tex) { - const uint8_t *cl = pandecode_fetch_gpu_mem(u, pan_size(TEXTURE)); + const uint8_t *cl = pandecode_fetch_gpu_mem(u, pan_size(TEXTURE)); - pan_unpack(cl, TEXTURE, temp); - DUMP_UNPACKED(TEXTURE, temp, "Texture:\n") + pan_unpack(cl, TEXTURE, temp); + DUMP_UNPACKED(TEXTURE, temp, "Texture:\n") - pandecode_indent++; - unsigned nr_samples = temp.dimension == MALI_TEXTURE_DIMENSION_3D ? - 1 : temp.sample_count; - pandecode_texture_payload(u + pan_size(TEXTURE), - temp.dimension, temp.texel_ordering, temp.manual_stride, - temp.levels, nr_samples, temp.array_size); - pandecode_indent--; + pandecode_indent++; + unsigned nr_samples = + temp.dimension == MALI_TEXTURE_DIMENSION_3D ? 1 : temp.sample_count; + pandecode_texture_payload(u + pan_size(TEXTURE), temp.dimension, + temp.texel_ordering, temp.manual_stride, + temp.levels, nr_samples, temp.array_size); + pandecode_indent--; } #else static void pandecode_texture(const void *cl, unsigned tex) { - pan_unpack(cl, TEXTURE, temp); - DUMP_UNPACKED(TEXTURE, temp, "Texture:\n") + pan_unpack(cl, TEXTURE, temp); + DUMP_UNPACKED(TEXTURE, temp, "Texture:\n") - pandecode_indent++; + pandecode_indent++; #if PAN_ARCH >= 9 - int plane_count = temp.levels * temp.array_size; + int plane_count = temp.levels * temp.array_size; - /* Miptree for each face */ - if (temp.dimension == MALI_TEXTURE_DIMENSION_CUBE) - plane_count *= 6; + /* Miptree for each face */ + if (temp.dimension == MALI_TEXTURE_DIMENSION_CUBE) + plane_count *= 6; - for (unsigned i = 0; i < plane_count; ++i) - DUMP_ADDR(PLANE, temp.surfaces + i * pan_size(PLANE), "Plane %u:\n", i); + for (unsigned i = 0; i < plane_count; ++i) + DUMP_ADDR(PLANE, temp.surfaces + i * pan_size(PLANE), "Plane %u:\n", i); #else - unsigned nr_samples = temp.dimension == MALI_TEXTURE_DIMENSION_3D ? - 1 : temp.sample_count; + unsigned nr_samples = + temp.dimension == MALI_TEXTURE_DIMENSION_3D ? 
1 : temp.sample_count; - pandecode_texture_payload(temp.surfaces, temp.dimension, temp.texel_ordering, - true, temp.levels, nr_samples, temp.array_size); + pandecode_texture_payload(temp.surfaces, temp.dimension, temp.texel_ordering, + true, temp.levels, nr_samples, temp.array_size); #endif - pandecode_indent--; + pandecode_indent--; } #endif @@ -618,49 +637,50 @@ pandecode_texture(const void *cl, unsigned tex) static void pandecode_textures(mali_ptr textures, unsigned texture_count) { - if (!textures) - return; + if (!textures) + return; - pandecode_log("Textures %"PRIx64":\n", textures); - pandecode_indent++; + pandecode_log("Textures %" PRIx64 ":\n", textures); + pandecode_indent++; #if PAN_ARCH >= 6 - const void *cl = pandecode_fetch_gpu_mem(textures, pan_size(TEXTURE) * - texture_count); + const void *cl = + pandecode_fetch_gpu_mem(textures, pan_size(TEXTURE) * texture_count); - for (unsigned tex = 0; tex < texture_count; ++tex) - pandecode_texture(cl + pan_size(TEXTURE) * tex, tex); + for (unsigned tex = 0; tex < texture_count; ++tex) + pandecode_texture(cl + pan_size(TEXTURE) * tex, tex); #else - mali_ptr *PANDECODE_PTR_VAR(u, textures); + mali_ptr *PANDECODE_PTR_VAR(u, textures); - for (int tex = 0; tex < texture_count; ++tex) { - mali_ptr *PANDECODE_PTR_VAR(u, textures + tex * sizeof(mali_ptr)); - char *a = pointer_as_memory_reference(*u); - pandecode_log("%s,\n", a); - free(a); - } + for (int tex = 0; tex < texture_count; ++tex) { + mali_ptr *PANDECODE_PTR_VAR(u, textures + tex * sizeof(mali_ptr)); + char *a = pointer_as_memory_reference(*u); + pandecode_log("%s,\n", a); + free(a); + } - /* Now, finally, descend down into the texture descriptor */ - for (unsigned tex = 0; tex < texture_count; ++tex) { - mali_ptr *PANDECODE_PTR_VAR(u, textures + tex * sizeof(mali_ptr)); - pandecode_texture(*u, tex); - } + /* Now, finally, descend down into the texture descriptor */ + for (unsigned tex = 0; tex < texture_count; ++tex) { + mali_ptr *PANDECODE_PTR_VAR(u, textures + tex * sizeof(mali_ptr)); + pandecode_texture(*u, tex); + } #endif - pandecode_indent--; - pandecode_log("\n"); + pandecode_indent--; + pandecode_log("\n"); } static void pandecode_samplers(mali_ptr samplers, unsigned sampler_count) { - pandecode_log("Samplers %"PRIx64":\n", samplers); - pandecode_indent++; + pandecode_log("Samplers %" PRIx64 ":\n", samplers); + pandecode_indent++; - for (int i = 0; i < sampler_count; ++i) - DUMP_ADDR(SAMPLER, samplers + (pan_size(SAMPLER) * i), "Sampler %d:\n", i); + for (int i = 0; i < sampler_count; ++i) + DUMP_ADDR(SAMPLER, samplers + (pan_size(SAMPLER) * i), "Sampler %d:\n", + i); - pandecode_indent--; - pandecode_log("\n"); + pandecode_indent--; + pandecode_log("\n"); } static void @@ -668,144 +688,147 @@ pandecode_dcd(const struct MALI_DRAW *p, enum mali_job_type job_type, unsigned gpu_id) { #if PAN_ARCH >= 5 - struct pandecode_fbd fbd_info = { - .rt_count = 1 - }; + struct pandecode_fbd fbd_info = {.rt_count = 1}; #endif - if (PAN_ARCH >= 6 || (PAN_ARCH == 5 && job_type != MALI_JOB_TYPE_TILER)) { + if (PAN_ARCH >= 6 || (PAN_ARCH == 5 && job_type != MALI_JOB_TYPE_TILER)) { #if PAN_ARCH >= 5 - pandecode_local_storage(p->thread_storage & ~1); + pandecode_local_storage(p->thread_storage & ~1); #endif - } else { + } else { #if PAN_ARCH <= 5 - pandecode_fbd(p->fbd, false, gpu_id); + pandecode_fbd(p->fbd, false, gpu_id); #endif - } + } - int varying_count = 0, attribute_count = 0, uniform_count = 0, uniform_buffer_count = 0; - int texture_count = 0, sampler_count = 0; + int varying_count = 
0, attribute_count = 0, uniform_count = 0, + uniform_buffer_count = 0; + int texture_count = 0, sampler_count = 0; - if (p->state) { - uint32_t *cl = pandecode_fetch_gpu_mem(p->state, pan_size(RENDERER_STATE)); + if (p->state) { + uint32_t *cl = + pandecode_fetch_gpu_mem(p->state, pan_size(RENDERER_STATE)); - pan_unpack(cl, RENDERER_STATE, state); + pan_unpack(cl, RENDERER_STATE, state); - if (state.shader.shader & ~0xF) - pandecode_shader_disassemble(state.shader.shader & ~0xF, job_type, gpu_id); + if (state.shader.shader & ~0xF) + pandecode_shader_disassemble(state.shader.shader & ~0xF, job_type, + gpu_id); #if PAN_ARCH >= 6 - bool idvs = (job_type == MALI_JOB_TYPE_INDEXED_VERTEX); + bool idvs = (job_type == MALI_JOB_TYPE_INDEXED_VERTEX); - if (idvs && state.secondary_shader) - pandecode_shader_disassemble(state.secondary_shader, job_type, gpu_id); + if (idvs && state.secondary_shader) + pandecode_shader_disassemble(state.secondary_shader, job_type, gpu_id); #endif - DUMP_UNPACKED(RENDERER_STATE, state, "State:\n"); - pandecode_indent++; + DUMP_UNPACKED(RENDERER_STATE, state, "State:\n"); + pandecode_indent++; - /* Save for dumps */ - attribute_count = state.shader.attribute_count; - varying_count = state.shader.varying_count; - texture_count = state.shader.texture_count; - sampler_count = state.shader.sampler_count; - uniform_buffer_count = state.properties.uniform_buffer_count; + /* Save for dumps */ + attribute_count = state.shader.attribute_count; + varying_count = state.shader.varying_count; + texture_count = state.shader.texture_count; + sampler_count = state.shader.sampler_count; + uniform_buffer_count = state.properties.uniform_buffer_count; #if PAN_ARCH >= 6 - uniform_count = state.preload.uniform_count; + uniform_count = state.preload.uniform_count; #else - uniform_count = state.properties.uniform_count; + uniform_count = state.properties.uniform_count; #endif #if PAN_ARCH == 4 - mali_ptr shader = state.blend_shader & ~0xF; - if (state.multisample_misc.blend_shader && shader) - pandecode_shader_disassemble(shader, job_type, gpu_id); + mali_ptr shader = state.blend_shader & ~0xF; + if (state.multisample_misc.blend_shader && shader) + pandecode_shader_disassemble(shader, job_type, gpu_id); #endif - pandecode_indent--; - pandecode_log("\n"); + pandecode_indent--; + pandecode_log("\n"); - /* MRT blend fields are used whenever MFBD is used, with - * per-RT descriptors */ + /* MRT blend fields are used whenever MFBD is used, with + * per-RT descriptors */ #if PAN_ARCH >= 5 - if ((job_type == MALI_JOB_TYPE_TILER || job_type == MALI_JOB_TYPE_FRAGMENT) && - (PAN_ARCH >= 6 || p->thread_storage & MALI_FBD_TAG_IS_MFBD)) { - void* blend_base = ((void *) cl) + pan_size(RENDERER_STATE); + if ((job_type == MALI_JOB_TYPE_TILER || + job_type == MALI_JOB_TYPE_FRAGMENT) && + (PAN_ARCH >= 6 || p->thread_storage & MALI_FBD_TAG_IS_MFBD)) { + void *blend_base = ((void *)cl) + pan_size(RENDERER_STATE); - for (unsigned i = 0; i < fbd_info.rt_count; i++) { - mali_ptr shader = - pandecode_blend(blend_base, i, - state.shader.shader); - if (shader & ~0xF) - pandecode_shader_disassemble(shader, job_type, - gpu_id); - } - } + for (unsigned i = 0; i < fbd_info.rt_count; i++) { + mali_ptr shader = + pandecode_blend(blend_base, i, state.shader.shader); + if (shader & ~0xF) + pandecode_shader_disassemble(shader, job_type, gpu_id); + } + } #endif - } else - pandecode_log("// XXX: missing shader descriptor\n"); + } else + pandecode_log("// XXX: missing shader descriptor\n"); - if (p->viewport) { - 
DUMP_ADDR(VIEWPORT, p->viewport, "Viewport:\n"); - pandecode_log("\n"); - } + if (p->viewport) { + DUMP_ADDR(VIEWPORT, p->viewport, "Viewport:\n"); + pandecode_log("\n"); + } - unsigned max_attr_index = 0; + unsigned max_attr_index = 0; - if (p->attributes) - max_attr_index = pandecode_attribute_meta(attribute_count, p->attributes, false); + if (p->attributes) + max_attr_index = + pandecode_attribute_meta(attribute_count, p->attributes, false); - if (p->attribute_buffers) - pandecode_attributes(p->attribute_buffers, max_attr_index, false, job_type); + if (p->attribute_buffers) + pandecode_attributes(p->attribute_buffers, max_attr_index, false, + job_type); - if (p->varyings) { - varying_count = pandecode_attribute_meta(varying_count, p->varyings, true); - } + if (p->varyings) { + varying_count = + pandecode_attribute_meta(varying_count, p->varyings, true); + } - if (p->varying_buffers) - pandecode_attributes(p->varying_buffers, varying_count, true, job_type); + if (p->varying_buffers) + pandecode_attributes(p->varying_buffers, varying_count, true, job_type); - if (p->uniform_buffers) { - if (uniform_buffer_count) - pandecode_uniform_buffers(p->uniform_buffers, uniform_buffer_count); - else - pandecode_log("// warn: UBOs specified but not referenced\n"); - } else if (uniform_buffer_count) - pandecode_log("// XXX: UBOs referenced but not specified\n"); + if (p->uniform_buffers) { + if (uniform_buffer_count) + pandecode_uniform_buffers(p->uniform_buffers, uniform_buffer_count); + else + pandecode_log("// warn: UBOs specified but not referenced\n"); + } else if (uniform_buffer_count) + pandecode_log("// XXX: UBOs referenced but not specified\n"); - /* We don't want to actually dump uniforms, but we do need to validate - * that the counts we were given are sane */ + /* We don't want to actually dump uniforms, but we do need to validate + * that the counts we were given are sane */ - if (p->push_uniforms) { - if (uniform_count) - pandecode_uniforms(p->push_uniforms, uniform_count); - else - pandecode_log("// warn: Uniforms specified but not referenced\n"); - } else if (uniform_count) - pandecode_log("// XXX: Uniforms referenced but not specified\n"); + if (p->push_uniforms) { + if (uniform_count) + pandecode_uniforms(p->push_uniforms, uniform_count); + else + pandecode_log("// warn: Uniforms specified but not referenced\n"); + } else if (uniform_count) + pandecode_log("// XXX: Uniforms referenced but not specified\n"); - if (p->textures) - pandecode_textures(p->textures, texture_count); + if (p->textures) + pandecode_textures(p->textures, texture_count); - if (p->samplers) - pandecode_samplers(p->samplers, sampler_count); + if (p->samplers) + pandecode_samplers(p->samplers, sampler_count); } static void pandecode_vertex_compute_geometry_job(const struct MALI_JOB_HEADER *h, mali_ptr job, unsigned gpu_id) { - struct mali_compute_job_packed *PANDECODE_PTR_VAR(p, job); - pan_section_unpack(p, COMPUTE_JOB, DRAW, draw); - pandecode_dcd(&draw, h->type, gpu_id); + struct mali_compute_job_packed *PANDECODE_PTR_VAR(p, job); + pan_section_unpack(p, COMPUTE_JOB, DRAW, draw); + pandecode_dcd(&draw, h->type, gpu_id); - pandecode_log("Vertex Job Payload:\n"); - pandecode_indent++; - pandecode_invocation(pan_section_ptr(p, COMPUTE_JOB, INVOCATION)); - DUMP_SECTION(COMPUTE_JOB, PARAMETERS, p, "Vertex Job Parameters:\n"); - DUMP_UNPACKED(DRAW, draw, "Draw:\n"); - pandecode_indent--; - pandecode_log("\n"); + pandecode_log("Vertex Job Payload:\n"); + pandecode_indent++; + pandecode_invocation(pan_section_ptr(p, 
COMPUTE_JOB, INVOCATION)); + DUMP_SECTION(COMPUTE_JOB, PARAMETERS, p, "Vertex Job Parameters:\n"); + DUMP_UNPACKED(DRAW, draw, "Draw:\n"); + pandecode_indent--; + pandecode_log("\n"); } #endif @@ -813,318 +836,324 @@ pandecode_vertex_compute_geometry_job(const struct MALI_JOB_HEADER *h, static void pandecode_tiler(mali_ptr gpu_va) { - pan_unpack(PANDECODE_PTR(gpu_va, void), TILER_CONTEXT, t); + pan_unpack(PANDECODE_PTR(gpu_va, void), TILER_CONTEXT, t); - if (t.heap) { - pan_unpack(PANDECODE_PTR(t.heap, void), TILER_HEAP, h); - DUMP_UNPACKED(TILER_HEAP, h, "Tiler Heap:\n"); - } + if (t.heap) { + pan_unpack(PANDECODE_PTR(t.heap, void), TILER_HEAP, h); + DUMP_UNPACKED(TILER_HEAP, h, "Tiler Heap:\n"); + } - DUMP_UNPACKED(TILER_CONTEXT, t, "Tiler:\n"); + DUMP_UNPACKED(TILER_CONTEXT, t, "Tiler:\n"); } #if PAN_ARCH <= 7 static void -pandecode_indexed_vertex_job(const struct MALI_JOB_HEADER *h, - mali_ptr job, unsigned gpu_id) +pandecode_indexed_vertex_job(const struct MALI_JOB_HEADER *h, mali_ptr job, + unsigned gpu_id) { - struct mali_indexed_vertex_job_packed *PANDECODE_PTR_VAR(p, job); + struct mali_indexed_vertex_job_packed *PANDECODE_PTR_VAR(p, job); - pandecode_log("Vertex:\n"); - pan_section_unpack(p, INDEXED_VERTEX_JOB, VERTEX_DRAW, vert_draw); - pandecode_dcd(&vert_draw, h->type, gpu_id); - DUMP_UNPACKED(DRAW, vert_draw, "Vertex Draw:\n"); + pandecode_log("Vertex:\n"); + pan_section_unpack(p, INDEXED_VERTEX_JOB, VERTEX_DRAW, vert_draw); + pandecode_dcd(&vert_draw, h->type, gpu_id); + DUMP_UNPACKED(DRAW, vert_draw, "Vertex Draw:\n"); - pandecode_log("Fragment:\n"); - pan_section_unpack(p, INDEXED_VERTEX_JOB, FRAGMENT_DRAW, frag_draw); - pandecode_dcd(&frag_draw, MALI_JOB_TYPE_FRAGMENT, gpu_id); - DUMP_UNPACKED(DRAW, frag_draw, "Fragment Draw:\n"); + pandecode_log("Fragment:\n"); + pan_section_unpack(p, INDEXED_VERTEX_JOB, FRAGMENT_DRAW, frag_draw); + pandecode_dcd(&frag_draw, MALI_JOB_TYPE_FRAGMENT, gpu_id); + DUMP_UNPACKED(DRAW, frag_draw, "Fragment Draw:\n"); - pan_section_unpack(p, INDEXED_VERTEX_JOB, TILER, tiler_ptr); - pandecode_log("Tiler Job Payload:\n"); - pandecode_indent++; - pandecode_tiler(tiler_ptr.address); - pandecode_indent--; + pan_section_unpack(p, INDEXED_VERTEX_JOB, TILER, tiler_ptr); + pandecode_log("Tiler Job Payload:\n"); + pandecode_indent++; + pandecode_tiler(tiler_ptr.address); + pandecode_indent--; - pandecode_invocation(pan_section_ptr(p, INDEXED_VERTEX_JOB, INVOCATION)); - pandecode_primitive(pan_section_ptr(p, INDEXED_VERTEX_JOB, PRIMITIVE)); + pandecode_invocation(pan_section_ptr(p, INDEXED_VERTEX_JOB, INVOCATION)); + pandecode_primitive(pan_section_ptr(p, INDEXED_VERTEX_JOB, PRIMITIVE)); - /* TODO: gl_PointSize on Bifrost */ - pandecode_primitive_size(pan_section_ptr(p, INDEXED_VERTEX_JOB, PRIMITIVE_SIZE), true); + /* TODO: gl_PointSize on Bifrost */ + pandecode_primitive_size( + pan_section_ptr(p, INDEXED_VERTEX_JOB, PRIMITIVE_SIZE), true); - pan_section_unpack(p, INDEXED_VERTEX_JOB, PADDING, padding); + pan_section_unpack(p, INDEXED_VERTEX_JOB, PADDING, padding); } #endif #endif static void -pandecode_tiler_job(const struct MALI_JOB_HEADER *h, - mali_ptr job, unsigned gpu_id) +pandecode_tiler_job(const struct MALI_JOB_HEADER *h, mali_ptr job, + unsigned gpu_id) { - struct mali_tiler_job_packed *PANDECODE_PTR_VAR(p, job); - pan_section_unpack(p, TILER_JOB, DRAW, draw); - pandecode_dcd(&draw, h->type, gpu_id); - pandecode_log("Tiler Job Payload:\n"); - pandecode_indent++; + struct mali_tiler_job_packed *PANDECODE_PTR_VAR(p, job); + pan_section_unpack(p, 
TILER_JOB, DRAW, draw); + pandecode_dcd(&draw, h->type, gpu_id); + pandecode_log("Tiler Job Payload:\n"); + pandecode_indent++; #if PAN_ARCH <= 7 - pandecode_invocation(pan_section_ptr(p, TILER_JOB, INVOCATION)); + pandecode_invocation(pan_section_ptr(p, TILER_JOB, INVOCATION)); #endif - pandecode_primitive(pan_section_ptr(p, TILER_JOB, PRIMITIVE)); - DUMP_UNPACKED(DRAW, draw, "Draw:\n"); + pandecode_primitive(pan_section_ptr(p, TILER_JOB, PRIMITIVE)); + DUMP_UNPACKED(DRAW, draw, "Draw:\n"); #if PAN_ARCH >= 6 - pan_section_unpack(p, TILER_JOB, TILER, tiler_ptr); - pandecode_tiler(tiler_ptr.address); + pan_section_unpack(p, TILER_JOB, TILER, tiler_ptr); + pandecode_tiler(tiler_ptr.address); - /* TODO: gl_PointSize on Bifrost */ - pandecode_primitive_size(pan_section_ptr(p, TILER_JOB, PRIMITIVE_SIZE), true); + /* TODO: gl_PointSize on Bifrost */ + pandecode_primitive_size(pan_section_ptr(p, TILER_JOB, PRIMITIVE_SIZE), + true); #if PAN_ARCH >= 9 - DUMP_SECTION(TILER_JOB, INSTANCE_COUNT, p, "Instance count:\n"); - DUMP_SECTION(TILER_JOB, VERTEX_COUNT, p, "Vertex count:\n"); - DUMP_SECTION(TILER_JOB, SCISSOR, p, "Scissor:\n"); - DUMP_SECTION(TILER_JOB, INDICES, p, "Indices:\n"); + DUMP_SECTION(TILER_JOB, INSTANCE_COUNT, p, "Instance count:\n"); + DUMP_SECTION(TILER_JOB, VERTEX_COUNT, p, "Vertex count:\n"); + DUMP_SECTION(TILER_JOB, SCISSOR, p, "Scissor:\n"); + DUMP_SECTION(TILER_JOB, INDICES, p, "Indices:\n"); #else - pan_section_unpack(p, TILER_JOB, PADDING, padding); + pan_section_unpack(p, TILER_JOB, PADDING, padding); #endif #else - pan_section_unpack(p, TILER_JOB, PRIMITIVE, primitive); - pandecode_primitive_size(pan_section_ptr(p, TILER_JOB, PRIMITIVE_SIZE), - primitive.point_size_array_format == MALI_POINT_SIZE_ARRAY_FORMAT_NONE); + pan_section_unpack(p, TILER_JOB, PRIMITIVE, primitive); + pandecode_primitive_size( + pan_section_ptr(p, TILER_JOB, PRIMITIVE_SIZE), + primitive.point_size_array_format == MALI_POINT_SIZE_ARRAY_FORMAT_NONE); #endif - pandecode_indent--; - pandecode_log("\n"); + pandecode_indent--; + pandecode_log("\n"); } static void pandecode_fragment_job(mali_ptr job, unsigned gpu_id) { - struct mali_fragment_job_packed *PANDECODE_PTR_VAR(p, job); - pan_section_unpack(p, FRAGMENT_JOB, PAYLOAD, s); + struct mali_fragment_job_packed *PANDECODE_PTR_VAR(p, job); + pan_section_unpack(p, FRAGMENT_JOB, PAYLOAD, s); - UNUSED struct pandecode_fbd info = pandecode_fbd(s.framebuffer, true, gpu_id); + UNUSED struct pandecode_fbd info = + pandecode_fbd(s.framebuffer, true, gpu_id); #if PAN_ARCH >= 5 - unsigned expected_tag = 0; + unsigned expected_tag = 0; - /* Compute the tag for the tagged pointer. This contains the type of - * FBD (MFBD/SFBD), and in the case of an MFBD, information about which - * additional structures follow the MFBD header (an extra payload or - * not, as well as a count of render targets) */ + /* Compute the tag for the tagged pointer. 
This contains the type of + * FBD (MFBD/SFBD), and in the case of an MFBD, information about which + * additional structures follow the MFBD header (an extra payload or + * not, as well as a count of render targets) */ - expected_tag = MALI_FBD_TAG_IS_MFBD; - if (info.has_extra) - expected_tag |= MALI_FBD_TAG_HAS_ZS_RT; + expected_tag = MALI_FBD_TAG_IS_MFBD; + if (info.has_extra) + expected_tag |= MALI_FBD_TAG_HAS_ZS_RT; - expected_tag |= MALI_FBD_TAG_IS_MFBD | (MALI_POSITIVE(info.rt_count) << 2); + expected_tag |= MALI_FBD_TAG_IS_MFBD | (MALI_POSITIVE(info.rt_count) << 2); #endif - DUMP_UNPACKED(FRAGMENT_JOB_PAYLOAD, s, "Fragment Job Payload:\n"); + DUMP_UNPACKED(FRAGMENT_JOB_PAYLOAD, s, "Fragment Job Payload:\n"); #if PAN_ARCH >= 5 - /* The FBD is a tagged pointer */ + /* The FBD is a tagged pointer */ - unsigned tag = (s.framebuffer & MALI_FBD_TAG_MASK); + unsigned tag = (s.framebuffer & MALI_FBD_TAG_MASK); - if (tag != expected_tag) - pandecode_log("// XXX: expected FBD tag %X but got %X\n", expected_tag, tag); + if (tag != expected_tag) + pandecode_log("// XXX: expected FBD tag %X but got %X\n", expected_tag, + tag); #endif - pandecode_log("\n"); + pandecode_log("\n"); } static void pandecode_write_value_job(mali_ptr job) { - struct mali_write_value_job_packed *PANDECODE_PTR_VAR(p, job); - pan_section_unpack(p, WRITE_VALUE_JOB, PAYLOAD, u); - DUMP_SECTION(WRITE_VALUE_JOB, PAYLOAD, p, "Write Value Payload:\n"); - pandecode_log("\n"); + struct mali_write_value_job_packed *PANDECODE_PTR_VAR(p, job); + pan_section_unpack(p, WRITE_VALUE_JOB, PAYLOAD, u); + DUMP_SECTION(WRITE_VALUE_JOB, PAYLOAD, p, "Write Value Payload:\n"); + pandecode_log("\n"); } static void pandecode_cache_flush_job(mali_ptr job) { - struct mali_cache_flush_job_packed *PANDECODE_PTR_VAR(p, job); - pan_section_unpack(p, CACHE_FLUSH_JOB, PAYLOAD, u); - DUMP_SECTION(CACHE_FLUSH_JOB, PAYLOAD, p, "Cache Flush Payload:\n"); - pandecode_log("\n"); + struct mali_cache_flush_job_packed *PANDECODE_PTR_VAR(p, job); + pan_section_unpack(p, CACHE_FLUSH_JOB, PAYLOAD, u); + DUMP_SECTION(CACHE_FLUSH_JOB, PAYLOAD, p, "Cache Flush Payload:\n"); + pandecode_log("\n"); } #if PAN_ARCH >= 9 static void dump_fau(mali_ptr addr, unsigned count, const char *name) { - const uint32_t *PANDECODE_PTR_VAR(raw, addr); + const uint32_t *PANDECODE_PTR_VAR(raw, addr); - pandecode_validate_buffer(addr, count * 8); + pandecode_validate_buffer(addr, count * 8); - fprintf(pandecode_dump_stream, "%s:\n", name); - for (unsigned i = 0; i < count; ++i) { - fprintf(pandecode_dump_stream, " %08X %08X\n", - raw[2*i], raw[2*i + 1]); - } - fprintf(pandecode_dump_stream, "\n"); + fprintf(pandecode_dump_stream, "%s:\n", name); + for (unsigned i = 0; i < count; ++i) { + fprintf(pandecode_dump_stream, " %08X %08X\n", raw[2 * i], + raw[2 * i + 1]); + } + fprintf(pandecode_dump_stream, "\n"); } static mali_ptr pandecode_shader(mali_ptr addr, const char *label, unsigned gpu_id) { - MAP_ADDR(SHADER_PROGRAM, addr, cl); - pan_unpack(cl, SHADER_PROGRAM, desc); + MAP_ADDR(SHADER_PROGRAM, addr, cl); + pan_unpack(cl, SHADER_PROGRAM, desc); - assert(desc.type == 8); + assert(desc.type == 8); - DUMP_UNPACKED(SHADER_PROGRAM, desc, "%s Shader:\n", label); - pandecode_shader_disassemble(desc.binary, 0, gpu_id); - return desc.binary; + DUMP_UNPACKED(SHADER_PROGRAM, desc, "%s Shader:\n", label); + pandecode_shader_disassemble(desc.binary, 0, gpu_id); + return desc.binary; } static void pandecode_resources(mali_ptr addr, unsigned size) { - const uint8_t *cl = pandecode_fetch_gpu_mem(addr, 
size); - assert((size % 0x20) == 0); + const uint8_t *cl = pandecode_fetch_gpu_mem(addr, size); + assert((size % 0x20) == 0); - for (unsigned i = 0; i < size; i += 0x20) { - unsigned type = (cl[i] & 0xF); + for (unsigned i = 0; i < size; i += 0x20) { + unsigned type = (cl[i] & 0xF); - switch (type) { - case MALI_DESCRIPTOR_TYPE_SAMPLER: - DUMP_CL(SAMPLER, cl + i, "Sampler:\n"); - break; - case MALI_DESCRIPTOR_TYPE_TEXTURE: - pandecode_texture(cl + i, i); - break; - case MALI_DESCRIPTOR_TYPE_ATTRIBUTE: - DUMP_CL(ATTRIBUTE, cl + i, "Attribute:\n"); - break; - case MALI_DESCRIPTOR_TYPE_BUFFER: - DUMP_CL(BUFFER, cl + i, "Buffer:\n"); - break; - default: - fprintf(pandecode_dump_stream, "Unknown descriptor type %X\n", type); - break; - } - } + switch (type) { + case MALI_DESCRIPTOR_TYPE_SAMPLER: + DUMP_CL(SAMPLER, cl + i, "Sampler:\n"); + break; + case MALI_DESCRIPTOR_TYPE_TEXTURE: + pandecode_texture(cl + i, i); + break; + case MALI_DESCRIPTOR_TYPE_ATTRIBUTE: + DUMP_CL(ATTRIBUTE, cl + i, "Attribute:\n"); + break; + case MALI_DESCRIPTOR_TYPE_BUFFER: + DUMP_CL(BUFFER, cl + i, "Buffer:\n"); + break; + default: + fprintf(pandecode_dump_stream, "Unknown descriptor type %X\n", type); + break; + } + } } static void pandecode_resource_tables(mali_ptr addr, const char *label) { - unsigned count = addr & 0x3F; - addr = addr & ~0x3F; + unsigned count = addr & 0x3F; + addr = addr & ~0x3F; - const uint8_t *cl = pandecode_fetch_gpu_mem(addr, MALI_RESOURCE_LENGTH * count); + const uint8_t *cl = + pandecode_fetch_gpu_mem(addr, MALI_RESOURCE_LENGTH * count); - for (unsigned i = 0; i < count; ++i) { - pan_unpack(cl + i * MALI_RESOURCE_LENGTH, RESOURCE, entry); - DUMP_UNPACKED(RESOURCE, entry, "Entry %u:\n", i); + for (unsigned i = 0; i < count; ++i) { + pan_unpack(cl + i * MALI_RESOURCE_LENGTH, RESOURCE, entry); + DUMP_UNPACKED(RESOURCE, entry, "Entry %u:\n", i); - pandecode_indent += 2; - if (entry.address) - pandecode_resources(entry.address, entry.size); - pandecode_indent -= 2; - } + pandecode_indent += 2; + if (entry.address) + pandecode_resources(entry.address, entry.size); + pandecode_indent -= 2; + } } static void pandecode_depth_stencil(mali_ptr addr) { - MAP_ADDR(DEPTH_STENCIL, addr, cl); - pan_unpack(cl, DEPTH_STENCIL, desc); - DUMP_UNPACKED(DEPTH_STENCIL, desc, "Depth/stencil"); + MAP_ADDR(DEPTH_STENCIL, addr, cl); + pan_unpack(cl, DEPTH_STENCIL, desc); + DUMP_UNPACKED(DEPTH_STENCIL, desc, "Depth/stencil"); } static void pandecode_shader_environment(const struct MALI_SHADER_ENVIRONMENT *p, unsigned gpu_id) { - if (p->shader) - pandecode_shader(p->shader, "Shader", gpu_id); + if (p->shader) + pandecode_shader(p->shader, "Shader", gpu_id); - if (p->resources) - pandecode_resource_tables(p->resources, "Resources"); + if (p->resources) + pandecode_resource_tables(p->resources, "Resources"); - if (p->thread_storage) - pandecode_local_storage(p->thread_storage); + if (p->thread_storage) + pandecode_local_storage(p->thread_storage); - if (p->fau) - dump_fau(p->fau, p->fau_count, "FAU"); + if (p->fau) + dump_fau(p->fau, p->fau_count, "FAU"); } static void pandecode_dcd(const struct MALI_DRAW *p, enum mali_job_type job_type, unsigned gpu_id) { - mali_ptr frag_shader = 0; + mali_ptr frag_shader = 0; - pandecode_depth_stencil(p->depth_stencil); + pandecode_depth_stencil(p->depth_stencil); - for (unsigned i = 0; i < p->blend_count; ++i) { - struct mali_blend_packed *PANDECODE_PTR_VAR(blend_descs, p->blend); + for (unsigned i = 0; i < p->blend_count; ++i) { + struct mali_blend_packed 
*PANDECODE_PTR_VAR(blend_descs, p->blend); - mali_ptr blend_shader = pandecode_blend(blend_descs, i, frag_shader); - if (blend_shader) { - fprintf(pandecode_dump_stream, "Blend shader %u", i); - pandecode_shader_disassemble(blend_shader, 0, gpu_id); - } - } + mali_ptr blend_shader = pandecode_blend(blend_descs, i, frag_shader); + if (blend_shader) { + fprintf(pandecode_dump_stream, "Blend shader %u", i); + pandecode_shader_disassemble(blend_shader, 0, gpu_id); + } + } - pandecode_shader_environment(&p->shader, gpu_id); - DUMP_UNPACKED(DRAW, *p, "Draw:\n"); + pandecode_shader_environment(&p->shader, gpu_id); + DUMP_UNPACKED(DRAW, *p, "Draw:\n"); } static void pandecode_malloc_vertex_job(mali_ptr job, unsigned gpu_id) { - struct mali_malloc_vertex_job_packed *PANDECODE_PTR_VAR(p, job); + struct mali_malloc_vertex_job_packed *PANDECODE_PTR_VAR(p, job); - DUMP_SECTION(MALLOC_VERTEX_JOB, PRIMITIVE, p, "Primitive:\n"); - DUMP_SECTION(MALLOC_VERTEX_JOB, INSTANCE_COUNT, p, "Instance count:\n"); - DUMP_SECTION(MALLOC_VERTEX_JOB, ALLOCATION, p, "Allocation:\n"); - DUMP_SECTION(MALLOC_VERTEX_JOB, TILER, p, "Tiler:\n"); - DUMP_SECTION(MALLOC_VERTEX_JOB, SCISSOR, p, "Scissor:\n"); - DUMP_SECTION(MALLOC_VERTEX_JOB, PRIMITIVE_SIZE, p, "Primitive Size:\n"); - DUMP_SECTION(MALLOC_VERTEX_JOB, INDICES, p, "Indices:\n"); + DUMP_SECTION(MALLOC_VERTEX_JOB, PRIMITIVE, p, "Primitive:\n"); + DUMP_SECTION(MALLOC_VERTEX_JOB, INSTANCE_COUNT, p, "Instance count:\n"); + DUMP_SECTION(MALLOC_VERTEX_JOB, ALLOCATION, p, "Allocation:\n"); + DUMP_SECTION(MALLOC_VERTEX_JOB, TILER, p, "Tiler:\n"); + DUMP_SECTION(MALLOC_VERTEX_JOB, SCISSOR, p, "Scissor:\n"); + DUMP_SECTION(MALLOC_VERTEX_JOB, PRIMITIVE_SIZE, p, "Primitive Size:\n"); + DUMP_SECTION(MALLOC_VERTEX_JOB, INDICES, p, "Indices:\n"); - pan_section_unpack(p, MALLOC_VERTEX_JOB, DRAW, dcd); + pan_section_unpack(p, MALLOC_VERTEX_JOB, DRAW, dcd); - pan_section_unpack(p, MALLOC_VERTEX_JOB, TILER, tiler_ptr); - pandecode_log("Tiler Job Payload:\n"); - pandecode_indent++; - if (tiler_ptr.address) - pandecode_tiler(tiler_ptr.address); - else - pandecode_log("\n"); - pandecode_indent--; + pan_section_unpack(p, MALLOC_VERTEX_JOB, TILER, tiler_ptr); + pandecode_log("Tiler Job Payload:\n"); + pandecode_indent++; + if (tiler_ptr.address) + pandecode_tiler(tiler_ptr.address); + else + pandecode_log("\n"); + pandecode_indent--; - pandecode_dcd(&dcd, 0, gpu_id); + pandecode_dcd(&dcd, 0, gpu_id); - pan_section_unpack(p, MALLOC_VERTEX_JOB, POSITION, position); - pan_section_unpack(p, MALLOC_VERTEX_JOB, VARYING, varying); - pandecode_shader_environment(&position, gpu_id); - pandecode_shader_environment(&varying, gpu_id); + pan_section_unpack(p, MALLOC_VERTEX_JOB, POSITION, position); + pan_section_unpack(p, MALLOC_VERTEX_JOB, VARYING, varying); + pandecode_shader_environment(&position, gpu_id); + pandecode_shader_environment(&varying, gpu_id); } static void pandecode_compute_job(mali_ptr job, unsigned gpu_id) { - struct mali_compute_job_packed *PANDECODE_PTR_VAR(p, job); - pan_section_unpack(p, COMPUTE_JOB, PAYLOAD, payload); + struct mali_compute_job_packed *PANDECODE_PTR_VAR(p, job); + pan_section_unpack(p, COMPUTE_JOB, PAYLOAD, payload); - pandecode_shader(payload.compute.shader, "Shader", gpu_id); - if (payload.compute.thread_storage) - pandecode_local_storage(payload.compute.thread_storage); - if (payload.compute.fau) - dump_fau(payload.compute.fau, payload.compute.fau_count, "FAU"); - if (payload.compute.resources) - pandecode_resource_tables(payload.compute.resources, "Resources"); 
+ pandecode_shader(payload.compute.shader, "Shader", gpu_id); + if (payload.compute.thread_storage) + pandecode_local_storage(payload.compute.thread_storage); + if (payload.compute.fau) + dump_fau(payload.compute.fau, payload.compute.fau_count, "FAU"); + if (payload.compute.resources) + pandecode_resource_tables(payload.compute.resources, "Resources"); - DUMP_UNPACKED(COMPUTE_PAYLOAD, payload, "Compute:\n"); + DUMP_UNPACKED(COMPUTE_PAYLOAD, payload, "Compute:\n"); } #endif @@ -1136,99 +1165,99 @@ pandecode_compute_job(mali_ptr job, unsigned gpu_id) void GENX(pandecode_jc)(mali_ptr jc_gpu_va, unsigned gpu_id) { - pandecode_dump_file_open(); + pandecode_dump_file_open(); - struct set *va_set = _mesa_pointer_set_create(NULL); - struct set_entry *entry = NULL; + struct set *va_set = _mesa_pointer_set_create(NULL); + struct set_entry *entry = NULL; - mali_ptr next_job = 0; + mali_ptr next_job = 0; - do { - struct mali_job_header_packed *hdr = - PANDECODE_PTR(jc_gpu_va, struct mali_job_header_packed); + do { + struct mali_job_header_packed *hdr = + PANDECODE_PTR(jc_gpu_va, struct mali_job_header_packed); - entry = _mesa_set_search(va_set, hdr); - if (entry != NULL) { - fprintf(stdout, "Job list has a cycle\n"); - break; - } + entry = _mesa_set_search(va_set, hdr); + if (entry != NULL) { + fprintf(stdout, "Job list has a cycle\n"); + break; + } - pan_unpack(hdr, JOB_HEADER, h); - next_job = h.next; + pan_unpack(hdr, JOB_HEADER, h); + next_job = h.next; - DUMP_UNPACKED(JOB_HEADER, h, "Job Header (%" PRIx64 "):\n", jc_gpu_va); - pandecode_log("\n"); + DUMP_UNPACKED(JOB_HEADER, h, "Job Header (%" PRIx64 "):\n", jc_gpu_va); + pandecode_log("\n"); - switch (h.type) { - case MALI_JOB_TYPE_WRITE_VALUE: - pandecode_write_value_job(jc_gpu_va); - break; + switch (h.type) { + case MALI_JOB_TYPE_WRITE_VALUE: + pandecode_write_value_job(jc_gpu_va); + break; - case MALI_JOB_TYPE_CACHE_FLUSH: - pandecode_cache_flush_job(jc_gpu_va); - break; + case MALI_JOB_TYPE_CACHE_FLUSH: + pandecode_cache_flush_job(jc_gpu_va); + break; - case MALI_JOB_TYPE_TILER: - pandecode_tiler_job(&h, jc_gpu_va, gpu_id); - break; + case MALI_JOB_TYPE_TILER: + pandecode_tiler_job(&h, jc_gpu_va, gpu_id); + break; #if PAN_ARCH <= 7 - case MALI_JOB_TYPE_VERTEX: - case MALI_JOB_TYPE_COMPUTE: - pandecode_vertex_compute_geometry_job(&h, jc_gpu_va, gpu_id); - break; + case MALI_JOB_TYPE_VERTEX: + case MALI_JOB_TYPE_COMPUTE: + pandecode_vertex_compute_geometry_job(&h, jc_gpu_va, gpu_id); + break; #if PAN_ARCH >= 6 - case MALI_JOB_TYPE_INDEXED_VERTEX: - pandecode_indexed_vertex_job(&h, jc_gpu_va, gpu_id); - break; + case MALI_JOB_TYPE_INDEXED_VERTEX: + pandecode_indexed_vertex_job(&h, jc_gpu_va, gpu_id); + break; #endif #else - case MALI_JOB_TYPE_COMPUTE: - pandecode_compute_job(jc_gpu_va, gpu_id); - break; + case MALI_JOB_TYPE_COMPUTE: + pandecode_compute_job(jc_gpu_va, gpu_id); + break; - case MALI_JOB_TYPE_MALLOC_VERTEX: - pandecode_malloc_vertex_job(jc_gpu_va, gpu_id); - break; + case MALI_JOB_TYPE_MALLOC_VERTEX: + pandecode_malloc_vertex_job(jc_gpu_va, gpu_id); + break; #endif - case MALI_JOB_TYPE_FRAGMENT: - pandecode_fragment_job(jc_gpu_va, gpu_id); - break; + case MALI_JOB_TYPE_FRAGMENT: + pandecode_fragment_job(jc_gpu_va, gpu_id); + break; - default: - break; - } + default: + break; + } - /* Track the latest visited job CPU VA to detect cycles */ - _mesa_set_add(va_set, hdr); + /* Track the latest visited job CPU VA to detect cycles */ + _mesa_set_add(va_set, hdr); - } while ((jc_gpu_va = next_job)); + } while ((jc_gpu_va = next_job)); - 
_mesa_set_destroy(va_set, NULL); + _mesa_set_destroy(va_set, NULL); - fflush(pandecode_dump_stream); - pandecode_map_read_write(); + fflush(pandecode_dump_stream); + pandecode_map_read_write(); } void GENX(pandecode_abort_on_fault)(mali_ptr jc_gpu_va) { - mali_ptr next_job = 0; + mali_ptr next_job = 0; - do { - pan_unpack(PANDECODE_PTR(jc_gpu_va, struct mali_job_header_packed), - JOB_HEADER, h); - next_job = h.next; + do { + pan_unpack(PANDECODE_PTR(jc_gpu_va, struct mali_job_header_packed), + JOB_HEADER, h); + next_job = h.next; - /* Ensure the job is marked COMPLETE */ - if (h.exception_status != 0x1) { - fprintf(stderr, "Incomplete job or timeout\n"); - fflush(NULL); - abort(); - } - } while ((jc_gpu_va = next_job)); + /* Ensure the job is marked COMPLETE */ + if (h.exception_status != 0x1) { + fprintf(stderr, "Incomplete job or timeout\n"); + fflush(NULL); + abort(); + } + } while ((jc_gpu_va = next_job)); - pandecode_map_read_write(); + pandecode_map_read_write(); } diff --git a/src/panfrost/lib/genxml/decode.h b/src/panfrost/lib/genxml/decode.h index 6fa6014eb0e..862532b2d44 100644 --- a/src/panfrost/lib/genxml/decode.h +++ b/src/panfrost/lib/genxml/decode.h @@ -36,54 +36,54 @@ extern FILE *pandecode_dump_stream; void pandecode_dump_file_open(void); struct pandecode_mapped_memory { - struct rb_node node; - size_t length; - void *addr; - uint64_t gpu_va; - bool ro; - char name[32]; + struct rb_node node; + size_t length; + void *addr; + uint64_t gpu_va; + bool ro; + char name[32]; }; char *pointer_as_memory_reference(uint64_t ptr); -struct pandecode_mapped_memory *pandecode_find_mapped_gpu_mem_containing(uint64_t addr); +struct pandecode_mapped_memory * +pandecode_find_mapped_gpu_mem_containing(uint64_t addr); void pandecode_map_read_write(void); void pandecode_dump_mappings(void); static inline void * -__pandecode_fetch_gpu_mem(uint64_t gpu_va, size_t size, - int line, const char *filename) +__pandecode_fetch_gpu_mem(uint64_t gpu_va, size_t size, int line, + const char *filename) { - const struct pandecode_mapped_memory *mem = - pandecode_find_mapped_gpu_mem_containing(gpu_va); + const struct pandecode_mapped_memory *mem = + pandecode_find_mapped_gpu_mem_containing(gpu_va); - if (!mem) { - fprintf(stderr, "Access to unknown memory %" PRIx64 " in %s:%d\n", - gpu_va, filename, line); - assert(0); - } + if (!mem) { + fprintf(stderr, "Access to unknown memory %" PRIx64 " in %s:%d\n", gpu_va, + filename, line); + assert(0); + } - assert(size + (gpu_va - mem->gpu_va) <= mem->length); + assert(size + (gpu_va - mem->gpu_va) <= mem->length); - return mem->addr + gpu_va - mem->gpu_va; + return mem->addr + gpu_va - mem->gpu_va; } -#define pandecode_fetch_gpu_mem(gpu_va, size) \ - __pandecode_fetch_gpu_mem(gpu_va, size, __LINE__, __FILE__) +#define pandecode_fetch_gpu_mem(gpu_va, size) \ + __pandecode_fetch_gpu_mem(gpu_va, size, __LINE__, __FILE__) /* Returns a validated pointer to mapped GPU memory with the given pointer type, * size automatically determined from the pointer type */ -#define PANDECODE_PTR(gpu_va, type) \ - ((type*)(__pandecode_fetch_gpu_mem(gpu_va, sizeof(type), \ - __LINE__, __FILE__))) +#define PANDECODE_PTR(gpu_va, type) \ + ((type *)(__pandecode_fetch_gpu_mem(gpu_va, sizeof(type), __LINE__, \ + __FILE__))) /* Usage: PANDECODE_PTR_VAR(name, gpu_va) */ -#define PANDECODE_PTR_VAR(name, gpu_va) \ - name = __pandecode_fetch_gpu_mem(gpu_va, sizeof(*name), \ - __LINE__, __FILE__) +#define PANDECODE_PTR_VAR(name, gpu_va) \ + name = __pandecode_fetch_gpu_mem(gpu_va, sizeof(*name), 
__LINE__, __FILE__) /* Forward declare for all supported gens to permit thunking */ void pandecode_jc_v4(mali_ptr jc_gpu_va, unsigned gpu_id); @@ -101,44 +101,44 @@ void pandecode_abort_on_fault_v9(mali_ptr jc_gpu_va); static inline void pan_hexdump(FILE *fp, const uint8_t *hex, size_t cnt, bool with_strings) { - for (unsigned i = 0; i < cnt; ++i) { - if ((i & 0xF) == 0) - fprintf(fp, "%06X ", i); + for (unsigned i = 0; i < cnt; ++i) { + if ((i & 0xF) == 0) + fprintf(fp, "%06X ", i); - uint8_t v = hex[i]; + uint8_t v = hex[i]; - if (v == 0 && (i & 0xF) == 0) { - /* Check if we're starting an aligned run of zeroes */ - unsigned zero_count = 0; + if (v == 0 && (i & 0xF) == 0) { + /* Check if we're starting an aligned run of zeroes */ + unsigned zero_count = 0; - for (unsigned j = i; j < cnt; ++j) { - if (hex[j] == 0) - zero_count++; - else - break; - } + for (unsigned j = i; j < cnt; ++j) { + if (hex[j] == 0) + zero_count++; + else + break; + } - if (zero_count >= 32) { - fprintf(fp, "*\n"); - i += (zero_count & ~0xF) - 1; - continue; - } - } + if (zero_count >= 32) { + fprintf(fp, "*\n"); + i += (zero_count & ~0xF) - 1; + continue; + } + } - fprintf(fp, "%02X ", hex[i]); - if ((i & 0xF) == 0xF && with_strings) { - fprintf(fp, " | "); - for (unsigned j = i & ~0xF; j <= i; ++j) { - uint8_t c = hex[j]; - fputc((c < 32 || c > 128) ? '.' : c, fp); - } - } + fprintf(fp, "%02X ", hex[i]); + if ((i & 0xF) == 0xF && with_strings) { + fprintf(fp, " | "); + for (unsigned j = i & ~0xF; j <= i; ++j) { + uint8_t c = hex[j]; + fputc((c < 32 || c > 128) ? '.' : c, fp); + } + } - if ((i & 0xF) == 0xF) - fprintf(fp, "\n"); - } + if ((i & 0xF) == 0xF) + fprintf(fp, "\n"); + } - fprintf(fp, "\n"); + fprintf(fp, "\n"); } #endif /* __MMAP_TRACE_H__ */ diff --git a/src/panfrost/lib/genxml/decode_common.c b/src/panfrost/lib/genxml/decode_common.c index ecc02387175..76cec531ed9 100644 --- a/src/panfrost/lib/genxml/decode_common.c +++ b/src/panfrost/lib/genxml/decode_common.c @@ -23,18 +23,18 @@ * SOFTWARE. 
*/ -#include -#include #include #include +#include +#include #include #include -#include "decode.h" #include "util/macros.h" +#include "util/simple_mtx.h" #include "util/u_debug.h" #include "util/u_dynarray.h" -#include "util/simple_mtx.h" +#include "decode.h" FILE *pandecode_dump_stream; @@ -46,8 +46,8 @@ static struct util_dynarray ro_mappings; static simple_mtx_t pandecode_lock = SIMPLE_MTX_INITIALIZER; -#define to_mapped_memory(x) \ - rb_node_data(struct pandecode_mapped_memory, x, node) +#define to_mapped_memory(x) \ + rb_node_data(struct pandecode_mapped_memory, x, node) /* * Compare a GPU VA to a node, considering a GPU VA to be equal to a node if it @@ -57,147 +57,147 @@ static simple_mtx_t pandecode_lock = SIMPLE_MTX_INITIALIZER; static int pandecode_cmp_key(const struct rb_node *lhs, const void *key) { - struct pandecode_mapped_memory *mem = to_mapped_memory(lhs); - uint64_t *gpu_va = (uint64_t *) key; + struct pandecode_mapped_memory *mem = to_mapped_memory(lhs); + uint64_t *gpu_va = (uint64_t *)key; - if (mem->gpu_va <= *gpu_va && *gpu_va < (mem->gpu_va + mem->length)) - return 0; - else - return mem->gpu_va - *gpu_va; + if (mem->gpu_va <= *gpu_va && *gpu_va < (mem->gpu_va + mem->length)) + return 0; + else + return mem->gpu_va - *gpu_va; } static int pandecode_cmp(const struct rb_node *lhs, const struct rb_node *rhs) { - return to_mapped_memory(lhs)->gpu_va - to_mapped_memory(rhs)->gpu_va; + return to_mapped_memory(lhs)->gpu_va - to_mapped_memory(rhs)->gpu_va; } static struct pandecode_mapped_memory * pandecode_find_mapped_gpu_mem_containing_rw(uint64_t addr) { - simple_mtx_assert_locked(&pandecode_lock); + simple_mtx_assert_locked(&pandecode_lock); - struct rb_node *node = rb_tree_search(&mmap_tree, &addr, pandecode_cmp_key); + struct rb_node *node = rb_tree_search(&mmap_tree, &addr, pandecode_cmp_key); - return to_mapped_memory(node); + return to_mapped_memory(node); } struct pandecode_mapped_memory * pandecode_find_mapped_gpu_mem_containing(uint64_t addr) { - simple_mtx_assert_locked(&pandecode_lock); + simple_mtx_assert_locked(&pandecode_lock); - struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing_rw(addr); + struct pandecode_mapped_memory *mem = + pandecode_find_mapped_gpu_mem_containing_rw(addr); - if (mem && mem->addr && !mem->ro) { - mprotect(mem->addr, mem->length, PROT_READ); - mem->ro = true; - util_dynarray_append(&ro_mappings, struct pandecode_mapped_memory *, mem); - } + if (mem && mem->addr && !mem->ro) { + mprotect(mem->addr, mem->length, PROT_READ); + mem->ro = true; + util_dynarray_append(&ro_mappings, struct pandecode_mapped_memory *, mem); + } - return mem; + return mem; } void pandecode_map_read_write(void) { - simple_mtx_assert_locked(&pandecode_lock); + simple_mtx_assert_locked(&pandecode_lock); - util_dynarray_foreach(&ro_mappings, struct pandecode_mapped_memory *, mem) { - (*mem)->ro = false; - mprotect((*mem)->addr, (*mem)->length, PROT_READ | PROT_WRITE); - } - util_dynarray_clear(&ro_mappings); + util_dynarray_foreach(&ro_mappings, struct pandecode_mapped_memory *, mem) { + (*mem)->ro = false; + mprotect((*mem)->addr, (*mem)->length, PROT_READ | PROT_WRITE); + } + util_dynarray_clear(&ro_mappings); } static void -pandecode_add_name(struct pandecode_mapped_memory *mem, uint64_t gpu_va, const char *name) +pandecode_add_name(struct pandecode_mapped_memory *mem, uint64_t gpu_va, + const char *name) { - simple_mtx_assert_locked(&pandecode_lock); + simple_mtx_assert_locked(&pandecode_lock); - if (!name) { - /* If we don't have a 
name, assign one */ + if (!name) { + /* If we don't have a name, assign one */ - snprintf(mem->name, sizeof(mem->name) - 1, - "memory_%" PRIx64, gpu_va); - } else { - assert((strlen(name) + 1) < sizeof(mem->name)); - memcpy(mem->name, name, strlen(name) + 1); - } + snprintf(mem->name, sizeof(mem->name) - 1, "memory_%" PRIx64, gpu_va); + } else { + assert((strlen(name) + 1) < sizeof(mem->name)); + memcpy(mem->name, name, strlen(name) + 1); + } } void pandecode_inject_mmap(uint64_t gpu_va, void *cpu, unsigned sz, const char *name) { - simple_mtx_lock(&pandecode_lock); + simple_mtx_lock(&pandecode_lock); - /* First, search if we already mapped this and are just updating an address */ + /* First, search if we already mapped this and are just updating an address */ - struct pandecode_mapped_memory *existing = - pandecode_find_mapped_gpu_mem_containing_rw(gpu_va); + struct pandecode_mapped_memory *existing = + pandecode_find_mapped_gpu_mem_containing_rw(gpu_va); - if (existing && existing->gpu_va == gpu_va) { - existing->length = sz; - existing->addr = cpu; - pandecode_add_name(existing, gpu_va, name); - } else { - /* Otherwise, add a fresh mapping */ - struct pandecode_mapped_memory *mapped_mem = NULL; + if (existing && existing->gpu_va == gpu_va) { + existing->length = sz; + existing->addr = cpu; + pandecode_add_name(existing, gpu_va, name); + } else { + /* Otherwise, add a fresh mapping */ + struct pandecode_mapped_memory *mapped_mem = NULL; - mapped_mem = calloc(1, sizeof(*mapped_mem)); - mapped_mem->gpu_va = gpu_va; - mapped_mem->length = sz; - mapped_mem->addr = cpu; - pandecode_add_name(mapped_mem, gpu_va, name); + mapped_mem = calloc(1, sizeof(*mapped_mem)); + mapped_mem->gpu_va = gpu_va; + mapped_mem->length = sz; + mapped_mem->addr = cpu; + pandecode_add_name(mapped_mem, gpu_va, name); - /* Add it to the tree */ - rb_tree_insert(&mmap_tree, &mapped_mem->node, pandecode_cmp); - } + /* Add it to the tree */ + rb_tree_insert(&mmap_tree, &mapped_mem->node, pandecode_cmp); + } - simple_mtx_unlock(&pandecode_lock); + simple_mtx_unlock(&pandecode_lock); } void pandecode_inject_free(uint64_t gpu_va, unsigned sz) { - simple_mtx_lock(&pandecode_lock); + simple_mtx_lock(&pandecode_lock); - struct pandecode_mapped_memory *mem = - pandecode_find_mapped_gpu_mem_containing_rw(gpu_va); + struct pandecode_mapped_memory *mem = + pandecode_find_mapped_gpu_mem_containing_rw(gpu_va); - if (mem) { - assert(mem->gpu_va == gpu_va); - assert(mem->length == sz); + if (mem) { + assert(mem->gpu_va == gpu_va); + assert(mem->length == sz); - rb_tree_remove(&mmap_tree, &mem->node); - free(mem); - } + rb_tree_remove(&mmap_tree, &mem->node); + free(mem); + } - simple_mtx_unlock(&pandecode_lock); + simple_mtx_unlock(&pandecode_lock); } char * pointer_as_memory_reference(uint64_t ptr) { - simple_mtx_assert_locked(&pandecode_lock); + simple_mtx_assert_locked(&pandecode_lock); - struct pandecode_mapped_memory *mapped; - char *out = malloc(128); + struct pandecode_mapped_memory *mapped; + char *out = malloc(128); - /* Try to find the corresponding mapped zone */ + /* Try to find the corresponding mapped zone */ - mapped = pandecode_find_mapped_gpu_mem_containing_rw(ptr); + mapped = pandecode_find_mapped_gpu_mem_containing_rw(ptr); - if (mapped) { - snprintf(out, 128, "%s + %d", mapped->name, (int) (ptr - mapped->gpu_va)); - return out; - } + if (mapped) { + snprintf(out, 128, "%s + %d", mapped->name, (int)(ptr - mapped->gpu_va)); + return out; + } - /* Just use the raw address if other options are exhausted */ - - 
snprintf(out, 128, "0x%" PRIx64, ptr); - return out; + /* Just use the raw address if other options are exhausted */ + snprintf(out, 128, "0x%" PRIx64, ptr); + return out; } static int pandecode_dump_frame_count = 0; @@ -207,129 +207,153 @@ static bool force_stderr = false; void pandecode_dump_file_open(void) { - simple_mtx_assert_locked(&pandecode_lock); + simple_mtx_assert_locked(&pandecode_lock); - if (pandecode_dump_stream) - return; + if (pandecode_dump_stream) + return; - /* This does a getenv every frame, so it is possible to use - * setenv to change the base at runtime. - */ - const char *dump_file_base = debug_get_option("PANDECODE_DUMP_FILE", "pandecode.dump"); - if (force_stderr || !strcmp(dump_file_base, "stderr")) - pandecode_dump_stream = stderr; - else { - char buffer[1024]; - snprintf(buffer, sizeof(buffer), "%s.%04d", dump_file_base, pandecode_dump_frame_count); - printf("pandecode: dump command stream to file %s\n", buffer); - pandecode_dump_stream = fopen(buffer, "w"); - if (!pandecode_dump_stream) - fprintf(stderr, - "pandecode: failed to open command stream log file %s\n", - buffer); - } + /* This does a getenv every frame, so it is possible to use + * setenv to change the base at runtime. + */ + const char *dump_file_base = + debug_get_option("PANDECODE_DUMP_FILE", "pandecode.dump"); + if (force_stderr || !strcmp(dump_file_base, "stderr")) + pandecode_dump_stream = stderr; + else { + char buffer[1024]; + snprintf(buffer, sizeof(buffer), "%s.%04d", dump_file_base, + pandecode_dump_frame_count); + printf("pandecode: dump command stream to file %s\n", buffer); + pandecode_dump_stream = fopen(buffer, "w"); + if (!pandecode_dump_stream) + fprintf(stderr, + "pandecode: failed to open command stream log file %s\n", + buffer); + } } static void pandecode_dump_file_close(void) { - simple_mtx_assert_locked(&pandecode_lock); + simple_mtx_assert_locked(&pandecode_lock); - if (pandecode_dump_stream && pandecode_dump_stream != stderr) { - if (fclose(pandecode_dump_stream)) - perror("pandecode: dump file"); + if (pandecode_dump_stream && pandecode_dump_stream != stderr) { + if (fclose(pandecode_dump_stream)) + perror("pandecode: dump file"); - pandecode_dump_stream = NULL; - } + pandecode_dump_stream = NULL; + } } void pandecode_initialize(bool to_stderr) { - force_stderr = to_stderr; - rb_tree_init(&mmap_tree); - util_dynarray_init(&ro_mappings, NULL); + force_stderr = to_stderr; + rb_tree_init(&mmap_tree); + util_dynarray_init(&ro_mappings, NULL); } void pandecode_next_frame(void) { - simple_mtx_lock(&pandecode_lock); + simple_mtx_lock(&pandecode_lock); - pandecode_dump_file_close(); - pandecode_dump_frame_count++; + pandecode_dump_file_close(); + pandecode_dump_frame_count++; - simple_mtx_unlock(&pandecode_lock); + simple_mtx_unlock(&pandecode_lock); } void pandecode_close(void) { - simple_mtx_lock(&pandecode_lock); + simple_mtx_lock(&pandecode_lock); - rb_tree_foreach_safe(struct pandecode_mapped_memory, it, &mmap_tree, node) { - rb_tree_remove(&mmap_tree, &it->node); - free(it); - } + rb_tree_foreach_safe(struct pandecode_mapped_memory, it, &mmap_tree, node) { + rb_tree_remove(&mmap_tree, &it->node); + free(it); + } - util_dynarray_fini(&ro_mappings); - pandecode_dump_file_close(); + util_dynarray_fini(&ro_mappings); + pandecode_dump_file_close(); - simple_mtx_unlock(&pandecode_lock); + simple_mtx_unlock(&pandecode_lock); } void pandecode_dump_mappings(void) { - simple_mtx_lock(&pandecode_lock); + simple_mtx_lock(&pandecode_lock); - pandecode_dump_file_open(); + 
pandecode_dump_file_open(); - rb_tree_foreach(struct pandecode_mapped_memory, it, &mmap_tree, node) { - if (!it->addr || !it->length) - continue; + rb_tree_foreach(struct pandecode_mapped_memory, it, &mmap_tree, node) { + if (!it->addr || !it->length) + continue; - fprintf(pandecode_dump_stream, "Buffer: %s gpu %" PRIx64 "\n\n", - it->name, it->gpu_va); + fprintf(pandecode_dump_stream, "Buffer: %s gpu %" PRIx64 "\n\n", it->name, + it->gpu_va); - pan_hexdump(pandecode_dump_stream, it->addr, it->length, false); - fprintf(pandecode_dump_stream, "\n"); - } + pan_hexdump(pandecode_dump_stream, it->addr, it->length, false); + fprintf(pandecode_dump_stream, "\n"); + } - fflush(pandecode_dump_stream); - simple_mtx_unlock(&pandecode_lock); + fflush(pandecode_dump_stream); + simple_mtx_unlock(&pandecode_lock); } void pandecode_abort_on_fault(mali_ptr jc_gpu_va, unsigned gpu_id) { - simple_mtx_lock(&pandecode_lock); + simple_mtx_lock(&pandecode_lock); - switch (pan_arch(gpu_id)) { - case 4: pandecode_abort_on_fault_v4(jc_gpu_va); break; - case 5: pandecode_abort_on_fault_v5(jc_gpu_va); break; - case 6: pandecode_abort_on_fault_v6(jc_gpu_va); break; - case 7: pandecode_abort_on_fault_v7(jc_gpu_va); break; - case 9: pandecode_abort_on_fault_v9(jc_gpu_va); break; - default: unreachable("Unsupported architecture"); - } + switch (pan_arch(gpu_id)) { + case 4: + pandecode_abort_on_fault_v4(jc_gpu_va); + break; + case 5: + pandecode_abort_on_fault_v5(jc_gpu_va); + break; + case 6: + pandecode_abort_on_fault_v6(jc_gpu_va); + break; + case 7: + pandecode_abort_on_fault_v7(jc_gpu_va); + break; + case 9: + pandecode_abort_on_fault_v9(jc_gpu_va); + break; + default: + unreachable("Unsupported architecture"); + } - simple_mtx_unlock(&pandecode_lock); + simple_mtx_unlock(&pandecode_lock); } void pandecode_jc(mali_ptr jc_gpu_va, unsigned gpu_id) { - simple_mtx_lock(&pandecode_lock); + simple_mtx_lock(&pandecode_lock); - switch (pan_arch(gpu_id)) { - case 4: pandecode_jc_v4(jc_gpu_va, gpu_id); break; - case 5: pandecode_jc_v5(jc_gpu_va, gpu_id); break; - case 6: pandecode_jc_v6(jc_gpu_va, gpu_id); break; - case 7: pandecode_jc_v7(jc_gpu_va, gpu_id); break; - case 9: pandecode_jc_v9(jc_gpu_va, gpu_id); break; - default: unreachable("Unsupported architecture"); - } + switch (pan_arch(gpu_id)) { + case 4: + pandecode_jc_v4(jc_gpu_va, gpu_id); + break; + case 5: + pandecode_jc_v5(jc_gpu_va, gpu_id); + break; + case 6: + pandecode_jc_v6(jc_gpu_va, gpu_id); + break; + case 7: + pandecode_jc_v7(jc_gpu_va, gpu_id); + break; + case 9: + pandecode_jc_v9(jc_gpu_va, gpu_id); + break; + default: + unreachable("Unsupported architecture"); + } - simple_mtx_unlock(&pandecode_lock); + simple_mtx_unlock(&pandecode_lock); } diff --git a/src/panfrost/lib/genxml/gen_macros.h b/src/panfrost/lib/genxml/gen_macros.h index 1ef4b53a508..b15f52c4181 100644 --- a/src/panfrost/lib/genxml/gen_macros.h +++ b/src/panfrost/lib/genxml/gen_macros.h @@ -56,45 +56,45 @@ static inline unsigned pan_arch(unsigned gpu_id) { - switch (gpu_id) { - case 0x600: - case 0x620: - case 0x720: - return 4; - case 0x750: - case 0x820: - case 0x830: - case 0x860: - case 0x880: - return 5; - default: - return gpu_id >> 12; - } + switch (gpu_id) { + case 0x600: + case 0x620: + case 0x720: + return 4; + case 0x750: + case 0x820: + case 0x830: + case 0x860: + case 0x880: + return 5; + default: + return gpu_id >> 12; + } } /* Base macro defined on the command line. 
*/ #ifndef PAN_ARCH -# include "genxml/common_pack.h" +#include "genxml/common_pack.h" #else /* Suffixing macros */ #if (PAN_ARCH == 4) -# define GENX(X) X##_v4 -# include "genxml/v4_pack.h" +#define GENX(X) X##_v4 +#include "genxml/v4_pack.h" #elif (PAN_ARCH == 5) -# define GENX(X) X##_v5 -# include "genxml/v5_pack.h" +#define GENX(X) X##_v5 +#include "genxml/v5_pack.h" #elif (PAN_ARCH == 6) -# define GENX(X) X##_v6 -# include "genxml/v6_pack.h" +#define GENX(X) X##_v6 +#include "genxml/v6_pack.h" #elif (PAN_ARCH == 7) -# define GENX(X) X##_v7 -# include "genxml/v7_pack.h" +#define GENX(X) X##_v7 +#include "genxml/v7_pack.h" #elif (PAN_ARCH == 9) -# define GENX(X) X##_v9 -# include "genxml/v9_pack.h" +#define GENX(X) X##_v9 +#include "genxml/v9_pack.h" #else -# error "Need to add suffixing macro for this architecture" +#error "Need to add suffixing macro for this architecture" #endif #endif /* PAN_ARCH */ diff --git a/src/panfrost/lib/pan_afbc.c b/src/panfrost/lib/pan_afbc.c index 151725ded90..97ff6dc70e5 100644 --- a/src/panfrost/lib/pan_afbc.c +++ b/src/panfrost/lib/pan_afbc.c @@ -50,8 +50,8 @@ * must also be cache-line aligned, so there can sometimes be a bit of padding * between the header and body. * - * As an example, a 64x64 RGBA framebuffer contains 64/16 = 4 tiles horizontally and - * 4 tiles vertically. There are 4*4=16 tiles in total, each containing 16 + * As an example, a 64x64 RGBA framebuffer contains 64/16 = 4 tiles horizontally + * and 4 tiles vertically. There are 4*4=16 tiles in total, each containing 16 * bytes of metadata, so there is a 16*16=256 byte header. 64x64 is already * tile aligned, so the body is 64*64 * 4 bytes per pixel = 16384 bytes of * body. @@ -69,45 +69,45 @@ static enum pipe_format unswizzled_format(enum pipe_format format) { - switch (format) { - case PIPE_FORMAT_A8_UNORM: - case PIPE_FORMAT_L8_UNORM: - case PIPE_FORMAT_I8_UNORM: - return PIPE_FORMAT_R8_UNORM; + switch (format) { + case PIPE_FORMAT_A8_UNORM: + case PIPE_FORMAT_L8_UNORM: + case PIPE_FORMAT_I8_UNORM: + return PIPE_FORMAT_R8_UNORM; - case PIPE_FORMAT_L8A8_UNORM: - return PIPE_FORMAT_R8G8_UNORM; + case PIPE_FORMAT_L8A8_UNORM: + return PIPE_FORMAT_R8G8_UNORM; - case PIPE_FORMAT_B8G8R8_UNORM: - return PIPE_FORMAT_R8G8B8_UNORM; + case PIPE_FORMAT_B8G8R8_UNORM: + return PIPE_FORMAT_R8G8B8_UNORM; - case PIPE_FORMAT_R8G8B8X8_UNORM: - case PIPE_FORMAT_B8G8R8A8_UNORM: - case PIPE_FORMAT_B8G8R8X8_UNORM: - case PIPE_FORMAT_A8R8G8B8_UNORM: - case PIPE_FORMAT_X8R8G8B8_UNORM: - case PIPE_FORMAT_X8B8G8R8_UNORM: - case PIPE_FORMAT_A8B8G8R8_UNORM: - return PIPE_FORMAT_R8G8B8A8_UNORM; + case PIPE_FORMAT_R8G8B8X8_UNORM: + case PIPE_FORMAT_B8G8R8A8_UNORM: + case PIPE_FORMAT_B8G8R8X8_UNORM: + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_X8R8G8B8_UNORM: + case PIPE_FORMAT_X8B8G8R8_UNORM: + case PIPE_FORMAT_A8B8G8R8_UNORM: + return PIPE_FORMAT_R8G8B8A8_UNORM; - case PIPE_FORMAT_B5G6R5_UNORM: - return PIPE_FORMAT_R5G6B5_UNORM; + case PIPE_FORMAT_B5G6R5_UNORM: + return PIPE_FORMAT_R5G6B5_UNORM; - case PIPE_FORMAT_B5G5R5A1_UNORM: - return PIPE_FORMAT_R5G5B5A1_UNORM; + case PIPE_FORMAT_B5G5R5A1_UNORM: + return PIPE_FORMAT_R5G5B5A1_UNORM; - case PIPE_FORMAT_R10G10B10X2_UNORM: - case PIPE_FORMAT_B10G10R10A2_UNORM: - case PIPE_FORMAT_B10G10R10X2_UNORM: - return PIPE_FORMAT_R10G10B10A2_UNORM; + case PIPE_FORMAT_R10G10B10X2_UNORM: + case PIPE_FORMAT_B10G10R10A2_UNORM: + case PIPE_FORMAT_B10G10R10X2_UNORM: + return PIPE_FORMAT_R10G10B10A2_UNORM; - case PIPE_FORMAT_A4B4G4R4_UNORM: - case 
PIPE_FORMAT_B4G4R4A4_UNORM: - return PIPE_FORMAT_R4G4B4A4_UNORM; + case PIPE_FORMAT_A4B4G4R4_UNORM: + case PIPE_FORMAT_B4G4R4A4_UNORM: + return PIPE_FORMAT_R4G4B4A4_UNORM; - default: - return format; - } + default: + return format; + } } /* AFBC supports compressing a few canonical formats. Additional formats are @@ -118,29 +118,29 @@ unswizzled_format(enum pipe_format format) enum pan_afbc_mode panfrost_afbc_format(unsigned arch, enum pipe_format format) { - /* Luminance-alpha not supported for AFBC on v7+ */ - switch (format) { - case PIPE_FORMAT_A8_UNORM: - case PIPE_FORMAT_L8_UNORM: - case PIPE_FORMAT_I8_UNORM: - case PIPE_FORMAT_L8A8_UNORM: - if (arch >= 7) - return PAN_AFBC_MODE_INVALID; - else - break; - default: - break; - } + /* Luminance-alpha not supported for AFBC on v7+ */ + switch (format) { + case PIPE_FORMAT_A8_UNORM: + case PIPE_FORMAT_L8_UNORM: + case PIPE_FORMAT_I8_UNORM: + case PIPE_FORMAT_L8A8_UNORM: + if (arch >= 7) + return PAN_AFBC_MODE_INVALID; + else + break; + default: + break; + } - /* sRGB does not change the pixel format itself, only the - * interpretation. The interpretation is handled by conversion hardware - * independent to the compression hardware, so we can compress sRGB - * formats by using the corresponding linear format. - */ - format = util_format_linear(format); + /* sRGB does not change the pixel format itself, only the + * interpretation. The interpretation is handled by conversion hardware + * independent to the compression hardware, so we can compress sRGB + * formats by using the corresponding linear format. + */ + format = util_format_linear(format); - /* We handle swizzling orthogonally to AFBC */ - format = unswizzled_format(format); + /* We handle swizzling orthogonally to AFBC */ + format = unswizzled_format(format); /* clang-format off */ switch (format) { @@ -166,9 +166,10 @@ panfrost_afbc_format(unsigned arch, enum pipe_format format) /* A format may be compressed as AFBC if it has an AFBC internal format */ bool -panfrost_format_supports_afbc(const struct panfrost_device *dev, enum pipe_format format) +panfrost_format_supports_afbc(const struct panfrost_device *dev, + enum pipe_format format) { - return panfrost_afbc_format(dev->arch, format) != PAN_AFBC_MODE_INVALID; + return panfrost_afbc_format(dev->arch, format) != PAN_AFBC_MODE_INVALID; } /* The lossless colour transform (AFBC_FORMAT_MOD_YTR) requires RGB. 
*/ @@ -176,15 +177,14 @@ panfrost_format_supports_afbc(const struct panfrost_device *dev, enum pipe_forma bool panfrost_afbc_can_ytr(enum pipe_format format) { - const struct util_format_description *desc = - util_format_description(format); + const struct util_format_description *desc = util_format_description(format); - /* YTR is only defined for RGB(A) */ - if (desc->nr_channels != 3 && desc->nr_channels != 4) - return false; + /* YTR is only defined for RGB(A) */ + if (desc->nr_channels != 3 && desc->nr_channels != 4) + return false; - /* The fourth channel if it exists doesn't matter */ - return desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB; + /* The fourth channel if it exists doesn't matter */ + return desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB; } /* @@ -194,5 +194,5 @@ panfrost_afbc_can_ytr(enum pipe_format format) bool panfrost_afbc_can_tile(const struct panfrost_device *dev) { - return (dev->arch >= 7); + return (dev->arch >= 7); } diff --git a/src/panfrost/lib/pan_attributes.c b/src/panfrost/lib/pan_attributes.c index 5c57f050dda..b1669bf8d65 100644 --- a/src/panfrost/lib/pan_attributes.c +++ b/src/panfrost/lib/pan_attributes.c @@ -39,91 +39,92 @@ static unsigned panfrost_small_padded_vertex_count(unsigned idx) { - if (idx < 10) - return idx; - else - return (idx + 1) & ~1; + if (idx < 10) + return idx; + else + return (idx + 1) & ~1; } static unsigned panfrost_large_padded_vertex_count(uint32_t vertex_count) { - /* First, we have to find the highest set one */ - unsigned highest = 32 - __builtin_clz(vertex_count); + /* First, we have to find the highest set one */ + unsigned highest = 32 - __builtin_clz(vertex_count); - /* Using that, we mask out the highest 4-bits */ - unsigned n = highest - 4; - unsigned nibble = (vertex_count >> n) & 0xF; + /* Using that, we mask out the highest 4-bits */ + unsigned n = highest - 4; + unsigned nibble = (vertex_count >> n) & 0xF; - /* Great, we have the nibble. Now we can just try possibilities. Note - * that we don't care about the bottom most bit in most cases, and we - * know the top bit must be 1 */ + /* Great, we have the nibble. Now we can just try possibilities. Note + * that we don't care about the bottom most bit in most cases, and we + * know the top bit must be 1 */ - unsigned middle_two = (nibble >> 1) & 0x3; + unsigned middle_two = (nibble >> 1) & 0x3; - switch (middle_two) { - case 0b00: - if (!(nibble & 1)) - return (1 << n) * 9; - else - return (1 << (n + 1)) * 5; - case 0b01: - return (1 << (n + 2)) * 3; - case 0b10: - return (1 << (n + 1)) * 7; - case 0b11: - return (1 << (n + 4)); - default: - return 0; /* unreachable */ - } + switch (middle_two) { + case 0b00: + if (!(nibble & 1)) + return (1 << n) * 9; + else + return (1 << (n + 1)) * 5; + case 0b01: + return (1 << (n + 2)) * 3; + case 0b10: + return (1 << (n + 1)) * 7; + case 0b11: + return (1 << (n + 4)); + default: + return 0; /* unreachable */ + } } unsigned panfrost_padded_vertex_count(unsigned vertex_count) { - if (vertex_count < 20) - return panfrost_small_padded_vertex_count(vertex_count); - else - return panfrost_large_padded_vertex_count(vertex_count); + if (vertex_count < 20) + return panfrost_small_padded_vertex_count(vertex_count); + else + return panfrost_large_padded_vertex_count(vertex_count); } /* The much, much more irritating case -- instancing is enabled. 
See * panfrost_job.h for notes on how this works */ unsigned -panfrost_compute_magic_divisor(unsigned hw_divisor, unsigned *o_shift, unsigned *extra_flags) +panfrost_compute_magic_divisor(unsigned hw_divisor, unsigned *o_shift, + unsigned *extra_flags) { - /* We have a NPOT divisor. Here's the fun one (multipling by - * the inverse and shifting) */ + /* We have a NPOT divisor. Here's the fun one (multipling by + * the inverse and shifting) */ - /* floor(log2(d)) */ - unsigned shift = util_logbase2(hw_divisor); + /* floor(log2(d)) */ + unsigned shift = util_logbase2(hw_divisor); - /* m = ceil(2^(32 + shift) / d) */ - uint64_t shift_hi = 32 + shift; - uint64_t t = 1ll << shift_hi; - double t_f = t; - double hw_divisor_d = hw_divisor; - double m_f = ceil(t_f / hw_divisor_d); - unsigned m = m_f; + /* m = ceil(2^(32 + shift) / d) */ + uint64_t shift_hi = 32 + shift; + uint64_t t = 1ll << shift_hi; + double t_f = t; + double hw_divisor_d = hw_divisor; + double m_f = ceil(t_f / hw_divisor_d); + unsigned m = m_f; - /* Default case */ - uint32_t magic_divisor = m; + /* Default case */ + uint32_t magic_divisor = m; - /* e = 2^(shift + 32) % d */ - uint64_t e = t % hw_divisor; + /* e = 2^(shift + 32) % d */ + uint64_t e = t % hw_divisor; - /* Apply round-down algorithm? e <= 2^shift?. XXX: The blob - * seems to use a different condition */ - if (e <= (1ll << shift)) { - magic_divisor = m - 1; - *extra_flags = 1; - } + /* Apply round-down algorithm? e <= 2^shift?. XXX: The blob + * seems to use a different condition */ + if (e <= (1ll << shift)) { + magic_divisor = m - 1; + *extra_flags = 1; + } - /* Top flag implicitly set */ - assert(magic_divisor & (1u << 31)); - magic_divisor &= ~(1u << 31); - *o_shift = shift; + /* Top flag implicitly set */ + assert(magic_divisor & (1u << 31)); + magic_divisor &= ~(1u << 31); + *o_shift = shift; - return magic_divisor; + return magic_divisor; } diff --git a/src/panfrost/lib/pan_blend.c b/src/panfrost/lib/pan_blend.c index e5673a0ead8..768f49f720e 100644 --- a/src/panfrost/lib/pan_blend.c +++ b/src/panfrost/lib/pan_blend.c @@ -28,13 +28,13 @@ #include "pan_shader.h" #endif -#include "pan_texture.h" -#include "panfrost/util/pan_lower_framebuffer.h" -#include "util/format/u_format.h" #include "compiler/nir/nir.h" #include "compiler/nir/nir_builder.h" #include "compiler/nir/nir_conversion_builder.h" #include "compiler/nir/nir_lower_blend.h" +#include "panfrost/util/pan_lower_framebuffer.h" +#include "util/format/u_format.h" +#include "pan_texture.h" #ifndef PAN_ARCH @@ -43,9 +43,9 @@ static bool factor_is_supported(enum blend_factor factor) { - return factor != BLEND_FACTOR_SRC_ALPHA_SATURATE && - factor != BLEND_FACTOR_SRC1_COLOR && - factor != BLEND_FACTOR_SRC1_ALPHA; + return factor != BLEND_FACTOR_SRC_ALPHA_SATURATE && + factor != BLEND_FACTOR_SRC1_COLOR && + factor != BLEND_FACTOR_SRC1_ALPHA; } /* OpenGL allows encoding (src*dest + dest*src) which is incompatiblle with @@ -54,71 +54,62 @@ factor_is_supported(enum blend_factor factor) * + dest * (2*src) wih the new source_2 value of C. Detect this case. 
*/ static bool -is_2srcdest(enum blend_func blend_func, - enum blend_factor src_factor, - bool invert_src, - enum blend_factor dest_factor, - bool invert_dest, +is_2srcdest(enum blend_func blend_func, enum blend_factor src_factor, + bool invert_src, enum blend_factor dest_factor, bool invert_dest, bool is_alpha) { - return (blend_func == BLEND_FUNC_ADD) && - ((src_factor == BLEND_FACTOR_DST_COLOR) || - ((src_factor == BLEND_FACTOR_DST_ALPHA) && is_alpha)) && - ((dest_factor == BLEND_FACTOR_SRC_COLOR) || - ((dest_factor == BLEND_FACTOR_SRC_ALPHA) && is_alpha)) && - !invert_src && !invert_dest; + return (blend_func == BLEND_FUNC_ADD) && + ((src_factor == BLEND_FACTOR_DST_COLOR) || + ((src_factor == BLEND_FACTOR_DST_ALPHA) && is_alpha)) && + ((dest_factor == BLEND_FACTOR_SRC_COLOR) || + ((dest_factor == BLEND_FACTOR_SRC_ALPHA) && is_alpha)) && + !invert_src && !invert_dest; } static bool can_fixed_function_equation(enum blend_func blend_func, - enum blend_factor src_factor, - bool invert_src, - enum blend_factor dest_factor, - bool invert_dest, - bool is_alpha, - bool supports_2src) + enum blend_factor src_factor, bool invert_src, + enum blend_factor dest_factor, bool invert_dest, + bool is_alpha, bool supports_2src) { - if (is_2srcdest(blend_func, src_factor, invert_src, - dest_factor, invert_dest, is_alpha)) { + if (is_2srcdest(blend_func, src_factor, invert_src, dest_factor, invert_dest, + is_alpha)) { - return supports_2src; - } + return supports_2src; + } - if (blend_func != BLEND_FUNC_ADD && - blend_func != BLEND_FUNC_SUBTRACT && - blend_func != BLEND_FUNC_REVERSE_SUBTRACT) - return false; + if (blend_func != BLEND_FUNC_ADD && blend_func != BLEND_FUNC_SUBTRACT && + blend_func != BLEND_FUNC_REVERSE_SUBTRACT) + return false; - if (!factor_is_supported(src_factor) || - !factor_is_supported(dest_factor)) - return false; + if (!factor_is_supported(src_factor) || !factor_is_supported(dest_factor)) + return false; - if (src_factor != dest_factor && - src_factor != BLEND_FACTOR_ZERO && - dest_factor != BLEND_FACTOR_ZERO) - return false; + if (src_factor != dest_factor && src_factor != BLEND_FACTOR_ZERO && + dest_factor != BLEND_FACTOR_ZERO) + return false; - return true; + return true; } static unsigned blend_factor_constant_mask(enum blend_factor factor) { - if (factor == BLEND_FACTOR_CONSTANT_COLOR) - return 0b0111; /* RGB */ - else if (factor == BLEND_FACTOR_CONSTANT_ALPHA) - return 0b1000; /* A */ - else - return 0b0000; /* - */ + if (factor == BLEND_FACTOR_CONSTANT_COLOR) + return 0b0111; /* RGB */ + else if (factor == BLEND_FACTOR_CONSTANT_ALPHA) + return 0b1000; /* A */ + else + return 0b0000; /* - */ } unsigned pan_blend_constant_mask(const struct pan_blend_equation eq) { - return blend_factor_constant_mask(eq.rgb_src_factor) | - blend_factor_constant_mask(eq.rgb_dst_factor) | - blend_factor_constant_mask(eq.alpha_src_factor) | - blend_factor_constant_mask(eq.alpha_dst_factor); + return blend_factor_constant_mask(eq.rgb_src_factor) | + blend_factor_constant_mask(eq.rgb_dst_factor) | + blend_factor_constant_mask(eq.alpha_src_factor) | + blend_factor_constant_mask(eq.alpha_dst_factor); } /* Only "homogenous" (scalar or vector with all components equal) constants are @@ -127,14 +118,14 @@ pan_blend_constant_mask(const struct pan_blend_equation eq) bool pan_blend_is_homogenous_constant(unsigned mask, const float *constants) { - float constant = pan_blend_get_constant(mask, constants); + float constant = pan_blend_get_constant(mask, constants); - u_foreach_bit(i, mask) { - if (constants[i] 
!= constant) - return false; - } + u_foreach_bit(i, mask) { + if (constants[i] != constant) + return false; + } - return true; + return true; } /* Determines if an equation can run in fixed function */ @@ -143,167 +134,161 @@ bool pan_blend_can_fixed_function(const struct pan_blend_equation equation, bool supports_2src) { - return !equation.blend_enable || - (can_fixed_function_equation(equation.rgb_func, - equation.rgb_src_factor, - equation.rgb_invert_src_factor, - equation.rgb_dst_factor, - equation.rgb_invert_dst_factor, - false, supports_2src) && - can_fixed_function_equation(equation.alpha_func, - equation.alpha_src_factor, - equation.alpha_invert_src_factor, - equation.alpha_dst_factor, - equation.alpha_invert_dst_factor, - true, supports_2src)); + return !equation.blend_enable || + (can_fixed_function_equation( + equation.rgb_func, equation.rgb_src_factor, + equation.rgb_invert_src_factor, equation.rgb_dst_factor, + equation.rgb_invert_dst_factor, false, supports_2src) && + can_fixed_function_equation( + equation.alpha_func, equation.alpha_src_factor, + equation.alpha_invert_src_factor, equation.alpha_dst_factor, + equation.alpha_invert_dst_factor, true, supports_2src)); } static enum mali_blend_operand_c to_c_factor(enum blend_factor factor) { - switch (factor) { - case BLEND_FACTOR_ZERO: - return MALI_BLEND_OPERAND_C_ZERO; + switch (factor) { + case BLEND_FACTOR_ZERO: + return MALI_BLEND_OPERAND_C_ZERO; - case BLEND_FACTOR_SRC_ALPHA: - return MALI_BLEND_OPERAND_C_SRC_ALPHA; + case BLEND_FACTOR_SRC_ALPHA: + return MALI_BLEND_OPERAND_C_SRC_ALPHA; - case BLEND_FACTOR_DST_ALPHA: - return MALI_BLEND_OPERAND_C_DEST_ALPHA; + case BLEND_FACTOR_DST_ALPHA: + return MALI_BLEND_OPERAND_C_DEST_ALPHA; - case BLEND_FACTOR_SRC_COLOR: - return MALI_BLEND_OPERAND_C_SRC; + case BLEND_FACTOR_SRC_COLOR: + return MALI_BLEND_OPERAND_C_SRC; - case BLEND_FACTOR_DST_COLOR: - return MALI_BLEND_OPERAND_C_DEST; + case BLEND_FACTOR_DST_COLOR: + return MALI_BLEND_OPERAND_C_DEST; - case BLEND_FACTOR_CONSTANT_COLOR: - case BLEND_FACTOR_CONSTANT_ALPHA: - return MALI_BLEND_OPERAND_C_CONSTANT; + case BLEND_FACTOR_CONSTANT_COLOR: + case BLEND_FACTOR_CONSTANT_ALPHA: + return MALI_BLEND_OPERAND_C_CONSTANT; - default: - unreachable("Unsupported blend factor"); - } + default: + unreachable("Unsupported blend factor"); + } } static void -to_panfrost_function(enum blend_func blend_func, - enum blend_factor src_factor, - bool invert_src, - enum blend_factor dest_factor, - bool invert_dest, - bool is_alpha, +to_panfrost_function(enum blend_func blend_func, enum blend_factor src_factor, + bool invert_src, enum blend_factor dest_factor, + bool invert_dest, bool is_alpha, struct MALI_BLEND_FUNCTION *function) { - assert(can_fixed_function_equation(blend_func, src_factor, invert_src, - dest_factor, invert_dest, is_alpha, true)); + assert(can_fixed_function_equation(blend_func, src_factor, invert_src, + dest_factor, invert_dest, is_alpha, + true)); - if (src_factor == BLEND_FACTOR_ZERO && !invert_src) { - function->a = MALI_BLEND_OPERAND_A_ZERO; - function->b = MALI_BLEND_OPERAND_B_DEST; - if (blend_func == BLEND_FUNC_SUBTRACT) - function->negate_b = true; - function->invert_c = invert_dest; - function->c = to_c_factor(dest_factor); - } else if (src_factor == BLEND_FACTOR_ZERO && invert_src) { - function->a = MALI_BLEND_OPERAND_A_SRC; - function->b = MALI_BLEND_OPERAND_B_DEST; - if (blend_func == BLEND_FUNC_SUBTRACT) - function->negate_b = true; - else if (blend_func == BLEND_FUNC_REVERSE_SUBTRACT) - function->negate_a = true; - 
function->invert_c = invert_dest; - function->c = to_c_factor(dest_factor); - } else if (dest_factor == BLEND_FACTOR_ZERO && !invert_dest) { - function->a = MALI_BLEND_OPERAND_A_ZERO; - function->b = MALI_BLEND_OPERAND_B_SRC; - if (blend_func == BLEND_FUNC_REVERSE_SUBTRACT) - function->negate_b = true; - function->invert_c = invert_src; - function->c = to_c_factor(src_factor); - } else if (dest_factor == BLEND_FACTOR_ZERO && invert_dest) { - function->a = MALI_BLEND_OPERAND_A_DEST; - function->b = MALI_BLEND_OPERAND_B_SRC; - if (blend_func == BLEND_FUNC_SUBTRACT) - function->negate_a = true; - else if (blend_func == BLEND_FUNC_REVERSE_SUBTRACT) - function->negate_b = true; - function->invert_c = invert_src; - function->c = to_c_factor(src_factor); - } else if (src_factor == dest_factor && invert_src == invert_dest) { - function->a = MALI_BLEND_OPERAND_A_ZERO; - function->invert_c = invert_src; - function->c = to_c_factor(src_factor); + if (src_factor == BLEND_FACTOR_ZERO && !invert_src) { + function->a = MALI_BLEND_OPERAND_A_ZERO; + function->b = MALI_BLEND_OPERAND_B_DEST; + if (blend_func == BLEND_FUNC_SUBTRACT) + function->negate_b = true; + function->invert_c = invert_dest; + function->c = to_c_factor(dest_factor); + } else if (src_factor == BLEND_FACTOR_ZERO && invert_src) { + function->a = MALI_BLEND_OPERAND_A_SRC; + function->b = MALI_BLEND_OPERAND_B_DEST; + if (blend_func == BLEND_FUNC_SUBTRACT) + function->negate_b = true; + else if (blend_func == BLEND_FUNC_REVERSE_SUBTRACT) + function->negate_a = true; + function->invert_c = invert_dest; + function->c = to_c_factor(dest_factor); + } else if (dest_factor == BLEND_FACTOR_ZERO && !invert_dest) { + function->a = MALI_BLEND_OPERAND_A_ZERO; + function->b = MALI_BLEND_OPERAND_B_SRC; + if (blend_func == BLEND_FUNC_REVERSE_SUBTRACT) + function->negate_b = true; + function->invert_c = invert_src; + function->c = to_c_factor(src_factor); + } else if (dest_factor == BLEND_FACTOR_ZERO && invert_dest) { + function->a = MALI_BLEND_OPERAND_A_DEST; + function->b = MALI_BLEND_OPERAND_B_SRC; + if (blend_func == BLEND_FUNC_SUBTRACT) + function->negate_a = true; + else if (blend_func == BLEND_FUNC_REVERSE_SUBTRACT) + function->negate_b = true; + function->invert_c = invert_src; + function->c = to_c_factor(src_factor); + } else if (src_factor == dest_factor && invert_src == invert_dest) { + function->a = MALI_BLEND_OPERAND_A_ZERO; + function->invert_c = invert_src; + function->c = to_c_factor(src_factor); - switch (blend_func) { - case BLEND_FUNC_ADD: - function->b = MALI_BLEND_OPERAND_B_SRC_PLUS_DEST; - break; - case BLEND_FUNC_REVERSE_SUBTRACT: - function->negate_b = true; - FALLTHROUGH; - case BLEND_FUNC_SUBTRACT: - function->b = MALI_BLEND_OPERAND_B_SRC_MINUS_DEST; - break; - default: - unreachable("Invalid blend function"); - } - } else if (is_2srcdest(blend_func, src_factor, invert_src, dest_factor, - invert_dest, is_alpha)) { - /* src*dest + dest*src = 2*src*dest = 0 + dest*(2*src) */ - function->a = MALI_BLEND_OPERAND_A_ZERO; - function->b = MALI_BLEND_OPERAND_B_DEST; - function->c = MALI_BLEND_OPERAND_C_SRC_X_2; - } else { - assert(src_factor == dest_factor && invert_src != invert_dest); + switch (blend_func) { + case BLEND_FUNC_ADD: + function->b = MALI_BLEND_OPERAND_B_SRC_PLUS_DEST; + break; + case BLEND_FUNC_REVERSE_SUBTRACT: + function->negate_b = true; + FALLTHROUGH; + case BLEND_FUNC_SUBTRACT: + function->b = MALI_BLEND_OPERAND_B_SRC_MINUS_DEST; + break; + default: + unreachable("Invalid blend function"); + } + } else if 
(is_2srcdest(blend_func, src_factor, invert_src, dest_factor, + invert_dest, is_alpha)) { + /* src*dest + dest*src = 2*src*dest = 0 + dest*(2*src) */ + function->a = MALI_BLEND_OPERAND_A_ZERO; + function->b = MALI_BLEND_OPERAND_B_DEST; + function->c = MALI_BLEND_OPERAND_C_SRC_X_2; + } else { + assert(src_factor == dest_factor && invert_src != invert_dest); - function->a = MALI_BLEND_OPERAND_A_DEST; - function->invert_c = invert_src; - function->c = to_c_factor(src_factor); + function->a = MALI_BLEND_OPERAND_A_DEST; + function->invert_c = invert_src; + function->c = to_c_factor(src_factor); - switch (blend_func) { - case BLEND_FUNC_ADD: - function->b = MALI_BLEND_OPERAND_B_SRC_MINUS_DEST; - break; - case BLEND_FUNC_REVERSE_SUBTRACT: - function->b = MALI_BLEND_OPERAND_B_SRC_PLUS_DEST; - function->negate_b = true; - break; - case BLEND_FUNC_SUBTRACT: - function->b = MALI_BLEND_OPERAND_B_SRC_PLUS_DEST; - function->negate_a = true; - break; - default: - unreachable("Invalid blend function\n"); - } - } + switch (blend_func) { + case BLEND_FUNC_ADD: + function->b = MALI_BLEND_OPERAND_B_SRC_MINUS_DEST; + break; + case BLEND_FUNC_REVERSE_SUBTRACT: + function->b = MALI_BLEND_OPERAND_B_SRC_PLUS_DEST; + function->negate_b = true; + break; + case BLEND_FUNC_SUBTRACT: + function->b = MALI_BLEND_OPERAND_B_SRC_PLUS_DEST; + function->negate_a = true; + break; + default: + unreachable("Invalid blend function\n"); + } + } } bool pan_blend_is_opaque(const struct pan_blend_equation equation) { - /* If a channel is masked out, we can't use opaque mode even if - * blending is disabled, since we need a tilebuffer read in there */ - if (equation.color_mask != 0xF) - return false; + /* If a channel is masked out, we can't use opaque mode even if + * blending is disabled, since we need a tilebuffer read in there */ + if (equation.color_mask != 0xF) + return false; - /* With nothing masked out, disabled bledning is opaque */ - if (!equation.blend_enable) - return true; + /* With nothing masked out, disabled bledning is opaque */ + if (!equation.blend_enable) + return true; - /* Also detect open-coded opaque blending */ - return equation.rgb_src_factor == BLEND_FACTOR_ZERO && - equation.rgb_invert_src_factor && - equation.rgb_dst_factor == BLEND_FACTOR_ZERO && - !equation.rgb_invert_dst_factor && - (equation.rgb_func == BLEND_FUNC_ADD || - equation.rgb_func == BLEND_FUNC_SUBTRACT) && - equation.alpha_src_factor == BLEND_FACTOR_ZERO && - equation.alpha_invert_src_factor && - equation.alpha_dst_factor == BLEND_FACTOR_ZERO && - !equation.alpha_invert_dst_factor && - (equation.alpha_func == BLEND_FUNC_ADD || - equation.alpha_func == BLEND_FUNC_SUBTRACT); + /* Also detect open-coded opaque blending */ + return equation.rgb_src_factor == BLEND_FACTOR_ZERO && + equation.rgb_invert_src_factor && + equation.rgb_dst_factor == BLEND_FACTOR_ZERO && + !equation.rgb_invert_dst_factor && + (equation.rgb_func == BLEND_FUNC_ADD || + equation.rgb_func == BLEND_FUNC_SUBTRACT) && + equation.alpha_src_factor == BLEND_FACTOR_ZERO && + equation.alpha_invert_src_factor && + equation.alpha_dst_factor == BLEND_FACTOR_ZERO && + !equation.alpha_invert_dst_factor && + (equation.alpha_func == BLEND_FUNC_ADD || + equation.alpha_func == BLEND_FUNC_SUBTRACT); } /* Check if (factor, invert) represents a constant value of val, assuming @@ -313,11 +298,11 @@ pan_blend_is_opaque(const struct pan_blend_equation equation) static inline bool is_factor_01(unsigned factor, bool invert, unsigned val, unsigned srca) { - assert(val == 0 || val == 1); - 
assert(srca == 0 || srca == 1); + assert(val == 0 || val == 1); + assert(srca == 0 || srca == 1); - return ((invert ^ !val) && factor == BLEND_FACTOR_ZERO) || - ((invert ^ srca ^ !val) && factor == BLEND_FACTOR_SRC_ALPHA); + return ((invert ^ !val) && factor == BLEND_FACTOR_ZERO) || + ((invert ^ srca ^ !val) && factor == BLEND_FACTOR_SRC_ALPHA); } /* Returns if src alpha = 0 implies the blended colour equals the destination @@ -340,24 +325,24 @@ is_factor_01(unsigned factor, bool invert, unsigned val, unsigned srca) bool pan_blend_alpha_zero_nop(const struct pan_blend_equation eq) { - if (eq.rgb_func != BLEND_FUNC_ADD && - eq.rgb_func != BLEND_FUNC_REVERSE_SUBTRACT) - return false; + if (eq.rgb_func != BLEND_FUNC_ADD && + eq.rgb_func != BLEND_FUNC_REVERSE_SUBTRACT) + return false; - if (eq.color_mask & 0x8) { - if (!is_factor_01(eq.alpha_dst_factor, eq.alpha_invert_dst_factor, 1, 0)) - return false; - } + if (eq.color_mask & 0x8) { + if (!is_factor_01(eq.alpha_dst_factor, eq.alpha_invert_dst_factor, 1, 0)) + return false; + } - if (eq.color_mask & 0x7) { - if (!is_factor_01(eq.rgb_dst_factor, eq.rgb_invert_dst_factor, 1, 0)) - return false; + if (eq.color_mask & 0x7) { + if (!is_factor_01(eq.rgb_dst_factor, eq.rgb_invert_dst_factor, 1, 0)) + return false; - if (!is_factor_01(eq.rgb_src_factor, eq.rgb_invert_src_factor, 0, 0)) - return false; - } + if (!is_factor_01(eq.rgb_src_factor, eq.rgb_invert_src_factor, 0, 0)) + return false; + } - return true; + return true; } /* Returns if src alpha = 1 implies the blended colour equals the source @@ -378,25 +363,24 @@ pan_blend_alpha_zero_nop(const struct pan_blend_equation eq) bool pan_blend_alpha_one_store(const struct pan_blend_equation eq) { - if (eq.rgb_func != BLEND_FUNC_ADD && - eq.rgb_func != BLEND_FUNC_SUBTRACT) - return false; + if (eq.rgb_func != BLEND_FUNC_ADD && eq.rgb_func != BLEND_FUNC_SUBTRACT) + return false; - if (eq.color_mask != 0xf) - return false; + if (eq.color_mask != 0xf) + return false; - return is_factor_01(eq.rgb_src_factor, eq.rgb_invert_src_factor, 1, 1) && - is_factor_01(eq.alpha_src_factor, eq.alpha_invert_src_factor, 1, 1) && - is_factor_01(eq.rgb_dst_factor, eq.rgb_invert_dst_factor, 0, 1) && - is_factor_01(eq.alpha_dst_factor, eq.alpha_invert_dst_factor, 0, 1); + return is_factor_01(eq.rgb_src_factor, eq.rgb_invert_src_factor, 1, 1) && + is_factor_01(eq.alpha_src_factor, eq.alpha_invert_src_factor, 1, 1) && + is_factor_01(eq.rgb_dst_factor, eq.rgb_invert_dst_factor, 0, 1) && + is_factor_01(eq.alpha_dst_factor, eq.alpha_invert_dst_factor, 0, 1); } static bool is_dest_factor(enum blend_factor factor, bool alpha) { - return factor == BLEND_FACTOR_DST_ALPHA || - factor == BLEND_FACTOR_DST_COLOR || - (factor == BLEND_FACTOR_SRC_ALPHA_SATURATE && !alpha); + return factor == BLEND_FACTOR_DST_ALPHA || + factor == BLEND_FACTOR_DST_COLOR || + (factor == BLEND_FACTOR_SRC_ALPHA_SATURATE && !alpha); } /* Determines if a blend equation reads back the destination. 
This can occur by @@ -406,13 +390,13 @@ is_dest_factor(enum blend_factor factor, bool alpha) bool pan_blend_reads_dest(const struct pan_blend_equation equation) { - return (equation.color_mask && equation.color_mask != 0xF) || - is_dest_factor(equation.rgb_src_factor, false) || - is_dest_factor(equation.alpha_src_factor, true) || - equation.rgb_dst_factor != BLEND_FACTOR_ZERO || - equation.rgb_invert_dst_factor || - equation.alpha_dst_factor != BLEND_FACTOR_ZERO || - equation.alpha_invert_dst_factor; + return (equation.color_mask && equation.color_mask != 0xF) || + is_dest_factor(equation.rgb_src_factor, false) || + is_dest_factor(equation.alpha_src_factor, true) || + equation.rgb_dst_factor != BLEND_FACTOR_ZERO || + equation.rgb_invert_dst_factor || + equation.alpha_dst_factor != BLEND_FACTOR_ZERO || + equation.alpha_invert_dst_factor; } /* Create the descriptor for a fixed blend mode given the corresponding API @@ -422,72 +406,68 @@ void pan_blend_to_fixed_function_equation(const struct pan_blend_equation equation, struct MALI_BLEND_EQUATION *out) { - /* If no blending is enabled, default back on `replace` mode */ - if (!equation.blend_enable) { - out->color_mask = equation.color_mask; - out->rgb.a = MALI_BLEND_OPERAND_A_SRC; - out->rgb.b = MALI_BLEND_OPERAND_B_SRC; - out->rgb.c = MALI_BLEND_OPERAND_C_ZERO; - out->alpha.a = MALI_BLEND_OPERAND_A_SRC; - out->alpha.b = MALI_BLEND_OPERAND_B_SRC; - out->alpha.c = MALI_BLEND_OPERAND_C_ZERO; - return; - } + /* If no blending is enabled, default back on `replace` mode */ + if (!equation.blend_enable) { + out->color_mask = equation.color_mask; + out->rgb.a = MALI_BLEND_OPERAND_A_SRC; + out->rgb.b = MALI_BLEND_OPERAND_B_SRC; + out->rgb.c = MALI_BLEND_OPERAND_C_ZERO; + out->alpha.a = MALI_BLEND_OPERAND_A_SRC; + out->alpha.b = MALI_BLEND_OPERAND_B_SRC; + out->alpha.c = MALI_BLEND_OPERAND_C_ZERO; + return; + } - /* Compile the fixed-function blend */ - to_panfrost_function(equation.rgb_func, - equation.rgb_src_factor, - equation.rgb_invert_src_factor, - equation.rgb_dst_factor, - equation.rgb_invert_dst_factor, - false, &out->rgb); + /* Compile the fixed-function blend */ + to_panfrost_function(equation.rgb_func, equation.rgb_src_factor, + equation.rgb_invert_src_factor, equation.rgb_dst_factor, + equation.rgb_invert_dst_factor, false, &out->rgb); - to_panfrost_function(equation.alpha_func, - equation.alpha_src_factor, - equation.alpha_invert_src_factor, - equation.alpha_dst_factor, - equation.alpha_invert_dst_factor, - true, &out->alpha); - out->color_mask = equation.color_mask; + to_panfrost_function(equation.alpha_func, equation.alpha_src_factor, + equation.alpha_invert_src_factor, + equation.alpha_dst_factor, + equation.alpha_invert_dst_factor, true, &out->alpha); + out->color_mask = equation.color_mask; } uint32_t pan_pack_blend(const struct pan_blend_equation equation) { - STATIC_ASSERT(sizeof(uint32_t) == MALI_BLEND_EQUATION_LENGTH); + STATIC_ASSERT(sizeof(uint32_t) == MALI_BLEND_EQUATION_LENGTH); - uint32_t out = 0; + uint32_t out = 0; - pan_pack(&out, BLEND_EQUATION, cfg) { - pan_blend_to_fixed_function_equation(equation, &cfg); - } + pan_pack(&out, BLEND_EQUATION, cfg) { + pan_blend_to_fixed_function_equation(equation, &cfg); + } - return out; + return out; } -static uint32_t pan_blend_shader_key_hash(const void *key) +static uint32_t +pan_blend_shader_key_hash(const void *key) { - return _mesa_hash_data(key, sizeof(struct pan_blend_shader_key)); + return _mesa_hash_data(key, sizeof(struct pan_blend_shader_key)); } -static bool 
pan_blend_shader_key_equal(const void *a, const void *b) +static bool +pan_blend_shader_key_equal(const void *a, const void *b) { - return !memcmp(a, b, sizeof(struct pan_blend_shader_key)); + return !memcmp(a, b, sizeof(struct pan_blend_shader_key)); } void pan_blend_shaders_init(struct panfrost_device *dev) { - dev->blend_shaders.shaders = - _mesa_hash_table_create(NULL, pan_blend_shader_key_hash, - pan_blend_shader_key_equal); - pthread_mutex_init(&dev->blend_shaders.lock, NULL); + dev->blend_shaders.shaders = _mesa_hash_table_create( + NULL, pan_blend_shader_key_hash, pan_blend_shader_key_equal); + pthread_mutex_init(&dev->blend_shaders.lock, NULL); } void pan_blend_shaders_cleanup(struct panfrost_device *dev) { - _mesa_hash_table_destroy(dev->blend_shaders.shaders, NULL); + _mesa_hash_table_destroy(dev->blend_shaders.shaders, NULL); } #else /* ifndef PAN_ARCH */ @@ -495,231 +475,248 @@ pan_blend_shaders_cleanup(struct panfrost_device *dev) static const char * logicop_str(enum pipe_logicop logicop) { - switch (logicop) { - case PIPE_LOGICOP_CLEAR: return "clear"; - case PIPE_LOGICOP_NOR: return "nor"; - case PIPE_LOGICOP_AND_INVERTED: return "and-inverted"; - case PIPE_LOGICOP_COPY_INVERTED: return "copy-inverted"; - case PIPE_LOGICOP_AND_REVERSE: return "and-reverse"; - case PIPE_LOGICOP_INVERT: return "invert"; - case PIPE_LOGICOP_XOR: return "xor"; - case PIPE_LOGICOP_NAND: return "nand"; - case PIPE_LOGICOP_AND: return "and"; - case PIPE_LOGICOP_EQUIV: return "equiv"; - case PIPE_LOGICOP_NOOP: return "noop"; - case PIPE_LOGICOP_OR_INVERTED: return "or-inverted"; - case PIPE_LOGICOP_COPY: return "copy"; - case PIPE_LOGICOP_OR_REVERSE: return "or-reverse"; - case PIPE_LOGICOP_OR: return "or"; - case PIPE_LOGICOP_SET: return "set"; - default: unreachable("Invalid logicop\n"); - } + switch (logicop) { + case PIPE_LOGICOP_CLEAR: + return "clear"; + case PIPE_LOGICOP_NOR: + return "nor"; + case PIPE_LOGICOP_AND_INVERTED: + return "and-inverted"; + case PIPE_LOGICOP_COPY_INVERTED: + return "copy-inverted"; + case PIPE_LOGICOP_AND_REVERSE: + return "and-reverse"; + case PIPE_LOGICOP_INVERT: + return "invert"; + case PIPE_LOGICOP_XOR: + return "xor"; + case PIPE_LOGICOP_NAND: + return "nand"; + case PIPE_LOGICOP_AND: + return "and"; + case PIPE_LOGICOP_EQUIV: + return "equiv"; + case PIPE_LOGICOP_NOOP: + return "noop"; + case PIPE_LOGICOP_OR_INVERTED: + return "or-inverted"; + case PIPE_LOGICOP_COPY: + return "copy"; + case PIPE_LOGICOP_OR_REVERSE: + return "or-reverse"; + case PIPE_LOGICOP_OR: + return "or"; + case PIPE_LOGICOP_SET: + return "set"; + default: + unreachable("Invalid logicop\n"); + } } static void -get_equation_str(const struct pan_blend_rt_state *rt_state, - char *str, unsigned len) +get_equation_str(const struct pan_blend_rt_state *rt_state, char *str, + unsigned len) { - const char *funcs[] = { - "add", "sub", "reverse_sub", "min", "max", - }; - const char *factors[] = { - "zero", "src_color", "src1_color", "dst_color", - "src_alpha", "src1_alpha", "dst_alpha", - "const_color", "const_alpha", "src_alpha_sat", - }; - int ret; + const char *funcs[] = { + "add", "sub", "reverse_sub", "min", "max", + }; + const char *factors[] = { + "zero", "src_color", "src1_color", "dst_color", "src_alpha", + "src1_alpha", "dst_alpha", "const_color", "const_alpha", "src_alpha_sat", + }; + int ret; - if (!rt_state->equation.blend_enable) { - ret = snprintf(str, len, "replace(%s%s%s%s)", - (rt_state->equation.color_mask & 1) ? "R" : "", - (rt_state->equation.color_mask & 2) ? 
"G" : "", - (rt_state->equation.color_mask & 4) ? "B" : "", - (rt_state->equation.color_mask & 8) ? "A" : ""); - assert(ret > 0); - return; - } + if (!rt_state->equation.blend_enable) { + ret = snprintf(str, len, "replace(%s%s%s%s)", + (rt_state->equation.color_mask & 1) ? "R" : "", + (rt_state->equation.color_mask & 2) ? "G" : "", + (rt_state->equation.color_mask & 4) ? "B" : "", + (rt_state->equation.color_mask & 8) ? "A" : ""); + assert(ret > 0); + return; + } - if (rt_state->equation.color_mask & 7) { - assert(rt_state->equation.rgb_func < ARRAY_SIZE(funcs)); - assert(rt_state->equation.rgb_src_factor < ARRAY_SIZE(factors)); - assert(rt_state->equation.rgb_dst_factor < ARRAY_SIZE(factors)); - ret = snprintf(str, len, "%s%s%s(func=%s,src_factor=%s%s,dst_factor=%s%s)%s", - (rt_state->equation.color_mask & 1) ? "R" : "", - (rt_state->equation.color_mask & 2) ? "G" : "", - (rt_state->equation.color_mask & 4) ? "B" : "", - funcs[rt_state->equation.rgb_func], - rt_state->equation.rgb_invert_src_factor ? "-" : "", - factors[rt_state->equation.rgb_src_factor], - rt_state->equation.rgb_invert_dst_factor ? "-" : "", - factors[rt_state->equation.rgb_dst_factor], - rt_state->equation.color_mask & 8 ? ";" : ""); - assert(ret > 0); - str += ret; - len -= ret; - } + if (rt_state->equation.color_mask & 7) { + assert(rt_state->equation.rgb_func < ARRAY_SIZE(funcs)); + assert(rt_state->equation.rgb_src_factor < ARRAY_SIZE(factors)); + assert(rt_state->equation.rgb_dst_factor < ARRAY_SIZE(factors)); + ret = + snprintf(str, len, "%s%s%s(func=%s,src_factor=%s%s,dst_factor=%s%s)%s", + (rt_state->equation.color_mask & 1) ? "R" : "", + (rt_state->equation.color_mask & 2) ? "G" : "", + (rt_state->equation.color_mask & 4) ? "B" : "", + funcs[rt_state->equation.rgb_func], + rt_state->equation.rgb_invert_src_factor ? "-" : "", + factors[rt_state->equation.rgb_src_factor], + rt_state->equation.rgb_invert_dst_factor ? "-" : "", + factors[rt_state->equation.rgb_dst_factor], + rt_state->equation.color_mask & 8 ? ";" : ""); + assert(ret > 0); + str += ret; + len -= ret; + } - if (rt_state->equation.color_mask & 8) { - assert(rt_state->equation.alpha_func < ARRAY_SIZE(funcs)); - assert(rt_state->equation.alpha_src_factor < ARRAY_SIZE(factors)); - assert(rt_state->equation.alpha_dst_factor < ARRAY_SIZE(factors)); - ret = snprintf(str, len, "A(func=%s,src_factor=%s%s,dst_factor=%s%s)", - funcs[rt_state->equation.alpha_func], - rt_state->equation.alpha_invert_src_factor ? "-" : "", - factors[rt_state->equation.alpha_src_factor], - rt_state->equation.alpha_invert_dst_factor ? "-" : "", - factors[rt_state->equation.alpha_dst_factor]); - assert(ret > 0); - str += ret; - len -= ret; - } + if (rt_state->equation.color_mask & 8) { + assert(rt_state->equation.alpha_func < ARRAY_SIZE(funcs)); + assert(rt_state->equation.alpha_src_factor < ARRAY_SIZE(factors)); + assert(rt_state->equation.alpha_dst_factor < ARRAY_SIZE(factors)); + ret = snprintf(str, len, "A(func=%s,src_factor=%s%s,dst_factor=%s%s)", + funcs[rt_state->equation.alpha_func], + rt_state->equation.alpha_invert_src_factor ? "-" : "", + factors[rt_state->equation.alpha_src_factor], + rt_state->equation.alpha_invert_dst_factor ? 
"-" : "", + factors[rt_state->equation.alpha_dst_factor]); + assert(ret > 0); + str += ret; + len -= ret; + } } static bool pan_inline_blend_constants(nir_builder *b, nir_instr *instr, void *data) { - if (instr->type != nir_instr_type_intrinsic) - return false; + if (instr->type != nir_instr_type_intrinsic) + return false; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_load_blend_const_color_rgba) - return false; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_load_blend_const_color_rgba) + return false; - float *floats = data; - const nir_const_value constants[4] = { - nir_const_value_for_float(floats[0], 32), - nir_const_value_for_float(floats[1], 32), - nir_const_value_for_float(floats[2], 32), - nir_const_value_for_float(floats[3], 32) - }; + float *floats = data; + const nir_const_value constants[4] = { + nir_const_value_for_float(floats[0], 32), + nir_const_value_for_float(floats[1], 32), + nir_const_value_for_float(floats[2], 32), + nir_const_value_for_float(floats[3], 32)}; - b->cursor = nir_after_instr(instr); - nir_ssa_def *constant = nir_build_imm(b, 4, 32, constants); - nir_ssa_def_rewrite_uses(&intr->dest.ssa, constant); - nir_instr_remove(instr); - return true; + b->cursor = nir_after_instr(instr); + nir_ssa_def *constant = nir_build_imm(b, 4, 32, constants); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, constant); + nir_instr_remove(instr); + return true; } nir_shader * GENX(pan_blend_create_shader)(const struct panfrost_device *dev, const struct pan_blend_state *state, - nir_alu_type src0_type, - nir_alu_type src1_type, + nir_alu_type src0_type, nir_alu_type src1_type, unsigned rt) { - const struct pan_blend_rt_state *rt_state = &state->rts[rt]; - char equation_str[128] = { 0 }; + const struct pan_blend_rt_state *rt_state = &state->rts[rt]; + char equation_str[128] = {0}; - get_equation_str(rt_state, equation_str, sizeof(equation_str)); + get_equation_str(rt_state, equation_str, sizeof(equation_str)); - nir_builder b = - nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, - GENX(pan_shader_get_compiler_options)(), - "pan_blend(rt=%d,fmt=%s,nr_samples=%d,%s=%s)", - rt, util_format_name(rt_state->format), - rt_state->nr_samples, - state->logicop_enable ? "logicop" : "equation", - state->logicop_enable ? - logicop_str(state->logicop_func) : equation_str); + nir_builder b = nir_builder_init_simple_shader( + MESA_SHADER_FRAGMENT, GENX(pan_shader_get_compiler_options)(), + "pan_blend(rt=%d,fmt=%s,nr_samples=%d,%s=%s)", rt, + util_format_name(rt_state->format), rt_state->nr_samples, + state->logicop_enable ? "logicop" : "equation", + state->logicop_enable ? logicop_str(state->logicop_func) : equation_str); - const struct util_format_description *format_desc = - util_format_description(rt_state->format); - nir_alu_type nir_type = pan_unpacked_type_for_format(format_desc); + const struct util_format_description *format_desc = + util_format_description(rt_state->format); + nir_alu_type nir_type = pan_unpacked_type_for_format(format_desc); - /* Bifrost/Valhall support 16-bit and 32-bit register formats for - * LD_TILE/ST_TILE/BLEND, but do not support 8-bit. Rather than making - * the fragment output 8-bit and inserting extra conversions in the - * compiler, promote the output to 16-bit. The larger size is still - * compatible with correct conversion semantics. 
- */ - if (PAN_ARCH >= 6 && nir_alu_type_get_type_size(nir_type) == 8) - nir_type = nir_alu_type_get_base_type(nir_type) | 16; + /* Bifrost/Valhall support 16-bit and 32-bit register formats for + * LD_TILE/ST_TILE/BLEND, but do not support 8-bit. Rather than making + * the fragment output 8-bit and inserting extra conversions in the + * compiler, promote the output to 16-bit. The larger size is still + * compatible with correct conversion semantics. + */ + if (PAN_ARCH >= 6 && nir_alu_type_get_type_size(nir_type) == 8) + nir_type = nir_alu_type_get_base_type(nir_type) | 16; - enum glsl_base_type glsl_type = nir_get_glsl_base_type_for_nir_type(nir_type); + enum glsl_base_type glsl_type = + nir_get_glsl_base_type_for_nir_type(nir_type); - nir_lower_blend_options options = { - .logicop_enable = state->logicop_enable, - .logicop_func = state->logicop_func, - .rt[0].colormask = rt_state->equation.color_mask, - .format[0] = rt_state->format, - }; + nir_lower_blend_options options = { + .logicop_enable = state->logicop_enable, + .logicop_func = state->logicop_func, + .rt[0].colormask = rt_state->equation.color_mask, + .format[0] = rt_state->format, + }; - if (!rt_state->equation.blend_enable) { - static const nir_lower_blend_channel replace = { - .func = BLEND_FUNC_ADD, - .src_factor = BLEND_FACTOR_ZERO, - .invert_src_factor = true, - .dst_factor = BLEND_FACTOR_ZERO, - .invert_dst_factor = false, - }; + if (!rt_state->equation.blend_enable) { + static const nir_lower_blend_channel replace = { + .func = BLEND_FUNC_ADD, + .src_factor = BLEND_FACTOR_ZERO, + .invert_src_factor = true, + .dst_factor = BLEND_FACTOR_ZERO, + .invert_dst_factor = false, + }; - options.rt[0].rgb = replace; - options.rt[0].alpha = replace; - } else { - options.rt[0].rgb.func = rt_state->equation.rgb_func; - options.rt[0].rgb.src_factor = rt_state->equation.rgb_src_factor; - options.rt[0].rgb.invert_src_factor = rt_state->equation.rgb_invert_src_factor; - options.rt[0].rgb.dst_factor = rt_state->equation.rgb_dst_factor; - options.rt[0].rgb.invert_dst_factor = rt_state->equation.rgb_invert_dst_factor; - options.rt[0].alpha.func = rt_state->equation.alpha_func; - options.rt[0].alpha.src_factor = rt_state->equation.alpha_src_factor; - options.rt[0].alpha.invert_src_factor = rt_state->equation.alpha_invert_src_factor; - options.rt[0].alpha.dst_factor = rt_state->equation.alpha_dst_factor; - options.rt[0].alpha.invert_dst_factor = rt_state->equation.alpha_invert_dst_factor; - } + options.rt[0].rgb = replace; + options.rt[0].alpha = replace; + } else { + options.rt[0].rgb.func = rt_state->equation.rgb_func; + options.rt[0].rgb.src_factor = rt_state->equation.rgb_src_factor; + options.rt[0].rgb.invert_src_factor = + rt_state->equation.rgb_invert_src_factor; + options.rt[0].rgb.dst_factor = rt_state->equation.rgb_dst_factor; + options.rt[0].rgb.invert_dst_factor = + rt_state->equation.rgb_invert_dst_factor; + options.rt[0].alpha.func = rt_state->equation.alpha_func; + options.rt[0].alpha.src_factor = rt_state->equation.alpha_src_factor; + options.rt[0].alpha.invert_src_factor = + rt_state->equation.alpha_invert_src_factor; + options.rt[0].alpha.dst_factor = rt_state->equation.alpha_dst_factor; + options.rt[0].alpha.invert_dst_factor = + rt_state->equation.alpha_invert_dst_factor; + } - nir_alu_type src_types[] = { src0_type ?: nir_type_float32, src1_type ?: nir_type_float32 }; + nir_alu_type src_types[] = {src0_type ?: nir_type_float32, + src1_type ?: nir_type_float32}; - /* HACK: workaround buggy TGSI shaders (u_blitter) */ - for 
(unsigned i = 0; i < ARRAY_SIZE(src_types); ++i) { - src_types[i] = nir_alu_type_get_base_type(nir_type) | - nir_alu_type_get_type_size(src_types[i]); - } + /* HACK: workaround buggy TGSI shaders (u_blitter) */ + for (unsigned i = 0; i < ARRAY_SIZE(src_types); ++i) { + src_types[i] = nir_alu_type_get_base_type(nir_type) | + nir_alu_type_get_type_size(src_types[i]); + } - nir_variable *c_src = - nir_variable_create(b.shader, nir_var_shader_in, - glsl_vector_type(nir_get_glsl_base_type_for_nir_type(src_types[0]), 4), - "gl_Color"); - c_src->data.location = VARYING_SLOT_COL0; - nir_variable *c_src1 = - nir_variable_create(b.shader, nir_var_shader_in, - glsl_vector_type(nir_get_glsl_base_type_for_nir_type(src_types[1]), 4), - "gl_Color1"); - c_src1->data.location = VARYING_SLOT_VAR0; - c_src1->data.driver_location = 1; - nir_variable *c_out = - nir_variable_create(b.shader, nir_var_shader_out, - glsl_vector_type(glsl_type, 4), - "gl_FragColor"); - c_out->data.location = FRAG_RESULT_DATA0; + nir_variable *c_src = nir_variable_create( + b.shader, nir_var_shader_in, + glsl_vector_type(nir_get_glsl_base_type_for_nir_type(src_types[0]), 4), + "gl_Color"); + c_src->data.location = VARYING_SLOT_COL0; + nir_variable *c_src1 = nir_variable_create( + b.shader, nir_var_shader_in, + glsl_vector_type(nir_get_glsl_base_type_for_nir_type(src_types[1]), 4), + "gl_Color1"); + c_src1->data.location = VARYING_SLOT_VAR0; + c_src1->data.driver_location = 1; + nir_variable *c_out = + nir_variable_create(b.shader, nir_var_shader_out, + glsl_vector_type(glsl_type, 4), "gl_FragColor"); + c_out->data.location = FRAG_RESULT_DATA0; - nir_ssa_def *s_src[] = {nir_load_var(&b, c_src), nir_load_var(&b, c_src1)}; + nir_ssa_def *s_src[] = {nir_load_var(&b, c_src), nir_load_var(&b, c_src1)}; - /* On Midgard, the blend shader is responsible for format conversion. - * As the OpenGL spec requires integer conversions to saturate, we must - * saturate ourselves here. On Bifrost and later, the conversion - * hardware handles this automatically. - */ - for (int i = 0; i < ARRAY_SIZE(s_src); ++i) { - nir_alu_type T = nir_alu_type_get_base_type(nir_type); - bool should_saturate = (PAN_ARCH <= 5) && (T != nir_type_float); - s_src[i] = nir_convert_with_rounding(&b, s_src[i], - src_types[i], nir_type, - nir_rounding_mode_undef, - should_saturate); - } + /* On Midgard, the blend shader is responsible for format conversion. + * As the OpenGL spec requires integer conversions to saturate, we must + * saturate ourselves here. On Bifrost and later, the conversion + * hardware handles this automatically. 
+ */ + for (int i = 0; i < ARRAY_SIZE(s_src); ++i) { + nir_alu_type T = nir_alu_type_get_base_type(nir_type); + bool should_saturate = (PAN_ARCH <= 5) && (T != nir_type_float); + s_src[i] = + nir_convert_with_rounding(&b, s_src[i], src_types[i], nir_type, + nir_rounding_mode_undef, should_saturate); + } - /* Build a trivial blend shader */ - nir_store_var(&b, c_out, s_src[0], 0xFF); + /* Build a trivial blend shader */ + nir_store_var(&b, c_out, s_src[0], 0xFF); - options.src1 = s_src[1]; + options.src1 = s_src[1]; - NIR_PASS_V(b.shader, nir_lower_blend, &options); - nir_shader_instructions_pass(b.shader, pan_inline_blend_constants, - nir_metadata_block_index | nir_metadata_dominance, - (void *) state->constants); + NIR_PASS_V(b.shader, nir_lower_blend, &options); + nir_shader_instructions_pass( + b.shader, pan_inline_blend_constants, + nir_metadata_block_index | nir_metadata_dominance, + (void *)state->constants); - return b.shader; + return b.shader; } #if PAN_ARCH >= 6 @@ -728,55 +725,55 @@ GENX(pan_blend_get_internal_desc)(const struct panfrost_device *dev, enum pipe_format fmt, unsigned rt, unsigned force_size, bool dithered) { - const struct util_format_description *desc = util_format_description(fmt); - uint64_t res; + const struct util_format_description *desc = util_format_description(fmt); + uint64_t res; - pan_pack(&res, INTERNAL_BLEND, cfg) { - cfg.mode = MALI_BLEND_MODE_OPAQUE; - cfg.fixed_function.num_comps = desc->nr_channels; - cfg.fixed_function.rt = rt; + pan_pack(&res, INTERNAL_BLEND, cfg) { + cfg.mode = MALI_BLEND_MODE_OPAQUE; + cfg.fixed_function.num_comps = desc->nr_channels; + cfg.fixed_function.rt = rt; - nir_alu_type T = pan_unpacked_type_for_format(desc); + nir_alu_type T = pan_unpacked_type_for_format(desc); - if (force_size) - T = nir_alu_type_get_base_type(T) | force_size; + if (force_size) + T = nir_alu_type_get_base_type(T) | force_size; - switch (T) { - case nir_type_float16: - cfg.fixed_function.conversion.register_format = - MALI_REGISTER_FILE_FORMAT_F16; - break; - case nir_type_float32: - cfg.fixed_function.conversion.register_format = - MALI_REGISTER_FILE_FORMAT_F32; - break; - case nir_type_int8: - case nir_type_int16: - cfg.fixed_function.conversion.register_format = - MALI_REGISTER_FILE_FORMAT_I16; - break; - case nir_type_int32: - cfg.fixed_function.conversion.register_format = - MALI_REGISTER_FILE_FORMAT_I32; - break; - case nir_type_uint8: - case nir_type_uint16: - cfg.fixed_function.conversion.register_format = - MALI_REGISTER_FILE_FORMAT_U16; - break; - case nir_type_uint32: - cfg.fixed_function.conversion.register_format = - MALI_REGISTER_FILE_FORMAT_U32; - break; - default: - unreachable("Invalid format"); - } + switch (T) { + case nir_type_float16: + cfg.fixed_function.conversion.register_format = + MALI_REGISTER_FILE_FORMAT_F16; + break; + case nir_type_float32: + cfg.fixed_function.conversion.register_format = + MALI_REGISTER_FILE_FORMAT_F32; + break; + case nir_type_int8: + case nir_type_int16: + cfg.fixed_function.conversion.register_format = + MALI_REGISTER_FILE_FORMAT_I16; + break; + case nir_type_int32: + cfg.fixed_function.conversion.register_format = + MALI_REGISTER_FILE_FORMAT_I32; + break; + case nir_type_uint8: + case nir_type_uint16: + cfg.fixed_function.conversion.register_format = + MALI_REGISTER_FILE_FORMAT_U16; + break; + case nir_type_uint32: + cfg.fixed_function.conversion.register_format = + MALI_REGISTER_FILE_FORMAT_U32; + break; + default: + unreachable("Invalid format"); + } - cfg.fixed_function.conversion.memory_format 
= - panfrost_format_to_bifrost_blend(dev, fmt, dithered); - } + cfg.fixed_function.conversion.memory_format = + panfrost_format_to_bifrost_blend(dev, fmt, dithered); + } - return res; + return res; } #endif @@ -784,92 +781,93 @@ struct pan_blend_shader_variant * GENX(pan_blend_get_shader_locked)(const struct panfrost_device *dev, const struct pan_blend_state *state, nir_alu_type src0_type, - nir_alu_type src1_type, - unsigned rt) + nir_alu_type src1_type, unsigned rt) { - struct pan_blend_shader_key key = { - .format = state->rts[rt].format, - .src0_type = src0_type, - .src1_type = src1_type, - .rt = rt, - .has_constants = pan_blend_constant_mask(state->rts[rt].equation) != 0, - .logicop_enable = state->logicop_enable, - .logicop_func = state->logicop_func, - .nr_samples = state->rts[rt].nr_samples, - .equation = state->rts[rt].equation, - }; + struct pan_blend_shader_key key = { + .format = state->rts[rt].format, + .src0_type = src0_type, + .src1_type = src1_type, + .rt = rt, + .has_constants = pan_blend_constant_mask(state->rts[rt].equation) != 0, + .logicop_enable = state->logicop_enable, + .logicop_func = state->logicop_func, + .nr_samples = state->rts[rt].nr_samples, + .equation = state->rts[rt].equation, + }; - /* Blend shaders should only be used for blending on Bifrost onwards */ - assert(dev->arch <= 5 || !pan_blend_is_opaque(state->rts[rt].equation)); - assert(state->rts[rt].equation.color_mask != 0); + /* Blend shaders should only be used for blending on Bifrost onwards */ + assert(dev->arch <= 5 || !pan_blend_is_opaque(state->rts[rt].equation)); + assert(state->rts[rt].equation.color_mask != 0); - struct hash_entry *he = _mesa_hash_table_search(dev->blend_shaders.shaders, &key); - struct pan_blend_shader *shader = he ? he->data : NULL; + struct hash_entry *he = + _mesa_hash_table_search(dev->blend_shaders.shaders, &key); + struct pan_blend_shader *shader = he ? 
he->data : NULL; - if (!shader) { - shader = rzalloc(dev->blend_shaders.shaders, struct pan_blend_shader); - shader->key = key; - list_inithead(&shader->variants); - _mesa_hash_table_insert(dev->blend_shaders.shaders, &shader->key, shader); - } + if (!shader) { + shader = rzalloc(dev->blend_shaders.shaders, struct pan_blend_shader); + shader->key = key; + list_inithead(&shader->variants); + _mesa_hash_table_insert(dev->blend_shaders.shaders, &shader->key, shader); + } - list_for_each_entry(struct pan_blend_shader_variant, iter, - &shader->variants, node) { - if (!key.has_constants || - !memcmp(iter->constants, state->constants, sizeof(iter->constants))) { - return iter; - } - } + list_for_each_entry(struct pan_blend_shader_variant, iter, &shader->variants, + node) { + if (!key.has_constants || + !memcmp(iter->constants, state->constants, sizeof(iter->constants))) { + return iter; + } + } - struct pan_blend_shader_variant *variant = NULL; + struct pan_blend_shader_variant *variant = NULL; - if (shader->nvariants < PAN_BLEND_SHADER_MAX_VARIANTS) { - variant = rzalloc(shader, struct pan_blend_shader_variant); - util_dynarray_init(&variant->binary, variant); - list_add(&variant->node, &shader->variants); - shader->nvariants++; - } else { - variant = list_last_entry(&shader->variants, struct pan_blend_shader_variant, node); - list_del(&variant->node); - list_add(&variant->node, &shader->variants); - util_dynarray_clear(&variant->binary); - } + if (shader->nvariants < PAN_BLEND_SHADER_MAX_VARIANTS) { + variant = rzalloc(shader, struct pan_blend_shader_variant); + util_dynarray_init(&variant->binary, variant); + list_add(&variant->node, &shader->variants); + shader->nvariants++; + } else { + variant = list_last_entry(&shader->variants, + struct pan_blend_shader_variant, node); + list_del(&variant->node); + list_add(&variant->node, &shader->variants); + util_dynarray_clear(&variant->binary); + } - memcpy(variant->constants, state->constants, sizeof(variant->constants)); + memcpy(variant->constants, state->constants, sizeof(variant->constants)); - nir_shader *nir = - GENX(pan_blend_create_shader)(dev, state, src0_type, src1_type, rt); + nir_shader *nir = + GENX(pan_blend_create_shader)(dev, state, src0_type, src1_type, rt); - /* Compile the NIR shader */ - struct panfrost_compile_inputs inputs = { - .gpu_id = dev->gpu_id, - .is_blend = true, - .blend.rt = shader->key.rt, - .blend.nr_samples = key.nr_samples, - .fixed_sysval_ubo = -1, - .rt_formats = { key.format }, - }; + /* Compile the NIR shader */ + struct panfrost_compile_inputs inputs = { + .gpu_id = dev->gpu_id, + .is_blend = true, + .blend.rt = shader->key.rt, + .blend.nr_samples = key.nr_samples, + .fixed_sysval_ubo = -1, + .rt_formats = {key.format}, + }; #if PAN_ARCH >= 6 - inputs.blend.bifrost_blend_desc = - GENX(pan_blend_get_internal_desc)(dev, key.format, key.rt, 0, false); + inputs.blend.bifrost_blend_desc = + GENX(pan_blend_get_internal_desc)(dev, key.format, key.rt, 0, false); #endif - struct pan_shader_info info; + struct pan_shader_info info; - GENX(pan_shader_compile)(nir, &inputs, &variant->binary, &info); + GENX(pan_shader_compile)(nir, &inputs, &variant->binary, &info); - /* Blend shaders can't have sysvals */ - assert(info.sysvals.sysval_count == 0); + /* Blend shaders can't have sysvals */ + assert(info.sysvals.sysval_count == 0); - variant->work_reg_count = info.work_reg_count; + variant->work_reg_count = info.work_reg_count; #if PAN_ARCH <= 5 - variant->first_tag = info.midgard.first_tag; + variant->first_tag = 
info.midgard.first_tag; #endif - ralloc_free(nir); + ralloc_free(nir); - return variant; + return variant; } #endif /* ifndef PAN_ARCH */ diff --git a/src/panfrost/lib/pan_blend.h b/src/panfrost/lib/pan_blend.h index 2dec9525c4a..8b826d41b42 100644 --- a/src/panfrost/lib/pan_blend.h +++ b/src/panfrost/lib/pan_blend.h @@ -27,10 +27,10 @@ #include "genxml/gen_macros.h" -#include "util/u_dynarray.h" -#include "util/format/u_format.h" -#include "compiler/shader_enums.h" #include "compiler/nir/nir.h" +#include "compiler/shader_enums.h" +#include "util/format/u_format.h" +#include "util/u_dynarray.h" #include "panfrost/util/pan_ir.h" @@ -38,84 +38,78 @@ struct MALI_BLEND_EQUATION; struct panfrost_device; struct pan_blend_equation { - unsigned blend_enable : 1; - enum blend_func rgb_func : 3; - unsigned rgb_invert_src_factor : 1; - enum blend_factor rgb_src_factor : 4; - unsigned rgb_invert_dst_factor : 1; - enum blend_factor rgb_dst_factor : 4; - enum blend_func alpha_func : 3; - unsigned alpha_invert_src_factor : 1; - enum blend_factor alpha_src_factor : 4; - unsigned alpha_invert_dst_factor : 1; - enum blend_factor alpha_dst_factor : 4; - unsigned color_mask : 4; + unsigned blend_enable : 1; + enum blend_func rgb_func : 3; + unsigned rgb_invert_src_factor : 1; + enum blend_factor rgb_src_factor : 4; + unsigned rgb_invert_dst_factor : 1; + enum blend_factor rgb_dst_factor : 4; + enum blend_func alpha_func : 3; + unsigned alpha_invert_src_factor : 1; + enum blend_factor alpha_src_factor : 4; + unsigned alpha_invert_dst_factor : 1; + enum blend_factor alpha_dst_factor : 4; + unsigned color_mask : 4; }; struct pan_blend_rt_state { - /* RT format */ - enum pipe_format format; + /* RT format */ + enum pipe_format format; - /* Number of samples */ - unsigned nr_samples; + /* Number of samples */ + unsigned nr_samples; - struct pan_blend_equation equation; + struct pan_blend_equation equation; }; struct pan_blend_state { - bool logicop_enable; - enum pipe_logicop logicop_func; - float constants[4]; - unsigned rt_count; - struct pan_blend_rt_state rts[8]; + bool logicop_enable; + enum pipe_logicop logicop_func; + float constants[4]; + unsigned rt_count; + struct pan_blend_rt_state rts[8]; }; struct pan_blend_shader_key { - enum pipe_format format; - nir_alu_type src0_type, src1_type; - uint32_t rt : 3; - uint32_t has_constants : 1; - uint32_t logicop_enable : 1; - uint32_t logicop_func:4; - uint32_t nr_samples : 5; - uint32_t padding : 18; - struct pan_blend_equation equation; + enum pipe_format format; + nir_alu_type src0_type, src1_type; + uint32_t rt : 3; + uint32_t has_constants : 1; + uint32_t logicop_enable : 1; + uint32_t logicop_func : 4; + uint32_t nr_samples : 5; + uint32_t padding : 18; + struct pan_blend_equation equation; }; struct pan_blend_shader_variant { - struct list_head node; - float constants[4]; - struct util_dynarray binary; - unsigned first_tag; - unsigned work_reg_count; + struct list_head node; + float constants[4]; + struct util_dynarray binary; + unsigned first_tag; + unsigned work_reg_count; }; #define PAN_BLEND_SHADER_MAX_VARIANTS 32 struct pan_blend_shader { - struct pan_blend_shader_key key; - unsigned nvariants; - struct list_head variants; + struct pan_blend_shader_key key; + unsigned nvariants; + struct list_head variants; }; -bool -pan_blend_reads_dest(const struct pan_blend_equation eq); +bool pan_blend_reads_dest(const struct pan_blend_equation eq); -bool -pan_blend_can_fixed_function(const struct pan_blend_equation equation, - bool supports_2src); +bool 
pan_blend_can_fixed_function(const struct pan_blend_equation equation, + bool supports_2src); -bool -pan_blend_is_opaque(const struct pan_blend_equation eq); +bool pan_blend_is_opaque(const struct pan_blend_equation eq); -bool -pan_blend_alpha_zero_nop(const struct pan_blend_equation eq); +bool pan_blend_alpha_zero_nop(const struct pan_blend_equation eq); -bool -pan_blend_alpha_one_store(const struct pan_blend_equation eq); +bool pan_blend_alpha_one_store(const struct pan_blend_equation eq); -unsigned -pan_blend_constant_mask(const struct pan_blend_equation eq); +unsigned pan_blend_constant_mask(const struct pan_blend_equation eq); /* Fixed-function blending only supports a single constant, so if multiple bits * are set in constant_mask, the constants must match. Therefore we may pick @@ -124,7 +118,7 @@ pan_blend_constant_mask(const struct pan_blend_equation eq); static inline float pan_blend_get_constant(unsigned mask, const float *constants) { - return mask ? constants[ffs(mask) - 1] : 0.0; + return mask ? constants[ffs(mask) - 1] : 0.0; } /* v6 doesn't support blend constants in FF blend equations whatsoever, and v7 @@ -134,7 +128,7 @@ pan_blend_get_constant(unsigned mask, const float *constants) static inline bool pan_blend_supports_constant(unsigned arch, unsigned rt) { - return !((arch == 6) || (arch == 7 && rt > 0)); + return !((arch == 6) || (arch == 7 && rt > 0)); } /* The SOURCE_2 value is new in Bifrost */ @@ -142,50 +136,39 @@ pan_blend_supports_constant(unsigned arch, unsigned rt) static inline bool pan_blend_supports_2src(unsigned arch) { - return (arch >= 6); + return (arch >= 6); } -bool -pan_blend_is_homogenous_constant(unsigned mask, const float *constants); +bool pan_blend_is_homogenous_constant(unsigned mask, const float *constants); -void -pan_blend_to_fixed_function_equation(const struct pan_blend_equation eq, - struct MALI_BLEND_EQUATION *equation); +void pan_blend_to_fixed_function_equation(const struct pan_blend_equation eq, + struct MALI_BLEND_EQUATION *equation); -uint32_t -pan_pack_blend(const struct pan_blend_equation equation); +uint32_t pan_pack_blend(const struct pan_blend_equation equation); -void -pan_blend_shaders_init(struct panfrost_device *dev); +void pan_blend_shaders_init(struct panfrost_device *dev); -void -pan_blend_shaders_cleanup(struct panfrost_device *dev); +void pan_blend_shaders_cleanup(struct panfrost_device *dev); #ifdef PAN_ARCH -nir_shader * -GENX(pan_blend_create_shader)(const struct panfrost_device *dev, - const struct pan_blend_state *state, - nir_alu_type src0_type, - nir_alu_type src1_type, - unsigned rt); +nir_shader *GENX(pan_blend_create_shader)(const struct panfrost_device *dev, + const struct pan_blend_state *state, + nir_alu_type src0_type, + nir_alu_type src1_type, unsigned rt); #if PAN_ARCH >= 6 -uint64_t -GENX(pan_blend_get_internal_desc)(const struct panfrost_device *dev, - enum pipe_format fmt, unsigned rt, - unsigned force_size, bool dithered); +uint64_t GENX(pan_blend_get_internal_desc)(const struct panfrost_device *dev, + enum pipe_format fmt, unsigned rt, + unsigned force_size, bool dithered); #endif /* Take blend_shaders.lock before calling this function and release it when * you're done with the shader variant object. 
*/ -struct pan_blend_shader_variant * -GENX(pan_blend_get_shader_locked)(const struct panfrost_device *dev, - const struct pan_blend_state *state, - nir_alu_type src0_type, - nir_alu_type src1_type, - unsigned rt); +struct pan_blend_shader_variant *GENX(pan_blend_get_shader_locked)( + const struct panfrost_device *dev, const struct pan_blend_state *state, + nir_alu_type src0_type, nir_alu_type src1_type, unsigned rt); #endif #endif diff --git a/src/panfrost/lib/pan_blitter.c b/src/panfrost/lib/pan_blitter.c index e2e2342b5e4..2705bf6acc1 100644 --- a/src/panfrost/lib/pan_blitter.c +++ b/src/panfrost/lib/pan_blitter.c @@ -25,18 +25,18 @@ * Boris Brezillon */ +#include "pan_blitter.h" #include #include +#include "compiler/nir/nir_builder.h" +#include "util/u_math.h" #include "pan_blend.h" -#include "pan_blitter.h" #include "pan_cs.h" #include "pan_encoder.h" #include "pan_pool.h" -#include "pan_shader.h" #include "pan_scoreboard.h" +#include "pan_shader.h" #include "pan_texture.h" -#include "compiler/nir/nir_builder.h" -#include "util/u_math.h" #if PAN_ARCH >= 6 /* On Midgard, the native blit infrastructure (via MFBD preloads) is broken or @@ -50,153 +50,151 @@ static enum mali_register_file_format blit_type_to_reg_fmt(nir_alu_type in) { - switch (in) { - case nir_type_float32: - return MALI_REGISTER_FILE_FORMAT_F32; - case nir_type_int32: - return MALI_REGISTER_FILE_FORMAT_I32; - case nir_type_uint32: - return MALI_REGISTER_FILE_FORMAT_U32; - default: - unreachable("Invalid blit type"); - } + switch (in) { + case nir_type_float32: + return MALI_REGISTER_FILE_FORMAT_F32; + case nir_type_int32: + return MALI_REGISTER_FILE_FORMAT_I32; + case nir_type_uint32: + return MALI_REGISTER_FILE_FORMAT_U32; + default: + unreachable("Invalid blit type"); + } } #endif struct pan_blit_surface { - gl_frag_result loc : 4; - nir_alu_type type : 8; - enum mali_texture_dimension dim : 2; - bool array : 1; - unsigned src_samples: 5; - unsigned dst_samples: 5; + gl_frag_result loc : 4; + nir_alu_type type : 8; + enum mali_texture_dimension dim : 2; + bool array : 1; + unsigned src_samples : 5; + unsigned dst_samples : 5; }; struct pan_blit_shader_key { - struct pan_blit_surface surfaces[8]; + struct pan_blit_surface surfaces[8]; }; struct pan_blit_shader_data { - struct pan_blit_shader_key key; - struct pan_shader_info info; - mali_ptr address; - unsigned blend_ret_offsets[8]; - nir_alu_type blend_types[8]; + struct pan_blit_shader_key key; + struct pan_shader_info info; + mali_ptr address; + unsigned blend_ret_offsets[8]; + nir_alu_type blend_types[8]; }; struct pan_blit_blend_shader_key { - enum pipe_format format; - nir_alu_type type; - unsigned rt : 3; - unsigned nr_samples : 5; - unsigned pad : 24; + enum pipe_format format; + nir_alu_type type; + unsigned rt : 3; + unsigned nr_samples : 5; + unsigned pad : 24; }; struct pan_blit_blend_shader_data { - struct pan_blit_blend_shader_key key; - mali_ptr address; + struct pan_blit_blend_shader_key key; + mali_ptr address; }; struct pan_blit_rsd_key { - struct { - enum pipe_format format; - nir_alu_type type : 8; - unsigned src_samples : 5; - unsigned dst_samples : 5; - enum mali_texture_dimension dim : 2; - bool array : 1; - } rts[8], z, s; + struct { + enum pipe_format format; + nir_alu_type type : 8; + unsigned src_samples : 5; + unsigned dst_samples : 5; + enum mali_texture_dimension dim : 2; + bool array : 1; + } rts[8], z, s; }; struct pan_blit_rsd_data { - struct pan_blit_rsd_key key; - mali_ptr address; + struct pan_blit_rsd_key key; + mali_ptr address; }; 
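For reference, the locking contract stated above for GENX(pan_blend_get_shader_locked) ("take blend_shaders.lock before calling ... release it when you're done with the shader variant object") is the pattern the call sites in this patch follow. A minimal sketch of that caller-side sequence, not part of the patch itself, with hypothetical locals (dev, blend_state, type, rt, shader_ptr) standing in for the real arguments, and assuming the pan_blend.h / pan_pool.h declarations already in scope:

   pthread_mutex_lock(&dev->blend_shaders.lock);

   struct pan_blend_shader_variant *b = GENX(pan_blend_get_shader_locked)(
      dev, &blend_state, type, nir_type_float32 /* src1 unused */, rt);

   /* Use the variant while the lock is still held: copy its binary out to a
    * GPU-visible pool, since the variant object is only guaranteed to stay
    * valid until the lock is released. */
   struct panfrost_ptr bin =
      pan_pool_alloc_aligned(dev->blitter.shaders.pool, b->binary.size, 64);
   memcpy(bin.cpu, b->binary.data, b->binary.size);
   mali_ptr shader_ptr = bin.gpu | b->first_tag;

   pthread_mutex_unlock(&dev->blend_shaders.lock);

pan_blitter_get_blend_shaders() further down in this file wraps exactly this sequence and additionally caches the resulting address in dev->blitter.shaders.blend, keyed by struct pan_blit_blend_shader_key.
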
#if PAN_ARCH >= 5 static void -pan_blitter_emit_blend(const struct panfrost_device *dev, - unsigned rt, +pan_blitter_emit_blend(const struct panfrost_device *dev, unsigned rt, const struct pan_image_view *iview, const struct pan_blit_shader_data *blit_shader, - mali_ptr blend_shader, - void *out) + mali_ptr blend_shader, void *out) { - assert(blend_shader == 0 || PAN_ARCH <= 5); + assert(blend_shader == 0 || PAN_ARCH <= 5); - pan_pack(out, BLEND, cfg) { - if (!iview) { - cfg.enable = false; + pan_pack(out, BLEND, cfg) { + if (!iview) { + cfg.enable = false; #if PAN_ARCH >= 6 - cfg.internal.mode = MALI_BLEND_MODE_OFF; + cfg.internal.mode = MALI_BLEND_MODE_OFF; #endif - continue; - } + continue; + } - cfg.round_to_fb_precision = true; - cfg.srgb = util_format_is_srgb(iview->format); + cfg.round_to_fb_precision = true; + cfg.srgb = util_format_is_srgb(iview->format); #if PAN_ARCH >= 6 - cfg.internal.mode = MALI_BLEND_MODE_OPAQUE; + cfg.internal.mode = MALI_BLEND_MODE_OPAQUE; #endif - if (!blend_shader) { - cfg.equation.rgb.a = MALI_BLEND_OPERAND_A_SRC; - cfg.equation.rgb.b = MALI_BLEND_OPERAND_B_SRC; - cfg.equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO; - cfg.equation.alpha.a = MALI_BLEND_OPERAND_A_SRC; - cfg.equation.alpha.b = MALI_BLEND_OPERAND_B_SRC; - cfg.equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO; - cfg.equation.color_mask = 0xf; + if (!blend_shader) { + cfg.equation.rgb.a = MALI_BLEND_OPERAND_A_SRC; + cfg.equation.rgb.b = MALI_BLEND_OPERAND_B_SRC; + cfg.equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO; + cfg.equation.alpha.a = MALI_BLEND_OPERAND_A_SRC; + cfg.equation.alpha.b = MALI_BLEND_OPERAND_B_SRC; + cfg.equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO; + cfg.equation.color_mask = 0xf; #if PAN_ARCH >= 6 - nir_alu_type type = blit_shader->key.surfaces[rt].type; + nir_alu_type type = blit_shader->key.surfaces[rt].type; - cfg.internal.fixed_function.num_comps = 4; - cfg.internal.fixed_function.conversion.memory_format = - panfrost_format_to_bifrost_blend(dev, iview->format, false); - cfg.internal.fixed_function.conversion.register_format = - blit_type_to_reg_fmt(type); + cfg.internal.fixed_function.num_comps = 4; + cfg.internal.fixed_function.conversion.memory_format = + panfrost_format_to_bifrost_blend(dev, iview->format, false); + cfg.internal.fixed_function.conversion.register_format = + blit_type_to_reg_fmt(type); - cfg.internal.fixed_function.rt = rt; + cfg.internal.fixed_function.rt = rt; #endif - } else { + } else { #if PAN_ARCH <= 5 - cfg.blend_shader = true; - cfg.shader_pc = blend_shader; + cfg.blend_shader = true; + cfg.shader_pc = blend_shader; #endif - } - } + } + } } #endif struct pan_blitter_views { - unsigned rt_count; - const struct pan_image_view *src_rts[8]; - const struct pan_image_view *dst_rts[8]; - const struct pan_image_view *src_z; - const struct pan_image_view *dst_z; - const struct pan_image_view *src_s; - const struct pan_image_view *dst_s; + unsigned rt_count; + const struct pan_image_view *src_rts[8]; + const struct pan_image_view *dst_rts[8]; + const struct pan_image_view *src_z; + const struct pan_image_view *dst_z; + const struct pan_image_view *src_s; + const struct pan_image_view *dst_s; }; static bool pan_blitter_is_ms(struct pan_blitter_views *views) { - for (unsigned i = 0; i < views->rt_count; i++) { - if (views->dst_rts[i]) { - if (views->dst_rts[i]->image->layout.nr_samples > 1) - return true; - } - } + for (unsigned i = 0; i < views->rt_count; i++) { + if (views->dst_rts[i]) { + if (views->dst_rts[i]->image->layout.nr_samples > 1) + return true; + } + } - 
if (views->dst_z && views->dst_z->image->layout.nr_samples > 1) - return true; + if (views->dst_z && views->dst_z->image->layout.nr_samples > 1) + return true; - if (views->dst_s && views->dst_s->image->layout.nr_samples > 1) - return true; + if (views->dst_s && views->dst_s->image->layout.nr_samples > 1) + return true; - return false; + return false; } #if PAN_ARCH >= 5 @@ -204,17 +202,15 @@ static void pan_blitter_emit_blends(const struct panfrost_device *dev, const struct pan_blit_shader_data *blit_shader, struct pan_blitter_views *views, - mali_ptr *blend_shaders, - void *out) + mali_ptr *blend_shaders, void *out) { - for (unsigned i = 0; i < MAX2(views->rt_count, 1); ++i) { - void *dest = out + pan_size(BLEND) * i; - const struct pan_image_view *rt_view = views->dst_rts[i]; - mali_ptr blend_shader = blend_shaders ? blend_shaders[i] : 0; + for (unsigned i = 0; i < MAX2(views->rt_count, 1); ++i) { + void *dest = out + pan_size(BLEND) * i; + const struct pan_image_view *rt_view = views->dst_rts[i]; + mali_ptr blend_shader = blend_shaders ? blend_shaders[i] : 0; - pan_blitter_emit_blend(dev, i, rt_view, blit_shader, - blend_shader, dest); - } + pan_blitter_emit_blend(dev, i, rt_view, blit_shader, blend_shader, dest); + } } #endif @@ -222,169 +218,163 @@ pan_blitter_emit_blends(const struct panfrost_device *dev, static void pan_blitter_emit_rsd(const struct panfrost_device *dev, const struct pan_blit_shader_data *blit_shader, - struct pan_blitter_views *views, - mali_ptr *blend_shaders, + struct pan_blitter_views *views, mali_ptr *blend_shaders, void *out) { - UNUSED bool zs = (views->dst_z || views->dst_s); - bool ms = pan_blitter_is_ms(views); + UNUSED bool zs = (views->dst_z || views->dst_s); + bool ms = pan_blitter_is_ms(views); - pan_pack(out, RENDERER_STATE, cfg) { - assert(blit_shader->address); - pan_shader_prepare_rsd(&blit_shader->info, blit_shader->address, &cfg); + pan_pack(out, RENDERER_STATE, cfg) { + assert(blit_shader->address); + pan_shader_prepare_rsd(&blit_shader->info, blit_shader->address, &cfg); - cfg.multisample_misc.sample_mask = 0xFFFF; - cfg.multisample_misc.multisample_enable = ms; - cfg.multisample_misc.evaluate_per_sample = ms; - cfg.multisample_misc.depth_write_mask = views->dst_z != NULL; - cfg.multisample_misc.depth_function = MALI_FUNC_ALWAYS; + cfg.multisample_misc.sample_mask = 0xFFFF; + cfg.multisample_misc.multisample_enable = ms; + cfg.multisample_misc.evaluate_per_sample = ms; + cfg.multisample_misc.depth_write_mask = views->dst_z != NULL; + cfg.multisample_misc.depth_function = MALI_FUNC_ALWAYS; - cfg.stencil_mask_misc.stencil_enable = views->dst_s != NULL; - cfg.stencil_mask_misc.stencil_mask_front = 0xFF; - cfg.stencil_mask_misc.stencil_mask_back = 0xFF; - cfg.stencil_front.compare_function = MALI_FUNC_ALWAYS; - cfg.stencil_front.stencil_fail = MALI_STENCIL_OP_REPLACE; - cfg.stencil_front.depth_fail = MALI_STENCIL_OP_REPLACE; - cfg.stencil_front.depth_pass = MALI_STENCIL_OP_REPLACE; - cfg.stencil_front.mask = 0xFF; - cfg.stencil_back = cfg.stencil_front; + cfg.stencil_mask_misc.stencil_enable = views->dst_s != NULL; + cfg.stencil_mask_misc.stencil_mask_front = 0xFF; + cfg.stencil_mask_misc.stencil_mask_back = 0xFF; + cfg.stencil_front.compare_function = MALI_FUNC_ALWAYS; + cfg.stencil_front.stencil_fail = MALI_STENCIL_OP_REPLACE; + cfg.stencil_front.depth_fail = MALI_STENCIL_OP_REPLACE; + cfg.stencil_front.depth_pass = MALI_STENCIL_OP_REPLACE; + cfg.stencil_front.mask = 0xFF; + cfg.stencil_back = cfg.stencil_front; #if PAN_ARCH >= 6 - if (zs) { - 
/* Writing Z/S requires late updates */ - cfg.properties.zs_update_operation = - MALI_PIXEL_KILL_FORCE_LATE; - cfg.properties.pixel_kill_operation = - MALI_PIXEL_KILL_FORCE_LATE; - } else { - /* Skipping ATEST requires forcing Z/S */ - cfg.properties.zs_update_operation = - MALI_PIXEL_KILL_STRONG_EARLY; - cfg.properties.pixel_kill_operation = - MALI_PIXEL_KILL_FORCE_EARLY; - } + if (zs) { + /* Writing Z/S requires late updates */ + cfg.properties.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE; + cfg.properties.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_LATE; + } else { + /* Skipping ATEST requires forcing Z/S */ + cfg.properties.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY; + cfg.properties.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY; + } - /* However, while shaders writing Z/S can normally be killed, on v6 - * for frame shaders it can cause GPU timeouts, so only allow colour - * blit shaders to be killed. */ - cfg.properties.allow_forward_pixel_to_kill = !zs; + /* However, while shaders writing Z/S can normally be killed, on v6 + * for frame shaders it can cause GPU timeouts, so only allow colour + * blit shaders to be killed. */ + cfg.properties.allow_forward_pixel_to_kill = !zs; - if (PAN_ARCH == 6) - cfg.properties.allow_forward_pixel_to_be_killed = !zs; + if (PAN_ARCH == 6) + cfg.properties.allow_forward_pixel_to_be_killed = !zs; #else - mali_ptr blend_shader = blend_shaders ? - panfrost_last_nonnull(blend_shaders, MAX2(views->rt_count, 1)) : 0; + mali_ptr blend_shader = + blend_shaders + ? panfrost_last_nonnull(blend_shaders, MAX2(views->rt_count, 1)) + : 0; - cfg.properties.work_register_count = 4; - cfg.properties.force_early_z = !zs; - cfg.stencil_mask_misc.alpha_test_compare_function = MALI_FUNC_ALWAYS; + cfg.properties.work_register_count = 4; + cfg.properties.force_early_z = !zs; + cfg.stencil_mask_misc.alpha_test_compare_function = MALI_FUNC_ALWAYS; - /* Set even on v5 for erratum workaround */ + /* Set even on v5 for erratum workaround */ #if PAN_ARCH == 5 - cfg.legacy_blend_shader = blend_shader; + cfg.legacy_blend_shader = blend_shader; #else - cfg.blend_shader = blend_shader; - cfg.stencil_mask_misc.write_enable = true; - cfg.stencil_mask_misc.dither_disable = true; - cfg.multisample_misc.blend_shader = !!blend_shader; - cfg.blend_shader = blend_shader; - if (!cfg.multisample_misc.blend_shader) { - cfg.blend_equation.rgb.a = MALI_BLEND_OPERAND_A_SRC; - cfg.blend_equation.rgb.b = MALI_BLEND_OPERAND_B_SRC; - cfg.blend_equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO; - cfg.blend_equation.alpha.a = MALI_BLEND_OPERAND_A_SRC; - cfg.blend_equation.alpha.b = MALI_BLEND_OPERAND_B_SRC; - cfg.blend_equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO; - cfg.blend_constant = 0; + cfg.blend_shader = blend_shader; + cfg.stencil_mask_misc.write_enable = true; + cfg.stencil_mask_misc.dither_disable = true; + cfg.multisample_misc.blend_shader = !!blend_shader; + cfg.blend_shader = blend_shader; + if (!cfg.multisample_misc.blend_shader) { + cfg.blend_equation.rgb.a = MALI_BLEND_OPERAND_A_SRC; + cfg.blend_equation.rgb.b = MALI_BLEND_OPERAND_B_SRC; + cfg.blend_equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO; + cfg.blend_equation.alpha.a = MALI_BLEND_OPERAND_A_SRC; + cfg.blend_equation.alpha.b = MALI_BLEND_OPERAND_B_SRC; + cfg.blend_equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO; + cfg.blend_constant = 0; - if (views->dst_rts[0] != NULL) { - cfg.stencil_mask_misc.srgb = - util_format_is_srgb(views->dst_rts[0]->format); - cfg.blend_equation.color_mask = 0xf; - } - } + if (views->dst_rts[0] != 
NULL) { + cfg.stencil_mask_misc.srgb = + util_format_is_srgb(views->dst_rts[0]->format); + cfg.blend_equation.color_mask = 0xf; + } + } #endif #endif - } + } #if PAN_ARCH >= 5 - pan_blitter_emit_blends(dev, blit_shader, views, blend_shaders, - out + pan_size(RENDERER_STATE)); + pan_blitter_emit_blends(dev, blit_shader, views, blend_shaders, + out + pan_size(RENDERER_STATE)); #endif } #endif static void -pan_blitter_get_blend_shaders(struct panfrost_device *dev, - unsigned rt_count, +pan_blitter_get_blend_shaders(struct panfrost_device *dev, unsigned rt_count, const struct pan_image_view **rts, const struct pan_blit_shader_data *blit_shader, mali_ptr *blend_shaders) { #if PAN_ARCH <= 5 - if (!rt_count) - return; + if (!rt_count) + return; - struct pan_blend_state blend_state = { - .rt_count = rt_count, - }; + struct pan_blend_state blend_state = { + .rt_count = rt_count, + }; - for (unsigned i = 0; i < rt_count; i++) { - if (!rts[i] || panfrost_blendable_formats_v7[rts[i]->format].internal) - continue; + for (unsigned i = 0; i < rt_count; i++) { + if (!rts[i] || panfrost_blendable_formats_v7[rts[i]->format].internal) + continue; - struct pan_blit_blend_shader_key key = { - .format = rts[i]->format, - .rt = i, - .nr_samples = rts[i]->image->layout.nr_samples, - .type = blit_shader->blend_types[i], - }; + struct pan_blit_blend_shader_key key = { + .format = rts[i]->format, + .rt = i, + .nr_samples = rts[i]->image->layout.nr_samples, + .type = blit_shader->blend_types[i], + }; - pthread_mutex_lock(&dev->blitter.shaders.lock); - struct hash_entry *he = - _mesa_hash_table_search(dev->blitter.shaders.blend, &key); - struct pan_blit_blend_shader_data *blend_shader = he ? he->data : NULL; - if (blend_shader) { - blend_shaders[i] = blend_shader->address; - pthread_mutex_unlock(&dev->blitter.shaders.lock); - continue; - } + pthread_mutex_lock(&dev->blitter.shaders.lock); + struct hash_entry *he = + _mesa_hash_table_search(dev->blitter.shaders.blend, &key); + struct pan_blit_blend_shader_data *blend_shader = he ? 
he->data : NULL; + if (blend_shader) { + blend_shaders[i] = blend_shader->address; + pthread_mutex_unlock(&dev->blitter.shaders.lock); + continue; + } - blend_shader = rzalloc(dev->blitter.shaders.blend, - struct pan_blit_blend_shader_data); - blend_shader->key = key; + blend_shader = + rzalloc(dev->blitter.shaders.blend, struct pan_blit_blend_shader_data); + blend_shader->key = key; - blend_state.rts[i] = (struct pan_blend_rt_state) { - .format = rts[i]->format, - .nr_samples = rts[i]->image->layout.nr_samples, - .equation = { - .blend_enable = false, - .color_mask = 0xf, - }, - }; + blend_state.rts[i] = (struct pan_blend_rt_state){ + .format = rts[i]->format, + .nr_samples = rts[i]->image->layout.nr_samples, + .equation = + { + .blend_enable = false, + .color_mask = 0xf, + }, + }; - pthread_mutex_lock(&dev->blend_shaders.lock); - struct pan_blend_shader_variant *b = - GENX(pan_blend_get_shader_locked)(dev, &blend_state, - blit_shader->blend_types[i], - nir_type_float32, /* unused */ - i); + pthread_mutex_lock(&dev->blend_shaders.lock); + struct pan_blend_shader_variant *b = GENX(pan_blend_get_shader_locked)( + dev, &blend_state, blit_shader->blend_types[i], + nir_type_float32, /* unused */ + i); - assert(b->work_reg_count <= 4); - struct panfrost_ptr bin = - pan_pool_alloc_aligned(dev->blitter.shaders.pool, - b->binary.size, - 64); - memcpy(bin.cpu, b->binary.data, b->binary.size); + assert(b->work_reg_count <= 4); + struct panfrost_ptr bin = + pan_pool_alloc_aligned(dev->blitter.shaders.pool, b->binary.size, 64); + memcpy(bin.cpu, b->binary.data, b->binary.size); - blend_shader->address = bin.gpu | b->first_tag; - pthread_mutex_unlock(&dev->blend_shaders.lock); - _mesa_hash_table_insert(dev->blitter.shaders.blend, - &blend_shader->key, blend_shader); - pthread_mutex_unlock(&dev->blitter.shaders.lock); - blend_shaders[i] = blend_shader->address; - } + blend_shader->address = bin.gpu | b->first_tag; + pthread_mutex_unlock(&dev->blend_shaders.lock); + _mesa_hash_table_insert(dev->blitter.shaders.blend, &blend_shader->key, + blend_shader); + pthread_mutex_unlock(&dev->blitter.shaders.lock); + blend_shaders[i] = blend_shader->address; + } #endif } @@ -392,287 +382,300 @@ static const struct pan_blit_shader_data * pan_blitter_get_blit_shader(struct panfrost_device *dev, const struct pan_blit_shader_key *key) { - pthread_mutex_lock(&dev->blitter.shaders.lock); - struct hash_entry *he = _mesa_hash_table_search(dev->blitter.shaders.blit, key); - struct pan_blit_shader_data *shader = he ? he->data : NULL; + pthread_mutex_lock(&dev->blitter.shaders.lock); + struct hash_entry *he = + _mesa_hash_table_search(dev->blitter.shaders.blit, key); + struct pan_blit_shader_data *shader = he ? 
he->data : NULL; - if (shader) - goto out; + if (shader) + goto out; - unsigned coord_comps = 0; - unsigned sig_offset = 0; - char sig[256]; - bool first = true; - for (unsigned i = 0; i < ARRAY_SIZE(key->surfaces); i++) { - const char *type_str, *dim_str; - if (key->surfaces[i].type == nir_type_invalid) - continue; + unsigned coord_comps = 0; + unsigned sig_offset = 0; + char sig[256]; + bool first = true; + for (unsigned i = 0; i < ARRAY_SIZE(key->surfaces); i++) { + const char *type_str, *dim_str; + if (key->surfaces[i].type == nir_type_invalid) + continue; - switch (key->surfaces[i].type) { - case nir_type_float32: type_str = "float"; break; - case nir_type_uint32: type_str = "uint"; break; - case nir_type_int32: type_str = "int"; break; - default: unreachable("Invalid type\n"); - } + switch (key->surfaces[i].type) { + case nir_type_float32: + type_str = "float"; + break; + case nir_type_uint32: + type_str = "uint"; + break; + case nir_type_int32: + type_str = "int"; + break; + default: + unreachable("Invalid type\n"); + } - switch (key->surfaces[i].dim) { - case MALI_TEXTURE_DIMENSION_CUBE: dim_str = "cube"; break; - case MALI_TEXTURE_DIMENSION_1D: dim_str = "1D"; break; - case MALI_TEXTURE_DIMENSION_2D: dim_str = "2D"; break; - case MALI_TEXTURE_DIMENSION_3D: dim_str = "3D"; break; - default: unreachable("Invalid dim\n"); - } + switch (key->surfaces[i].dim) { + case MALI_TEXTURE_DIMENSION_CUBE: + dim_str = "cube"; + break; + case MALI_TEXTURE_DIMENSION_1D: + dim_str = "1D"; + break; + case MALI_TEXTURE_DIMENSION_2D: + dim_str = "2D"; + break; + case MALI_TEXTURE_DIMENSION_3D: + dim_str = "3D"; + break; + default: + unreachable("Invalid dim\n"); + } - coord_comps = MAX2(coord_comps, - (key->surfaces[i].dim ? : 3) + - (key->surfaces[i].array ? 1 : 0)); - first = false; + coord_comps = MAX2(coord_comps, (key->surfaces[i].dim ?: 3) + + (key->surfaces[i].array ? 1 : 0)); + first = false; - if (sig_offset >= sizeof(sig)) - continue; + if (sig_offset >= sizeof(sig)) + continue; - sig_offset += snprintf(sig + sig_offset, sizeof(sig) - sig_offset, - "%s[%s;%s;%s%s;src_samples=%d,dst_samples=%d]", - first ? "" : ",", - gl_frag_result_name(key->surfaces[i].loc), - type_str, dim_str, - key->surfaces[i].array ? "[]" : "", - key->surfaces[i].src_samples, - key->surfaces[i].dst_samples); - } + sig_offset += + snprintf(sig + sig_offset, sizeof(sig) - sig_offset, + "%s[%s;%s;%s%s;src_samples=%d,dst_samples=%d]", + first ? "" : ",", gl_frag_result_name(key->surfaces[i].loc), + type_str, dim_str, key->surfaces[i].array ? 
"[]" : "", + key->surfaces[i].src_samples, key->surfaces[i].dst_samples); + } - nir_builder b = - nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, - GENX(pan_shader_get_compiler_options)(), - "pan_blit(%s)", sig); - nir_variable *coord_var = - nir_variable_create(b.shader, nir_var_shader_in, - glsl_vector_type(GLSL_TYPE_FLOAT, coord_comps), - "coord"); - coord_var->data.location = VARYING_SLOT_VAR0; + nir_builder b = nir_builder_init_simple_shader( + MESA_SHADER_FRAGMENT, GENX(pan_shader_get_compiler_options)(), + "pan_blit(%s)", sig); + nir_variable *coord_var = nir_variable_create( + b.shader, nir_var_shader_in, + glsl_vector_type(GLSL_TYPE_FLOAT, coord_comps), "coord"); + coord_var->data.location = VARYING_SLOT_VAR0; - nir_ssa_def *coord = nir_load_var(&b, coord_var); + nir_ssa_def *coord = nir_load_var(&b, coord_var); - unsigned active_count = 0; - for (unsigned i = 0; i < ARRAY_SIZE(key->surfaces); i++) { - if (key->surfaces[i].type == nir_type_invalid) - continue; + unsigned active_count = 0; + for (unsigned i = 0; i < ARRAY_SIZE(key->surfaces); i++) { + if (key->surfaces[i].type == nir_type_invalid) + continue; - /* Resolve operations only work for N -> 1 samples. */ - assert(key->surfaces[i].dst_samples == 1 || - key->surfaces[i].src_samples == key->surfaces[i].dst_samples); + /* Resolve operations only work for N -> 1 samples. */ + assert(key->surfaces[i].dst_samples == 1 || + key->surfaces[i].src_samples == key->surfaces[i].dst_samples); - static const char *out_names[] = { - "out0", "out1", "out2", "out3", "out4", "out5", "out6", "out7", - }; + static const char *out_names[] = { + "out0", "out1", "out2", "out3", "out4", "out5", "out6", "out7", + }; - unsigned ncomps = key->surfaces[i].loc >= FRAG_RESULT_DATA0 ? 4 : 1; - enum glsl_base_type type = nir_get_glsl_base_type_for_nir_type(key->surfaces[i].type); - nir_variable *out = - nir_variable_create(b.shader, nir_var_shader_out, - glsl_vector_type(type, ncomps), - out_names[active_count]); - out->data.location = key->surfaces[i].loc; - out->data.driver_location = active_count; + unsigned ncomps = key->surfaces[i].loc >= FRAG_RESULT_DATA0 ? 4 : 1; + enum glsl_base_type type = + nir_get_glsl_base_type_for_nir_type(key->surfaces[i].type); + nir_variable *out = nir_variable_create(b.shader, nir_var_shader_out, + glsl_vector_type(type, ncomps), + out_names[active_count]); + out->data.location = key->surfaces[i].loc; + out->data.driver_location = active_count; - bool resolve = key->surfaces[i].src_samples > key->surfaces[i].dst_samples; - bool ms = key->surfaces[i].src_samples > 1; - enum glsl_sampler_dim sampler_dim; + bool resolve = + key->surfaces[i].src_samples > key->surfaces[i].dst_samples; + bool ms = key->surfaces[i].src_samples > 1; + enum glsl_sampler_dim sampler_dim; - switch (key->surfaces[i].dim) { - case MALI_TEXTURE_DIMENSION_1D: - sampler_dim = GLSL_SAMPLER_DIM_1D; - break; - case MALI_TEXTURE_DIMENSION_2D: - sampler_dim = ms ? - GLSL_SAMPLER_DIM_MS : - GLSL_SAMPLER_DIM_2D; - break; - case MALI_TEXTURE_DIMENSION_3D: - sampler_dim = GLSL_SAMPLER_DIM_3D; - break; - case MALI_TEXTURE_DIMENSION_CUBE: - sampler_dim = GLSL_SAMPLER_DIM_CUBE; - break; - } + switch (key->surfaces[i].dim) { + case MALI_TEXTURE_DIMENSION_1D: + sampler_dim = GLSL_SAMPLER_DIM_1D; + break; + case MALI_TEXTURE_DIMENSION_2D: + sampler_dim = ms ? 
GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D; + break; + case MALI_TEXTURE_DIMENSION_3D: + sampler_dim = GLSL_SAMPLER_DIM_3D; + break; + case MALI_TEXTURE_DIMENSION_CUBE: + sampler_dim = GLSL_SAMPLER_DIM_CUBE; + break; + } - nir_ssa_def *res = NULL; + nir_ssa_def *res = NULL; - if (resolve) { - /* When resolving a float type, we need to calculate - * the average of all samples. For integer resolve, GL - * and Vulkan say that one sample should be chosen - * without telling which. Let's just pick the first one - * in that case. - */ - nir_alu_type base_type = - nir_alu_type_get_base_type(key->surfaces[i].type); - unsigned nsamples = base_type == nir_type_float ? - key->surfaces[i].src_samples : 1; + if (resolve) { + /* When resolving a float type, we need to calculate + * the average of all samples. For integer resolve, GL + * and Vulkan say that one sample should be chosen + * without telling which. Let's just pick the first one + * in that case. + */ + nir_alu_type base_type = + nir_alu_type_get_base_type(key->surfaces[i].type); + unsigned nsamples = + base_type == nir_type_float ? key->surfaces[i].src_samples : 1; - for (unsigned s = 0; s < nsamples; s++) { - nir_tex_instr *tex = nir_tex_instr_create(b.shader, 3); + for (unsigned s = 0; s < nsamples; s++) { + nir_tex_instr *tex = nir_tex_instr_create(b.shader, 3); - tex->op = nir_texop_txf_ms; - tex->dest_type = key->surfaces[i].type; - tex->texture_index = active_count; - tex->is_array = key->surfaces[i].array; - tex->sampler_dim = sampler_dim; + tex->op = nir_texop_txf_ms; + tex->dest_type = key->surfaces[i].type; + tex->texture_index = active_count; + tex->is_array = key->surfaces[i].array; + tex->sampler_dim = sampler_dim; - tex->src[0].src_type = nir_tex_src_coord; - tex->src[0].src = nir_src_for_ssa(nir_f2i32(&b, coord)); - tex->coord_components = coord_comps; + tex->src[0].src_type = nir_tex_src_coord; + tex->src[0].src = nir_src_for_ssa(nir_f2i32(&b, coord)); + tex->coord_components = coord_comps; - tex->src[1].src_type = nir_tex_src_ms_index; - tex->src[1].src = nir_src_for_ssa(nir_imm_int(&b, s)); + tex->src[1].src_type = nir_tex_src_ms_index; + tex->src[1].src = nir_src_for_ssa(nir_imm_int(&b, s)); - tex->src[2].src_type = nir_tex_src_lod; - tex->src[2].src = nir_src_for_ssa(nir_imm_int(&b, 0)); - nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL); - nir_builder_instr_insert(&b, &tex->instr); + tex->src[2].src_type = nir_tex_src_lod; + tex->src[2].src = nir_src_for_ssa(nir_imm_int(&b, 0)); + nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL); + nir_builder_instr_insert(&b, &tex->instr); - res = res ? nir_fadd(&b, res, &tex->dest.ssa) : &tex->dest.ssa; - } + res = res ? nir_fadd(&b, res, &tex->dest.ssa) : &tex->dest.ssa; + } - if (base_type == nir_type_float) { - unsigned type_sz = - nir_alu_type_get_type_size(key->surfaces[i].type); - res = nir_fmul(&b, res, - nir_imm_floatN_t(&b, 1.0f / nsamples, type_sz)); - } - } else { - nir_tex_instr *tex = - nir_tex_instr_create(b.shader, ms ? 3 : 1); + if (base_type == nir_type_float) { + unsigned type_sz = + nir_alu_type_get_type_size(key->surfaces[i].type); + res = nir_fmul(&b, res, + nir_imm_floatN_t(&b, 1.0f / nsamples, type_sz)); + } + } else { + nir_tex_instr *tex = nir_tex_instr_create(b.shader, ms ? 
3 : 1); - tex->dest_type = key->surfaces[i].type; - tex->texture_index = active_count; - tex->is_array = key->surfaces[i].array; - tex->sampler_dim = sampler_dim; + tex->dest_type = key->surfaces[i].type; + tex->texture_index = active_count; + tex->is_array = key->surfaces[i].array; + tex->sampler_dim = sampler_dim; - if (ms) { - tex->op = nir_texop_txf_ms; + if (ms) { + tex->op = nir_texop_txf_ms; - tex->src[0].src_type = nir_tex_src_coord; - tex->src[0].src = nir_src_for_ssa(nir_f2i32(&b, coord)); - tex->coord_components = coord_comps; + tex->src[0].src_type = nir_tex_src_coord; + tex->src[0].src = nir_src_for_ssa(nir_f2i32(&b, coord)); + tex->coord_components = coord_comps; - tex->src[1].src_type = nir_tex_src_ms_index; - tex->src[1].src = nir_src_for_ssa(nir_load_sample_id(&b)); + tex->src[1].src_type = nir_tex_src_ms_index; + tex->src[1].src = nir_src_for_ssa(nir_load_sample_id(&b)); - tex->src[2].src_type = nir_tex_src_lod; - tex->src[2].src = nir_src_for_ssa(nir_imm_int(&b, 0)); - } else { - tex->op = nir_texop_txl; + tex->src[2].src_type = nir_tex_src_lod; + tex->src[2].src = nir_src_for_ssa(nir_imm_int(&b, 0)); + } else { + tex->op = nir_texop_txl; - tex->src[0].src_type = nir_tex_src_coord; - tex->src[0].src = nir_src_for_ssa(coord); - tex->coord_components = coord_comps; - } + tex->src[0].src_type = nir_tex_src_coord; + tex->src[0].src = nir_src_for_ssa(coord); + tex->coord_components = coord_comps; + } - nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL); - nir_builder_instr_insert(&b, &tex->instr); - res = &tex->dest.ssa; - } + nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL); + nir_builder_instr_insert(&b, &tex->instr); + res = &tex->dest.ssa; + } - assert(res); + assert(res); - if (key->surfaces[i].loc >= FRAG_RESULT_DATA0) { - nir_store_var(&b, out, res, 0xFF); - } else { - unsigned c = key->surfaces[i].loc == FRAG_RESULT_STENCIL ? 1 : 0; - nir_store_var(&b, out, nir_channel(&b, res, c), 0xFF); - } - active_count++; - } + if (key->surfaces[i].loc >= FRAG_RESULT_DATA0) { + nir_store_var(&b, out, res, 0xFF); + } else { + unsigned c = key->surfaces[i].loc == FRAG_RESULT_STENCIL ? 
1 : 0; + nir_store_var(&b, out, nir_channel(&b, res, c), 0xFF); + } + active_count++; + } - struct panfrost_compile_inputs inputs = { - .gpu_id = dev->gpu_id, - .is_blit = true, - .no_idvs = true, - .fixed_sysval_ubo = -1, - }; - struct util_dynarray binary; + struct panfrost_compile_inputs inputs = { + .gpu_id = dev->gpu_id, + .is_blit = true, + .no_idvs = true, + .fixed_sysval_ubo = -1, + }; + struct util_dynarray binary; - util_dynarray_init(&binary, NULL); + util_dynarray_init(&binary, NULL); - shader = rzalloc(dev->blitter.shaders.blit, - struct pan_blit_shader_data); + shader = rzalloc(dev->blitter.shaders.blit, struct pan_blit_shader_data); - nir_shader_gather_info(b.shader, nir_shader_get_entrypoint(b.shader)); + nir_shader_gather_info(b.shader, nir_shader_get_entrypoint(b.shader)); - for (unsigned i = 0; i < active_count; ++i) - BITSET_SET(b.shader->info.textures_used, i); + for (unsigned i = 0; i < active_count; ++i) + BITSET_SET(b.shader->info.textures_used, i); - GENX(pan_shader_compile)(b.shader, &inputs, &binary, &shader->info); + GENX(pan_shader_compile)(b.shader, &inputs, &binary, &shader->info); - /* Blit shaders shouldn't have sysvals */ - assert(shader->info.sysvals.sysval_count == 0); + /* Blit shaders shouldn't have sysvals */ + assert(shader->info.sysvals.sysval_count == 0); - shader->key = *key; - shader->address = - pan_pool_upload_aligned(dev->blitter.shaders.pool, - binary.data, binary.size, - PAN_ARCH >= 6 ? 128 : 64); + shader->key = *key; + shader->address = + pan_pool_upload_aligned(dev->blitter.shaders.pool, binary.data, + binary.size, PAN_ARCH >= 6 ? 128 : 64); - util_dynarray_fini(&binary); - ralloc_free(b.shader); + util_dynarray_fini(&binary); + ralloc_free(b.shader); #if PAN_ARCH >= 6 - for (unsigned i = 0; i < ARRAY_SIZE(shader->blend_ret_offsets); i++) { - shader->blend_ret_offsets[i] = shader->info.bifrost.blend[i].return_offset; - shader->blend_types[i] = shader->info.bifrost.blend[i].type; - } + for (unsigned i = 0; i < ARRAY_SIZE(shader->blend_ret_offsets); i++) { + shader->blend_ret_offsets[i] = + shader->info.bifrost.blend[i].return_offset; + shader->blend_types[i] = shader->info.bifrost.blend[i].type; + } #endif - _mesa_hash_table_insert(dev->blitter.shaders.blit, &shader->key, shader); + _mesa_hash_table_insert(dev->blitter.shaders.blit, &shader->key, shader); out: - pthread_mutex_unlock(&dev->blitter.shaders.lock); - return shader; + pthread_mutex_unlock(&dev->blitter.shaders.lock); + return shader; } static struct pan_blit_shader_key pan_blitter_get_key(struct pan_blitter_views *views) { - struct pan_blit_shader_key key = { 0 }; + struct pan_blit_shader_key key = {0}; - if (views->src_z) { - assert(views->dst_z); - key.surfaces[0].loc = FRAG_RESULT_DEPTH; - key.surfaces[0].type = nir_type_float32; - key.surfaces[0].src_samples = views->src_z->image->layout.nr_samples; - key.surfaces[0].dst_samples = views->dst_z->image->layout.nr_samples; - key.surfaces[0].dim = views->src_z->dim; - key.surfaces[0].array = views->src_z->first_layer != views->src_z->last_layer; - } + if (views->src_z) { + assert(views->dst_z); + key.surfaces[0].loc = FRAG_RESULT_DEPTH; + key.surfaces[0].type = nir_type_float32; + key.surfaces[0].src_samples = views->src_z->image->layout.nr_samples; + key.surfaces[0].dst_samples = views->dst_z->image->layout.nr_samples; + key.surfaces[0].dim = views->src_z->dim; + key.surfaces[0].array = + views->src_z->first_layer != views->src_z->last_layer; + } - if (views->src_s) { - assert(views->dst_s); - key.surfaces[1].loc = 
FRAG_RESULT_STENCIL; - key.surfaces[1].type = nir_type_uint32; - key.surfaces[1].src_samples = views->src_s->image->layout.nr_samples; - key.surfaces[1].dst_samples = views->dst_s->image->layout.nr_samples; - key.surfaces[1].dim = views->src_s->dim; - key.surfaces[1].array = views->src_s->first_layer != views->src_s->last_layer; - } + if (views->src_s) { + assert(views->dst_s); + key.surfaces[1].loc = FRAG_RESULT_STENCIL; + key.surfaces[1].type = nir_type_uint32; + key.surfaces[1].src_samples = views->src_s->image->layout.nr_samples; + key.surfaces[1].dst_samples = views->dst_s->image->layout.nr_samples; + key.surfaces[1].dim = views->src_s->dim; + key.surfaces[1].array = + views->src_s->first_layer != views->src_s->last_layer; + } - for (unsigned i = 0; i < views->rt_count; i++) { - if (!views->src_rts[i]) - continue; + for (unsigned i = 0; i < views->rt_count; i++) { + if (!views->src_rts[i]) + continue; - assert(views->dst_rts[i]); - key.surfaces[i].loc = FRAG_RESULT_DATA0 + i; - key.surfaces[i].type = - util_format_is_pure_uint(views->src_rts[i]->format) ? nir_type_uint32 : - util_format_is_pure_sint(views->src_rts[i]->format) ? nir_type_int32 : - nir_type_float32; - key.surfaces[i].src_samples = views->src_rts[i]->image->layout.nr_samples; - key.surfaces[i].dst_samples = views->dst_rts[i]->image->layout.nr_samples; - key.surfaces[i].dim = views->src_rts[i]->dim; - key.surfaces[i].array = views->src_rts[i]->first_layer != views->src_rts[i]->last_layer; - } + assert(views->dst_rts[i]); + key.surfaces[i].loc = FRAG_RESULT_DATA0 + i; + key.surfaces[i].type = + util_format_is_pure_uint(views->src_rts[i]->format) ? nir_type_uint32 + : util_format_is_pure_sint(views->src_rts[i]->format) + ? nir_type_int32 + : nir_type_float32; + key.surfaces[i].src_samples = views->src_rts[i]->image->layout.nr_samples; + key.surfaces[i].dst_samples = views->dst_rts[i]->image->layout.nr_samples; + key.surfaces[i].dim = views->src_rts[i]->dim; + key.surfaces[i].array = + views->src_rts[i]->first_layer != views->src_rts[i]->last_layer; + } - return key; + return key; } #if PAN_ARCH <= 7 @@ -680,77 +683,75 @@ static mali_ptr pan_blitter_get_rsd(struct panfrost_device *dev, struct pan_blitter_views *views) { - struct pan_blit_rsd_key rsd_key = { 0 }; + struct pan_blit_rsd_key rsd_key = {0}; - assert(!views->rt_count || (!views->src_z && !views->src_s)); + assert(!views->rt_count || (!views->src_z && !views->src_s)); - struct pan_blit_shader_key blit_key = pan_blitter_get_key(views); + struct pan_blit_shader_key blit_key = pan_blitter_get_key(views); - if (views->src_z) { - assert(views->dst_z); - rsd_key.z.format = views->dst_z->format; - rsd_key.z.type = blit_key.surfaces[0].type; - rsd_key.z.src_samples = blit_key.surfaces[0].src_samples; - rsd_key.z.dst_samples = blit_key.surfaces[0].dst_samples; - rsd_key.z.dim = blit_key.surfaces[0].dim; - rsd_key.z.array = blit_key.surfaces[0].array; - } + if (views->src_z) { + assert(views->dst_z); + rsd_key.z.format = views->dst_z->format; + rsd_key.z.type = blit_key.surfaces[0].type; + rsd_key.z.src_samples = blit_key.surfaces[0].src_samples; + rsd_key.z.dst_samples = blit_key.surfaces[0].dst_samples; + rsd_key.z.dim = blit_key.surfaces[0].dim; + rsd_key.z.array = blit_key.surfaces[0].array; + } - if (views->src_s) { - assert(views->dst_s); - rsd_key.s.format = views->dst_s->format; - rsd_key.s.type = blit_key.surfaces[1].type; - rsd_key.s.src_samples = blit_key.surfaces[1].src_samples; - rsd_key.s.dst_samples = blit_key.surfaces[1].dst_samples; - rsd_key.s.dim = 
blit_key.surfaces[1].dim; - rsd_key.s.array = blit_key.surfaces[1].array; - } + if (views->src_s) { + assert(views->dst_s); + rsd_key.s.format = views->dst_s->format; + rsd_key.s.type = blit_key.surfaces[1].type; + rsd_key.s.src_samples = blit_key.surfaces[1].src_samples; + rsd_key.s.dst_samples = blit_key.surfaces[1].dst_samples; + rsd_key.s.dim = blit_key.surfaces[1].dim; + rsd_key.s.array = blit_key.surfaces[1].array; + } - for (unsigned i = 0; i < views->rt_count; i++) { - if (!views->src_rts[i]) - continue; + for (unsigned i = 0; i < views->rt_count; i++) { + if (!views->src_rts[i]) + continue; - assert(views->dst_rts[i]); - rsd_key.rts[i].format = views->dst_rts[i]->format; - rsd_key.rts[i].type = blit_key.surfaces[i].type; - rsd_key.rts[i].src_samples = blit_key.surfaces[i].src_samples; - rsd_key.rts[i].dst_samples = blit_key.surfaces[i].dst_samples; - rsd_key.rts[i].dim = blit_key.surfaces[i].dim; - rsd_key.rts[i].array = blit_key.surfaces[i].array; - } + assert(views->dst_rts[i]); + rsd_key.rts[i].format = views->dst_rts[i]->format; + rsd_key.rts[i].type = blit_key.surfaces[i].type; + rsd_key.rts[i].src_samples = blit_key.surfaces[i].src_samples; + rsd_key.rts[i].dst_samples = blit_key.surfaces[i].dst_samples; + rsd_key.rts[i].dim = blit_key.surfaces[i].dim; + rsd_key.rts[i].array = blit_key.surfaces[i].array; + } - pthread_mutex_lock(&dev->blitter.rsds.lock); - struct hash_entry *he = - _mesa_hash_table_search(dev->blitter.rsds.rsds, &rsd_key); - struct pan_blit_rsd_data *rsd = he ? he->data : NULL; - if (rsd) - goto out; + pthread_mutex_lock(&dev->blitter.rsds.lock); + struct hash_entry *he = + _mesa_hash_table_search(dev->blitter.rsds.rsds, &rsd_key); + struct pan_blit_rsd_data *rsd = he ? he->data : NULL; + if (rsd) + goto out; - rsd = rzalloc(dev->blitter.rsds.rsds, struct pan_blit_rsd_data); - rsd->key = rsd_key; + rsd = rzalloc(dev->blitter.rsds.rsds, struct pan_blit_rsd_data); + rsd->key = rsd_key; - unsigned bd_count = PAN_ARCH >= 5 ? MAX2(views->rt_count, 1) : 0; - struct panfrost_ptr rsd_ptr = - pan_pool_alloc_desc_aggregate(dev->blitter.rsds.pool, - PAN_DESC(RENDERER_STATE), - PAN_DESC_ARRAY(bd_count, BLEND)); + unsigned bd_count = PAN_ARCH >= 5 ? 
MAX2(views->rt_count, 1) : 0; + struct panfrost_ptr rsd_ptr = pan_pool_alloc_desc_aggregate( + dev->blitter.rsds.pool, PAN_DESC(RENDERER_STATE), + PAN_DESC_ARRAY(bd_count, BLEND)); - mali_ptr blend_shaders[8] = { 0 }; + mali_ptr blend_shaders[8] = {0}; - const struct pan_blit_shader_data *blit_shader = - pan_blitter_get_blit_shader(dev, &blit_key); + const struct pan_blit_shader_data *blit_shader = + pan_blitter_get_blit_shader(dev, &blit_key); - pan_blitter_get_blend_shaders(dev, views->rt_count, views->dst_rts, - blit_shader, blend_shaders); + pan_blitter_get_blend_shaders(dev, views->rt_count, views->dst_rts, + blit_shader, blend_shaders); - pan_blitter_emit_rsd(dev, blit_shader, views, blend_shaders, - rsd_ptr.cpu); - rsd->address = rsd_ptr.gpu; - _mesa_hash_table_insert(dev->blitter.rsds.rsds, &rsd->key, rsd); + pan_blitter_emit_rsd(dev, blit_shader, views, blend_shaders, rsd_ptr.cpu); + rsd->address = rsd_ptr.gpu; + _mesa_hash_table_insert(dev->blitter.rsds.rsds, &rsd->key, rsd); out: - pthread_mutex_unlock(&dev->blitter.rsds.lock); - return rsd->address; + pthread_mutex_unlock(&dev->blitter.rsds.lock); + return rsd->address; } static mali_ptr @@ -758,246 +759,253 @@ pan_blit_get_rsd(struct panfrost_device *dev, const struct pan_image_view *src_views, const struct pan_image_view *dst_view) { - const struct util_format_description *desc = - util_format_description(src_views[0].format); + const struct util_format_description *desc = + util_format_description(src_views[0].format); - struct pan_blitter_views views = { }; + struct pan_blitter_views views = {}; - if (util_format_has_depth(desc)) { - views.src_z = &src_views[0]; - views.dst_z = dst_view; - } + if (util_format_has_depth(desc)) { + views.src_z = &src_views[0]; + views.dst_z = dst_view; + } - if (src_views[1].format) { - views.src_s = &src_views[1]; - views.dst_s = dst_view; - } else if (util_format_has_stencil(desc)) { - views.src_s = &src_views[0]; - views.dst_s = dst_view; - } + if (src_views[1].format) { + views.src_s = &src_views[1]; + views.dst_s = dst_view; + } else if (util_format_has_stencil(desc)) { + views.src_s = &src_views[0]; + views.dst_s = dst_view; + } - if (!views.src_z && !views.src_s) { - views.rt_count = 1; - views.src_rts[0] = src_views; - views.dst_rts[0] = dst_view; - } + if (!views.src_z && !views.src_s) { + views.rt_count = 1; + views.src_rts[0] = src_views; + views.dst_rts[0] = dst_view; + } - return pan_blitter_get_rsd(dev, &views); + return pan_blitter_get_rsd(dev, &views); } #endif static struct pan_blitter_views -pan_preload_get_views(const struct pan_fb_info *fb, bool zs, struct pan_image_view *patched_s) +pan_preload_get_views(const struct pan_fb_info *fb, bool zs, + struct pan_image_view *patched_s) { - struct pan_blitter_views views = { 0 }; + struct pan_blitter_views views = {0}; - if (zs) { - if (fb->zs.preload.z) - views.src_z = views.dst_z = fb->zs.view.zs; + if (zs) { + if (fb->zs.preload.z) + views.src_z = views.dst_z = fb->zs.view.zs; - if (fb->zs.preload.s) { - const struct pan_image_view *view = fb->zs.view.s ? 
: fb->zs.view.zs; - enum pipe_format fmt = util_format_get_depth_only(view->format); + if (fb->zs.preload.s) { + const struct pan_image_view *view = fb->zs.view.s ?: fb->zs.view.zs; + enum pipe_format fmt = util_format_get_depth_only(view->format); - switch (view->format) { - case PIPE_FORMAT_Z24_UNORM_S8_UINT: fmt = PIPE_FORMAT_X24S8_UINT; break; - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: fmt = PIPE_FORMAT_X32_S8X24_UINT; break; - default: fmt = view->format; break; - } + switch (view->format) { + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + fmt = PIPE_FORMAT_X24S8_UINT; + break; + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + fmt = PIPE_FORMAT_X32_S8X24_UINT; + break; + default: + fmt = view->format; + break; + } - if (fmt != view->format) { - *patched_s = *view; - patched_s->format = fmt; - views.src_s = views.dst_s = patched_s; - } else { - views.src_s = views.dst_s = view; - } - } - } else { - for (unsigned i = 0; i < fb->rt_count; i++) { - if (fb->rts[i].preload) { - views.src_rts[i] = fb->rts[i].view; - views.dst_rts[i] = fb->rts[i].view; - } - } + if (fmt != view->format) { + *patched_s = *view; + patched_s->format = fmt; + views.src_s = views.dst_s = patched_s; + } else { + views.src_s = views.dst_s = view; + } + } + } else { + for (unsigned i = 0; i < fb->rt_count; i++) { + if (fb->rts[i].preload) { + views.src_rts[i] = fb->rts[i].view; + views.dst_rts[i] = fb->rts[i].view; + } + } - views.rt_count = fb->rt_count; - } + views.rt_count = fb->rt_count; + } - return views; + return views; } static bool pan_preload_needed(const struct pan_fb_info *fb, bool zs) { - if (zs) { - if (fb->zs.preload.z || fb->zs.preload.s) - return true; - } else { - for (unsigned i = 0; i < fb->rt_count; i++) { - if (fb->rts[i].preload) - return true; - } - } + if (zs) { + if (fb->zs.preload.z || fb->zs.preload.s) + return true; + } else { + for (unsigned i = 0; i < fb->rt_count; i++) { + if (fb->rts[i].preload) + return true; + } + } - return false; + return false; } static mali_ptr pan_blitter_emit_varying(struct pan_pool *pool) { - struct panfrost_ptr varying = pan_pool_alloc_desc(pool, ATTRIBUTE); + struct panfrost_ptr varying = pan_pool_alloc_desc(pool, ATTRIBUTE); - pan_pack(varying.cpu, ATTRIBUTE, cfg) { - cfg.buffer_index = 0; - cfg.offset_enable = PAN_ARCH <= 5; - cfg.format = pool->dev->formats[PIPE_FORMAT_R32G32B32_FLOAT].hw; + pan_pack(varying.cpu, ATTRIBUTE, cfg) { + cfg.buffer_index = 0; + cfg.offset_enable = PAN_ARCH <= 5; + cfg.format = pool->dev->formats[PIPE_FORMAT_R32G32B32_FLOAT].hw; #if PAN_ARCH >= 9 - cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D; - cfg.table = PAN_TABLE_ATTRIBUTE_BUFFER; - cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX; - cfg.stride = 4 * sizeof(float); + cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D; + cfg.table = PAN_TABLE_ATTRIBUTE_BUFFER; + cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX; + cfg.stride = 4 * sizeof(float); #endif - } + } - return varying.gpu; + return varying.gpu; } static mali_ptr pan_blitter_emit_varying_buffer(struct pan_pool *pool, mali_ptr coordinates) { #if PAN_ARCH >= 9 - struct panfrost_ptr varying_buffer = pan_pool_alloc_desc(pool, BUFFER); + struct panfrost_ptr varying_buffer = pan_pool_alloc_desc(pool, BUFFER); - pan_pack(varying_buffer.cpu, BUFFER, cfg) { - cfg.address = coordinates; - cfg.size = 4 * sizeof(float) * 4; - } + pan_pack(varying_buffer.cpu, BUFFER, cfg) { + cfg.address = coordinates; + cfg.size = 4 * sizeof(float) * 4; + } #else - /* Bifrost needs an empty desc to mark end of prefetching */ - bool padding_buffer = PAN_ARCH >= 6; + /* 
Bifrost needs an empty desc to mark end of prefetching */ + bool padding_buffer = PAN_ARCH >= 6; - struct panfrost_ptr varying_buffer = - pan_pool_alloc_desc_array(pool, (padding_buffer ? 2 : 1), - ATTRIBUTE_BUFFER); + struct panfrost_ptr varying_buffer = pan_pool_alloc_desc_array( + pool, (padding_buffer ? 2 : 1), ATTRIBUTE_BUFFER); - pan_pack(varying_buffer.cpu, ATTRIBUTE_BUFFER, cfg) { - cfg.pointer = coordinates; - cfg.stride = 4 * sizeof(float); - cfg.size = cfg.stride * 4; - } + pan_pack(varying_buffer.cpu, ATTRIBUTE_BUFFER, cfg) { + cfg.pointer = coordinates; + cfg.stride = 4 * sizeof(float); + cfg.size = cfg.stride * 4; + } - if (padding_buffer) { - pan_pack(varying_buffer.cpu + pan_size(ATTRIBUTE_BUFFER), - ATTRIBUTE_BUFFER, cfg); - } + if (padding_buffer) { + pan_pack(varying_buffer.cpu + pan_size(ATTRIBUTE_BUFFER), + ATTRIBUTE_BUFFER, cfg) + ; + } #endif - return varying_buffer.gpu; + return varying_buffer.gpu; } static mali_ptr -pan_blitter_emit_sampler(struct pan_pool *pool, - bool nearest_filter) +pan_blitter_emit_sampler(struct pan_pool *pool, bool nearest_filter) { - struct panfrost_ptr sampler = - pan_pool_alloc_desc(pool, SAMPLER); + struct panfrost_ptr sampler = pan_pool_alloc_desc(pool, SAMPLER); - pan_pack(sampler.cpu, SAMPLER, cfg) { - cfg.seamless_cube_map = false; - cfg.normalized_coordinates = false; - cfg.minify_nearest = nearest_filter; - cfg.magnify_nearest = nearest_filter; - } + pan_pack(sampler.cpu, SAMPLER, cfg) { + cfg.seamless_cube_map = false; + cfg.normalized_coordinates = false; + cfg.minify_nearest = nearest_filter; + cfg.magnify_nearest = nearest_filter; + } - return sampler.gpu; + return sampler.gpu; } static mali_ptr -pan_blitter_emit_textures(struct pan_pool *pool, - unsigned tex_count, +pan_blitter_emit_textures(struct pan_pool *pool, unsigned tex_count, const struct pan_image_view **views) { #if PAN_ARCH >= 6 - struct panfrost_ptr textures = - pan_pool_alloc_desc_array(pool, tex_count, TEXTURE); + struct panfrost_ptr textures = + pan_pool_alloc_desc_array(pool, tex_count, TEXTURE); - for (unsigned i = 0; i < tex_count; i++) { - void *texture = textures.cpu + (pan_size(TEXTURE) * i); - size_t payload_size = - GENX(panfrost_estimate_texture_payload_size)(views[i]); - struct panfrost_ptr surfaces = - pan_pool_alloc_aligned(pool, payload_size, 64); + for (unsigned i = 0; i < tex_count; i++) { + void *texture = textures.cpu + (pan_size(TEXTURE) * i); + size_t payload_size = + GENX(panfrost_estimate_texture_payload_size)(views[i]); + struct panfrost_ptr surfaces = + pan_pool_alloc_aligned(pool, payload_size, 64); - GENX(panfrost_new_texture)(pool->dev, views[i], texture, &surfaces); - } + GENX(panfrost_new_texture)(pool->dev, views[i], texture, &surfaces); + } - return textures.gpu; + return textures.gpu; #else - mali_ptr textures[8] = { 0 }; + mali_ptr textures[8] = {0}; - for (unsigned i = 0; i < tex_count; i++) { - size_t sz = pan_size(TEXTURE) + - GENX(panfrost_estimate_texture_payload_size)(views[i]); - struct panfrost_ptr texture = - pan_pool_alloc_aligned(pool, sz, pan_alignment(TEXTURE)); - struct panfrost_ptr surfaces = { - .cpu = texture.cpu + pan_size(TEXTURE), - .gpu = texture.gpu + pan_size(TEXTURE), - }; + for (unsigned i = 0; i < tex_count; i++) { + size_t sz = pan_size(TEXTURE) + + GENX(panfrost_estimate_texture_payload_size)(views[i]); + struct panfrost_ptr texture = + pan_pool_alloc_aligned(pool, sz, pan_alignment(TEXTURE)); + struct panfrost_ptr surfaces = { + .cpu = texture.cpu + pan_size(TEXTURE), + .gpu = texture.gpu + 
pan_size(TEXTURE), + }; - GENX(panfrost_new_texture)(pool->dev, views[i], texture.cpu, &surfaces); - textures[i] = texture.gpu; - } + GENX(panfrost_new_texture)(pool->dev, views[i], texture.cpu, &surfaces); + textures[i] = texture.gpu; + } - return pan_pool_upload_aligned(pool, textures, - tex_count * sizeof(mali_ptr), - sizeof(mali_ptr)); + return pan_pool_upload_aligned(pool, textures, tex_count * sizeof(mali_ptr), + sizeof(mali_ptr)); #endif } static mali_ptr -pan_preload_emit_textures(struct pan_pool *pool, - const struct pan_fb_info *fb, bool zs, - unsigned *tex_count_out) +pan_preload_emit_textures(struct pan_pool *pool, const struct pan_fb_info *fb, + bool zs, unsigned *tex_count_out) { - const struct pan_image_view *views[8]; - struct pan_image_view patched_s_view; - unsigned tex_count = 0; + const struct pan_image_view *views[8]; + struct pan_image_view patched_s_view; + unsigned tex_count = 0; - if (zs) { - if (fb->zs.preload.z) - views[tex_count++] = fb->zs.view.zs; + if (zs) { + if (fb->zs.preload.z) + views[tex_count++] = fb->zs.view.zs; - if (fb->zs.preload.s) { - const struct pan_image_view *view = fb->zs.view.s ? : fb->zs.view.zs; - enum pipe_format fmt = util_format_get_depth_only(view->format); + if (fb->zs.preload.s) { + const struct pan_image_view *view = fb->zs.view.s ?: fb->zs.view.zs; + enum pipe_format fmt = util_format_get_depth_only(view->format); - switch (view->format) { - case PIPE_FORMAT_Z24_UNORM_S8_UINT: fmt = PIPE_FORMAT_X24S8_UINT; break; - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: fmt = PIPE_FORMAT_X32_S8X24_UINT; break; - default: fmt = view->format; break; - } + switch (view->format) { + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + fmt = PIPE_FORMAT_X24S8_UINT; + break; + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + fmt = PIPE_FORMAT_X32_S8X24_UINT; + break; + default: + fmt = view->format; + break; + } - if (fmt != view->format) { - patched_s_view = *view; - patched_s_view.format = fmt; - view = &patched_s_view; - } - views[tex_count++] = view; - } - } else { - for (unsigned i = 0; i < fb->rt_count; i++) { - if (fb->rts[i].preload) - views[tex_count++] = fb->rts[i].view; - } + if (fmt != view->format) { + patched_s_view = *view; + patched_s_view.format = fmt; + view = &patched_s_view; + } + views[tex_count++] = view; + } + } else { + for (unsigned i = 0; i < fb->rt_count; i++) { + if (fb->rts[i].preload) + views[tex_count++] = fb->rts[i].view; + } + } - } + *tex_count_out = tex_count; - *tex_count_out = tex_count; - - return pan_blitter_emit_textures(pool, tex_count, views); + return pan_blitter_emit_textures(pool, tex_count, views); } #if PAN_ARCH >= 8 @@ -1005,214 +1013,212 @@ pan_preload_emit_textures(struct pan_pool *pool, static mali_ptr pan_blitter_emit_zs(struct pan_pool *pool, bool z, bool s) { - struct panfrost_ptr zsd = pan_pool_alloc_desc(pool, DEPTH_STENCIL); + struct panfrost_ptr zsd = pan_pool_alloc_desc(pool, DEPTH_STENCIL); - pan_pack(zsd.cpu, DEPTH_STENCIL, cfg) { - cfg.depth_function = MALI_FUNC_ALWAYS; - cfg.depth_write_enable = z; + pan_pack(zsd.cpu, DEPTH_STENCIL, cfg) { + cfg.depth_function = MALI_FUNC_ALWAYS; + cfg.depth_write_enable = z; - if (z) - cfg.depth_source = MALI_DEPTH_SOURCE_SHADER; + if (z) + cfg.depth_source = MALI_DEPTH_SOURCE_SHADER; - cfg.stencil_test_enable = s; - cfg.stencil_from_shader = s; + cfg.stencil_test_enable = s; + cfg.stencil_from_shader = s; - cfg.front_compare_function = MALI_FUNC_ALWAYS; - cfg.front_stencil_fail = MALI_STENCIL_OP_REPLACE; - cfg.front_depth_fail = MALI_STENCIL_OP_REPLACE; - cfg.front_depth_pass = 
MALI_STENCIL_OP_REPLACE; - cfg.front_write_mask = 0xFF; - cfg.front_value_mask = 0xFF; + cfg.front_compare_function = MALI_FUNC_ALWAYS; + cfg.front_stencil_fail = MALI_STENCIL_OP_REPLACE; + cfg.front_depth_fail = MALI_STENCIL_OP_REPLACE; + cfg.front_depth_pass = MALI_STENCIL_OP_REPLACE; + cfg.front_write_mask = 0xFF; + cfg.front_value_mask = 0xFF; - cfg.back_compare_function = MALI_FUNC_ALWAYS; - cfg.back_stencil_fail = MALI_STENCIL_OP_REPLACE; - cfg.back_depth_fail = MALI_STENCIL_OP_REPLACE; - cfg.back_depth_pass = MALI_STENCIL_OP_REPLACE; - cfg.back_write_mask = 0xFF; - cfg.back_value_mask = 0xFF; + cfg.back_compare_function = MALI_FUNC_ALWAYS; + cfg.back_stencil_fail = MALI_STENCIL_OP_REPLACE; + cfg.back_depth_fail = MALI_STENCIL_OP_REPLACE; + cfg.back_depth_pass = MALI_STENCIL_OP_REPLACE; + cfg.back_write_mask = 0xFF; + cfg.back_value_mask = 0xFF; - cfg.depth_cull_enable = false; - } + cfg.depth_cull_enable = false; + } - return zsd.gpu; + return zsd.gpu; } #else static mali_ptr -pan_blitter_emit_viewport(struct pan_pool *pool, - uint16_t minx, uint16_t miny, +pan_blitter_emit_viewport(struct pan_pool *pool, uint16_t minx, uint16_t miny, uint16_t maxx, uint16_t maxy) { - struct panfrost_ptr vp = pan_pool_alloc_desc(pool, VIEWPORT); + struct panfrost_ptr vp = pan_pool_alloc_desc(pool, VIEWPORT); - pan_pack(vp.cpu, VIEWPORT, cfg) { - cfg.scissor_minimum_x = minx; - cfg.scissor_minimum_y = miny; - cfg.scissor_maximum_x = maxx; - cfg.scissor_maximum_y = maxy; - } + pan_pack(vp.cpu, VIEWPORT, cfg) { + cfg.scissor_minimum_x = minx; + cfg.scissor_minimum_y = miny; + cfg.scissor_maximum_x = maxx; + cfg.scissor_maximum_y = maxy; + } - return vp.gpu; + return vp.gpu; } #endif static void -pan_preload_emit_dcd(struct pan_pool *pool, - struct pan_fb_info *fb, bool zs, - mali_ptr coordinates, - mali_ptr tsd, void *out, bool always_write) +pan_preload_emit_dcd(struct pan_pool *pool, struct pan_fb_info *fb, bool zs, + mali_ptr coordinates, mali_ptr tsd, void *out, + bool always_write) { - unsigned tex_count = 0; - mali_ptr textures = pan_preload_emit_textures(pool, fb, zs, &tex_count); - mali_ptr samplers = pan_blitter_emit_sampler(pool, true); - mali_ptr varyings = pan_blitter_emit_varying(pool); - mali_ptr varying_buffers = pan_blitter_emit_varying_buffer(pool, coordinates); + unsigned tex_count = 0; + mali_ptr textures = pan_preload_emit_textures(pool, fb, zs, &tex_count); + mali_ptr samplers = pan_blitter_emit_sampler(pool, true); + mali_ptr varyings = pan_blitter_emit_varying(pool); + mali_ptr varying_buffers = + pan_blitter_emit_varying_buffer(pool, coordinates); - /* Tiles updated by blit shaders are still considered clean (separate - * for colour and Z/S), allowing us to suppress unnecessary writeback - */ - UNUSED bool clean_fragment_write = !always_write; + /* Tiles updated by blit shaders are still considered clean (separate + * for colour and Z/S), allowing us to suppress unnecessary writeback + */ + UNUSED bool clean_fragment_write = !always_write; - /* Image view used when patching stencil formats for combined - * depth/stencil preloads. - */ - struct pan_image_view patched_s; + /* Image view used when patching stencil formats for combined + * depth/stencil preloads. 
+ */ + struct pan_image_view patched_s; - struct pan_blitter_views views = pan_preload_get_views(fb, zs, &patched_s); + struct pan_blitter_views views = pan_preload_get_views(fb, zs, &patched_s); #if PAN_ARCH <= 7 - pan_pack(out, DRAW, cfg) { - uint16_t minx = 0, miny = 0, maxx, maxy; + pan_pack(out, DRAW, cfg) { + uint16_t minx = 0, miny = 0, maxx, maxy; - if (PAN_ARCH == 4) { - maxx = fb->width - 1; - maxy = fb->height - 1; - } else { - /* Align on 32x32 tiles */ - minx = fb->extent.minx & ~31; - miny = fb->extent.miny & ~31; - maxx = MIN2(ALIGN_POT(fb->extent.maxx + 1, 32), fb->width) - 1; - maxy = MIN2(ALIGN_POT(fb->extent.maxy + 1, 32), fb->height) - 1; - } + if (PAN_ARCH == 4) { + maxx = fb->width - 1; + maxy = fb->height - 1; + } else { + /* Align on 32x32 tiles */ + minx = fb->extent.minx & ~31; + miny = fb->extent.miny & ~31; + maxx = MIN2(ALIGN_POT(fb->extent.maxx + 1, 32), fb->width) - 1; + maxy = MIN2(ALIGN_POT(fb->extent.maxy + 1, 32), fb->height) - 1; + } - cfg.thread_storage = tsd; - cfg.state = pan_blitter_get_rsd(pool->dev, &views); + cfg.thread_storage = tsd; + cfg.state = pan_blitter_get_rsd(pool->dev, &views); - cfg.position = coordinates; - cfg.viewport = - pan_blitter_emit_viewport(pool, minx, miny, maxx, maxy); + cfg.position = coordinates; + cfg.viewport = pan_blitter_emit_viewport(pool, minx, miny, maxx, maxy); - cfg.varyings = varyings; - cfg.varying_buffers = varying_buffers; - cfg.textures = textures; - cfg.samplers = samplers; + cfg.varyings = varyings; + cfg.varying_buffers = varying_buffers; + cfg.textures = textures; + cfg.samplers = samplers; #if PAN_ARCH >= 6 - cfg.clean_fragment_write = clean_fragment_write; + cfg.clean_fragment_write = clean_fragment_write; #endif - } + } #else - struct panfrost_ptr T; - unsigned nr_tables = 12; + struct panfrost_ptr T; + unsigned nr_tables = 12; - /* Although individual resources need only 16 byte alignment, the - * resource table as a whole must be 64-byte aligned. - */ - T = pan_pool_alloc_aligned(pool, nr_tables * pan_size(RESOURCE), 64); - memset(T.cpu, 0, nr_tables * pan_size(RESOURCE)); + /* Although individual resources need only 16 byte alignment, the + * resource table as a whole must be 64-byte aligned. 
+ */ + T = pan_pool_alloc_aligned(pool, nr_tables * pan_size(RESOURCE), 64); + memset(T.cpu, 0, nr_tables * pan_size(RESOURCE)); - panfrost_make_resource_table(T, PAN_TABLE_TEXTURE, textures, tex_count); - panfrost_make_resource_table(T, PAN_TABLE_SAMPLER, samplers, 1); - panfrost_make_resource_table(T, PAN_TABLE_ATTRIBUTE, varyings, 1); - panfrost_make_resource_table(T, PAN_TABLE_ATTRIBUTE_BUFFER, varying_buffers, 1); + panfrost_make_resource_table(T, PAN_TABLE_TEXTURE, textures, tex_count); + panfrost_make_resource_table(T, PAN_TABLE_SAMPLER, samplers, 1); + panfrost_make_resource_table(T, PAN_TABLE_ATTRIBUTE, varyings, 1); + panfrost_make_resource_table(T, PAN_TABLE_ATTRIBUTE_BUFFER, varying_buffers, + 1); - struct pan_blit_shader_key key = pan_blitter_get_key(&views); - const struct pan_blit_shader_data *blit_shader = - pan_blitter_get_blit_shader(pool->dev, &key); + struct pan_blit_shader_key key = pan_blitter_get_key(&views); + const struct pan_blit_shader_data *blit_shader = + pan_blitter_get_blit_shader(pool->dev, &key); - bool z = fb->zs.preload.z; - bool s = fb->zs.preload.s; - bool ms = pan_blitter_is_ms(&views); + bool z = fb->zs.preload.z; + bool s = fb->zs.preload.s; + bool ms = pan_blitter_is_ms(&views); - struct panfrost_ptr spd = pan_pool_alloc_desc(pool, SHADER_PROGRAM); - pan_pack(spd.cpu, SHADER_PROGRAM, cfg) { - cfg.stage = MALI_SHADER_STAGE_FRAGMENT; - cfg.primary_shader = true; - cfg.register_allocation = MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD; - cfg.binary = blit_shader->address; - cfg.preload.r48_r63 = blit_shader->info.preload >> 48; - } + struct panfrost_ptr spd = pan_pool_alloc_desc(pool, SHADER_PROGRAM); + pan_pack(spd.cpu, SHADER_PROGRAM, cfg) { + cfg.stage = MALI_SHADER_STAGE_FRAGMENT; + cfg.primary_shader = true; + cfg.register_allocation = MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD; + cfg.binary = blit_shader->address; + cfg.preload.r48_r63 = blit_shader->info.preload >> 48; + } - unsigned bd_count = views.rt_count; - struct panfrost_ptr blend = pan_pool_alloc_desc_array(pool, bd_count, BLEND); + unsigned bd_count = views.rt_count; + struct panfrost_ptr blend = pan_pool_alloc_desc_array(pool, bd_count, BLEND); - if (!zs) { - pan_blitter_emit_blends(pool->dev, blit_shader, &views, NULL, - blend.cpu); - } + if (!zs) { + pan_blitter_emit_blends(pool->dev, blit_shader, &views, NULL, blend.cpu); + } - pan_pack(out, DRAW, cfg) { - if (zs) { - /* ZS_EMIT requires late update/kill */ - cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE; - cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_LATE; - cfg.blend_count = 0; - } else { - /* Skipping ATEST requires forcing Z/S */ - cfg.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY; - cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY; + pan_pack(out, DRAW, cfg) { + if (zs) { + /* ZS_EMIT requires late update/kill */ + cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE; + cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_LATE; + cfg.blend_count = 0; + } else { + /* Skipping ATEST requires forcing Z/S */ + cfg.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY; + cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY; - cfg.blend = blend.gpu; - cfg.blend_count = bd_count; - cfg.render_target_mask = 0x1; - } + cfg.blend = blend.gpu; + cfg.blend_count = bd_count; + cfg.render_target_mask = 0x1; + } - cfg.allow_forward_pixel_to_kill = !zs; - cfg.allow_forward_pixel_to_be_killed = true; - cfg.depth_stencil = pan_blitter_emit_zs(pool, z, s); - cfg.sample_mask = 0xFFFF; - cfg.multisample_enable = ms; - 
cfg.evaluate_per_sample = ms; - cfg.maximum_z = 1.0; - cfg.clean_fragment_write = clean_fragment_write; - cfg.shader.resources = T.gpu | nr_tables; - cfg.shader.shader = spd.gpu; - cfg.shader.thread_storage = tsd; - } + cfg.allow_forward_pixel_to_kill = !zs; + cfg.allow_forward_pixel_to_be_killed = true; + cfg.depth_stencil = pan_blitter_emit_zs(pool, z, s); + cfg.sample_mask = 0xFFFF; + cfg.multisample_enable = ms; + cfg.evaluate_per_sample = ms; + cfg.maximum_z = 1.0; + cfg.clean_fragment_write = clean_fragment_write; + cfg.shader.resources = T.gpu | nr_tables; + cfg.shader.shader = spd.gpu; + cfg.shader.thread_storage = tsd; + } #endif } #if PAN_ARCH <= 7 static void * pan_blit_emit_tiler_job(struct pan_pool *pool, - struct pan_scoreboard *scoreboard, - mali_ptr tiler, + struct pan_scoreboard *scoreboard, mali_ptr tiler, struct panfrost_ptr *job) { - *job = pan_pool_alloc_desc(pool, TILER_JOB); + *job = pan_pool_alloc_desc(pool, TILER_JOB); - pan_section_pack(job->cpu, TILER_JOB, PRIMITIVE, cfg) { - cfg.draw_mode = MALI_DRAW_MODE_TRIANGLE_STRIP; - cfg.index_count = 4; - cfg.job_task_split = 6; - } + pan_section_pack(job->cpu, TILER_JOB, PRIMITIVE, cfg) { + cfg.draw_mode = MALI_DRAW_MODE_TRIANGLE_STRIP; + cfg.index_count = 4; + cfg.job_task_split = 6; + } - pan_section_pack(job->cpu, TILER_JOB, PRIMITIVE_SIZE, cfg) { - cfg.constant = 1.0f; - } + pan_section_pack(job->cpu, TILER_JOB, PRIMITIVE_SIZE, cfg) { + cfg.constant = 1.0f; + } - void *invoc = pan_section_ptr(job->cpu, TILER_JOB, INVOCATION); - panfrost_pack_work_groups_compute(invoc, 1, 4, 1, 1, 1, 1, true, false); + void *invoc = pan_section_ptr(job->cpu, TILER_JOB, INVOCATION); + panfrost_pack_work_groups_compute(invoc, 1, 4, 1, 1, 1, 1, true, false); #if PAN_ARCH >= 6 - pan_section_pack(job->cpu, TILER_JOB, PADDING, cfg); - pan_section_pack(job->cpu, TILER_JOB, TILER, cfg) { - cfg.address = tiler; - } + pan_section_pack(job->cpu, TILER_JOB, PADDING, cfg) + ; + pan_section_pack(job->cpu, TILER_JOB, TILER, cfg) { + cfg.address = tiler; + } #endif - panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_TILER, - false, false, 0, 0, job, false); - return pan_section_ptr(job->cpu, TILER_JOB, DRAW); + panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_TILER, false, false, 0, 0, + job, false); + return pan_section_ptr(job->cpu, TILER_JOB, DRAW); } #endif @@ -1221,172 +1227,157 @@ static void pan_preload_fb_alloc_pre_post_dcds(struct pan_pool *desc_pool, struct pan_fb_info *fb) { - if (fb->bifrost.pre_post.dcds.gpu) - return; + if (fb->bifrost.pre_post.dcds.gpu) + return; - fb->bifrost.pre_post.dcds = - pan_pool_alloc_desc_array(desc_pool, 3, DRAW); + fb->bifrost.pre_post.dcds = pan_pool_alloc_desc_array(desc_pool, 3, DRAW); } static void pan_preload_emit_pre_frame_dcd(struct pan_pool *desc_pool, - struct pan_fb_info *fb, bool zs, - mali_ptr coords, mali_ptr tsd) + struct pan_fb_info *fb, bool zs, mali_ptr coords, + mali_ptr tsd) { - unsigned dcd_idx = zs ? 1 : 0; - pan_preload_fb_alloc_pre_post_dcds(desc_pool, fb); - assert(fb->bifrost.pre_post.dcds.cpu); - void *dcd = fb->bifrost.pre_post.dcds.cpu + - (dcd_idx * pan_size(DRAW)); + unsigned dcd_idx = zs ? 1 : 0; + pan_preload_fb_alloc_pre_post_dcds(desc_pool, fb); + assert(fb->bifrost.pre_post.dcds.cpu); + void *dcd = fb->bifrost.pre_post.dcds.cpu + (dcd_idx * pan_size(DRAW)); - /* We only use crc_rt to determine whether to force writes for updating - * the CRCs, so use a conservative tile size (16x16). 
- */ - int crc_rt = GENX(pan_select_crc_rt)(fb, 16 * 16); + /* We only use crc_rt to determine whether to force writes for updating + * the CRCs, so use a conservative tile size (16x16). + */ + int crc_rt = GENX(pan_select_crc_rt)(fb, 16 * 16); - bool always_write = false; + bool always_write = false; - /* If CRC data is currently invalid and this batch will make it valid, - * write even clean tiles to make sure CRC data is updated. */ - if (crc_rt >= 0) { - bool *valid = fb->rts[crc_rt].crc_valid; - bool full = !fb->extent.minx && !fb->extent.miny && - fb->extent.maxx == (fb->width - 1) && - fb->extent.maxy == (fb->height - 1); + /* If CRC data is currently invalid and this batch will make it valid, + * write even clean tiles to make sure CRC data is updated. */ + if (crc_rt >= 0) { + bool *valid = fb->rts[crc_rt].crc_valid; + bool full = !fb->extent.minx && !fb->extent.miny && + fb->extent.maxx == (fb->width - 1) && + fb->extent.maxy == (fb->height - 1); - if (full && !(*valid)) - always_write = true; - } + if (full && !(*valid)) + always_write = true; + } - pan_preload_emit_dcd(desc_pool, fb, zs, coords, tsd, dcd, always_write); - if (zs) { - enum pipe_format fmt = fb->zs.view.zs ? - fb->zs.view.zs->image->layout.format : - fb->zs.view.s->image->layout.format; - bool always = false; + pan_preload_emit_dcd(desc_pool, fb, zs, coords, tsd, dcd, always_write); + if (zs) { + enum pipe_format fmt = fb->zs.view.zs + ? fb->zs.view.zs->image->layout.format + : fb->zs.view.s->image->layout.format; + bool always = false; - /* If we're dealing with a combined ZS resource and only one - * component is cleared, we need to reload the whole surface - * because the zs_clean_pixel_write_enable flag is set in that - * case. - */ - if (util_format_is_depth_and_stencil(fmt) && - fb->zs.clear.z != fb->zs.clear.s) - always = true; + /* If we're dealing with a combined ZS resource and only one + * component is cleared, we need to reload the whole surface + * because the zs_clean_pixel_write_enable flag is set in that + * case. + */ + if (util_format_is_depth_and_stencil(fmt) && + fb->zs.clear.z != fb->zs.clear.s) + always = true; - /* We could use INTERSECT on Bifrost v7 too, but - * EARLY_ZS_ALWAYS has the advantage of reloading the ZS tile - * buffer one or more tiles ahead, making ZS data immediately - * available for any ZS tests taking place in other shaders. - * Thing's haven't been benchmarked to determine what's - * preferable (saving bandwidth vs having ZS preloaded - * earlier), so let's leave it like that for now. - */ - fb->bifrost.pre_post.modes[dcd_idx] = - desc_pool->dev->arch > 6 ? - MALI_PRE_POST_FRAME_SHADER_MODE_EARLY_ZS_ALWAYS : - always ? MALI_PRE_POST_FRAME_SHADER_MODE_ALWAYS : - MALI_PRE_POST_FRAME_SHADER_MODE_INTERSECT; - } else { - fb->bifrost.pre_post.modes[dcd_idx] = - always_write ? MALI_PRE_POST_FRAME_SHADER_MODE_ALWAYS : - MALI_PRE_POST_FRAME_SHADER_MODE_INTERSECT; - } + /* We could use INTERSECT on Bifrost v7 too, but + * EARLY_ZS_ALWAYS has the advantage of reloading the ZS tile + * buffer one or more tiles ahead, making ZS data immediately + * available for any ZS tests taking place in other shaders. + * Thing's haven't been benchmarked to determine what's + * preferable (saving bandwidth vs having ZS preloaded + * earlier), so let's leave it like that for now. + */ + fb->bifrost.pre_post.modes[dcd_idx] = + desc_pool->dev->arch > 6 + ? MALI_PRE_POST_FRAME_SHADER_MODE_EARLY_ZS_ALWAYS + : always ? 
MALI_PRE_POST_FRAME_SHADER_MODE_ALWAYS + : MALI_PRE_POST_FRAME_SHADER_MODE_INTERSECT; + } else { + fb->bifrost.pre_post.modes[dcd_idx] = + always_write ? MALI_PRE_POST_FRAME_SHADER_MODE_ALWAYS + : MALI_PRE_POST_FRAME_SHADER_MODE_INTERSECT; + } } #else static struct panfrost_ptr pan_preload_emit_tiler_job(struct pan_pool *desc_pool, struct pan_scoreboard *scoreboard, - struct pan_fb_info *fb, bool zs, - mali_ptr coords, mali_ptr tsd) + struct pan_fb_info *fb, bool zs, mali_ptr coords, + mali_ptr tsd) { - struct panfrost_ptr job = - pan_pool_alloc_desc(desc_pool, TILER_JOB); + struct panfrost_ptr job = pan_pool_alloc_desc(desc_pool, TILER_JOB); - pan_preload_emit_dcd(desc_pool, fb, zs, coords, tsd, - pan_section_ptr(job.cpu, TILER_JOB, DRAW), - false); + pan_preload_emit_dcd(desc_pool, fb, zs, coords, tsd, + pan_section_ptr(job.cpu, TILER_JOB, DRAW), false); - pan_section_pack(job.cpu, TILER_JOB, PRIMITIVE, cfg) { - cfg.draw_mode = MALI_DRAW_MODE_TRIANGLE_STRIP; - cfg.index_count = 4; - cfg.job_task_split = 6; - } + pan_section_pack(job.cpu, TILER_JOB, PRIMITIVE, cfg) { + cfg.draw_mode = MALI_DRAW_MODE_TRIANGLE_STRIP; + cfg.index_count = 4; + cfg.job_task_split = 6; + } - pan_section_pack(job.cpu, TILER_JOB, PRIMITIVE_SIZE, cfg) { - cfg.constant = 1.0f; - } + pan_section_pack(job.cpu, TILER_JOB, PRIMITIVE_SIZE, cfg) { + cfg.constant = 1.0f; + } - void *invoc = pan_section_ptr(job.cpu, - TILER_JOB, - INVOCATION); - panfrost_pack_work_groups_compute(invoc, 1, 4, - 1, 1, 1, 1, true, false); + void *invoc = pan_section_ptr(job.cpu, TILER_JOB, INVOCATION); + panfrost_pack_work_groups_compute(invoc, 1, 4, 1, 1, 1, 1, true, false); - panfrost_add_job(desc_pool, scoreboard, MALI_JOB_TYPE_TILER, - false, false, 0, 0, &job, true); - return job; + panfrost_add_job(desc_pool, scoreboard, MALI_JOB_TYPE_TILER, false, false, 0, + 0, &job, true); + return job; } #endif static struct panfrost_ptr -pan_preload_fb_part(struct pan_pool *pool, - struct pan_scoreboard *scoreboard, - struct pan_fb_info *fb, bool zs, - mali_ptr coords, mali_ptr tsd, mali_ptr tiler) +pan_preload_fb_part(struct pan_pool *pool, struct pan_scoreboard *scoreboard, + struct pan_fb_info *fb, bool zs, mali_ptr coords, + mali_ptr tsd, mali_ptr tiler) { - struct panfrost_ptr job = { 0 }; + struct panfrost_ptr job = {0}; #if PAN_ARCH >= 6 - pan_preload_emit_pre_frame_dcd(pool, fb, zs, coords, tsd); + pan_preload_emit_pre_frame_dcd(pool, fb, zs, coords, tsd); #else - job = pan_preload_emit_tiler_job(pool, scoreboard, fb, zs, coords, tsd); + job = pan_preload_emit_tiler_job(pool, scoreboard, fb, zs, coords, tsd); #endif - return job; + return job; } unsigned -GENX(pan_preload_fb)(struct pan_pool *pool, - struct pan_scoreboard *scoreboard, - struct pan_fb_info *fb, - mali_ptr tsd, mali_ptr tiler, +GENX(pan_preload_fb)(struct pan_pool *pool, struct pan_scoreboard *scoreboard, + struct pan_fb_info *fb, mali_ptr tsd, mali_ptr tiler, struct panfrost_ptr *jobs) { - bool preload_zs = pan_preload_needed(fb, true); - bool preload_rts = pan_preload_needed(fb, false); - mali_ptr coords; + bool preload_zs = pan_preload_needed(fb, true); + bool preload_rts = pan_preload_needed(fb, false); + mali_ptr coords; - if (!preload_zs && !preload_rts) - return 0; + if (!preload_zs && !preload_rts) + return 0; - float rect[] = { - 0.0, 0.0, 0.0, 1.0, - fb->width, 0.0, 0.0, 1.0, - 0.0, fb->height, 0.0, 1.0, - fb->width, fb->height, 0.0, 1.0, - }; + float rect[] = { + 0.0, 0.0, 0.0, 1.0, fb->width, 0.0, 0.0, 1.0, + 0.0, fb->height, 0.0, 1.0, fb->width, fb->height, 0.0, 
1.0, + }; - coords = pan_pool_upload_aligned(pool, rect, - sizeof(rect), 64); + coords = pan_pool_upload_aligned(pool, rect, sizeof(rect), 64); - unsigned njobs = 0; - if (preload_zs) { - struct panfrost_ptr job = - pan_preload_fb_part(pool, scoreboard, fb, true, - coords, tsd, tiler); - if (jobs && job.cpu) - jobs[njobs++] = job; - } + unsigned njobs = 0; + if (preload_zs) { + struct panfrost_ptr job = + pan_preload_fb_part(pool, scoreboard, fb, true, coords, tsd, tiler); + if (jobs && job.cpu) + jobs[njobs++] = job; + } - if (preload_rts) { - struct panfrost_ptr job = - pan_preload_fb_part(pool, scoreboard, fb, false, - coords, tsd, tiler); - if (jobs && job.cpu) - jobs[njobs++] = job; - } + if (preload_rts) { + struct panfrost_ptr job = + pan_preload_fb_part(pool, scoreboard, fb, false, coords, tsd, tiler); + if (jobs && job.cpu) + jobs[njobs++] = job; + } - return njobs; + return njobs; } #if PAN_ARCH <= 7 @@ -1396,276 +1387,288 @@ GENX(pan_blit_ctx_init)(struct panfrost_device *dev, struct pan_pool *blit_pool, struct pan_blit_context *ctx) { - memset(ctx, 0, sizeof(*ctx)); + memset(ctx, 0, sizeof(*ctx)); - struct pan_image_view sviews[2] = { - { - .format = info->src.planes[0].format, - .image = info->src.planes[0].image, - .dim = info->src.planes[0].image->layout.dim == MALI_TEXTURE_DIMENSION_CUBE ? - MALI_TEXTURE_DIMENSION_2D : info->src.planes[0].image->layout.dim, - .first_level = info->src.level, - .last_level = info->src.level, - .first_layer = info->src.start.layer, - .last_layer = info->src.end.layer, - .swizzle = { - PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, - PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W, - }, - }, - }; + struct pan_image_view sviews[2] = { + { + .format = info->src.planes[0].format, + .image = info->src.planes[0].image, + .dim = + info->src.planes[0].image->layout.dim == MALI_TEXTURE_DIMENSION_CUBE + ? MALI_TEXTURE_DIMENSION_2D + : info->src.planes[0].image->layout.dim, + .first_level = info->src.level, + .last_level = info->src.level, + .first_layer = info->src.start.layer, + .last_layer = info->src.end.layer, + .swizzle = + { + PIPE_SWIZZLE_X, + PIPE_SWIZZLE_Y, + PIPE_SWIZZLE_Z, + PIPE_SWIZZLE_W, + }, + }, + }; - struct pan_image_view dview = { - .format = info->dst.planes[0].format, - .image = info->dst.planes[0].image, - .dim = info->dst.planes[0].image->layout.dim == MALI_TEXTURE_DIMENSION_1D ? - MALI_TEXTURE_DIMENSION_1D : MALI_TEXTURE_DIMENSION_2D, - .first_level = info->dst.level, - .last_level = info->dst.level, - .first_layer = info->dst.start.layer, - .last_layer = info->dst.start.layer, - .swizzle = { - PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, - PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W, - }, - }; + struct pan_image_view dview = { + .format = info->dst.planes[0].format, + .image = info->dst.planes[0].image, + .dim = info->dst.planes[0].image->layout.dim == MALI_TEXTURE_DIMENSION_1D + ? 
MALI_TEXTURE_DIMENSION_1D + : MALI_TEXTURE_DIMENSION_2D, + .first_level = info->dst.level, + .last_level = info->dst.level, + .first_layer = info->dst.start.layer, + .last_layer = info->dst.start.layer, + .swizzle = + { + PIPE_SWIZZLE_X, + PIPE_SWIZZLE_Y, + PIPE_SWIZZLE_Z, + PIPE_SWIZZLE_W, + }, + }; - ctx->src.start.x = info->src.start.x; - ctx->src.start.y = info->src.start.y; - ctx->src.end.x = info->src.end.x; - ctx->src.end.y = info->src.end.y; - ctx->src.dim = sviews[0].dim; + ctx->src.start.x = info->src.start.x; + ctx->src.start.y = info->src.start.y; + ctx->src.end.x = info->src.end.x; + ctx->src.end.y = info->src.end.y; + ctx->src.dim = sviews[0].dim; - if (info->dst.planes[0].image->layout.dim == MALI_TEXTURE_DIMENSION_3D) { - unsigned max_z = u_minify(info->dst.planes[0].image->layout.depth, info->dst.level) - 1; + if (info->dst.planes[0].image->layout.dim == MALI_TEXTURE_DIMENSION_3D) { + unsigned max_z = + u_minify(info->dst.planes[0].image->layout.depth, info->dst.level) - 1; - ctx->z_scale = (float)(info->src.end.z - info->src.start.z) / - (info->dst.end.z - info->dst.start.z); - assert(info->dst.start.z != info->dst.end.z); - if (info->dst.start.z > info->dst.end.z) { - ctx->dst.cur_layer = info->dst.start.z - 1; - ctx->dst.last_layer = info->dst.end.z; - } else { - ctx->dst.cur_layer = info->dst.start.z; - ctx->dst.last_layer = info->dst.end.z - 1; - } - ctx->dst.cur_layer = MIN2(MAX2(ctx->dst.cur_layer, 0), max_z); - ctx->dst.last_layer = MIN2(MAX2(ctx->dst.last_layer, 0), max_z); - ctx->dst.layer_offset = ctx->dst.cur_layer; - } else { - unsigned max_layer = info->dst.planes[0].image->layout.array_size - 1; - ctx->dst.layer_offset = info->dst.start.layer; - ctx->dst.cur_layer = info->dst.start.layer; - ctx->dst.last_layer = MIN2(info->dst.end.layer, max_layer); - ctx->z_scale = 1; - } + ctx->z_scale = (float)(info->src.end.z - info->src.start.z) / + (info->dst.end.z - info->dst.start.z); + assert(info->dst.start.z != info->dst.end.z); + if (info->dst.start.z > info->dst.end.z) { + ctx->dst.cur_layer = info->dst.start.z - 1; + ctx->dst.last_layer = info->dst.end.z; + } else { + ctx->dst.cur_layer = info->dst.start.z; + ctx->dst.last_layer = info->dst.end.z - 1; + } + ctx->dst.cur_layer = MIN2(MAX2(ctx->dst.cur_layer, 0), max_z); + ctx->dst.last_layer = MIN2(MAX2(ctx->dst.last_layer, 0), max_z); + ctx->dst.layer_offset = ctx->dst.cur_layer; + } else { + unsigned max_layer = info->dst.planes[0].image->layout.array_size - 1; + ctx->dst.layer_offset = info->dst.start.layer; + ctx->dst.cur_layer = info->dst.start.layer; + ctx->dst.last_layer = MIN2(info->dst.end.layer, max_layer); + ctx->z_scale = 1; + } - if (sviews[0].dim == MALI_TEXTURE_DIMENSION_3D) { - if (info->src.start.z < info->src.end.z) - ctx->src.z_offset = info->src.start.z + fabs(ctx->z_scale * 0.5f); - else - ctx->src.z_offset = info->src.start.z - fabs(ctx->z_scale * 0.5f); - } else { - ctx->src.layer_offset = info->src.start.layer; - } + if (sviews[0].dim == MALI_TEXTURE_DIMENSION_3D) { + if (info->src.start.z < info->src.end.z) + ctx->src.z_offset = info->src.start.z + fabs(ctx->z_scale * 0.5f); + else + ctx->src.z_offset = info->src.start.z - fabs(ctx->z_scale * 0.5f); + } else { + ctx->src.layer_offset = info->src.start.layer; + } - /* Split depth and stencil */ - if (util_format_is_depth_and_stencil(sviews[0].format)) { - sviews[1] = sviews[0]; - sviews[0].format = util_format_get_depth_only(sviews[0].format); - sviews[1].format = util_format_stencil_only(sviews[1].format); - } else if 
(info->src.planes[1].format) { - sviews[1] = sviews[0]; - sviews[1].format = info->src.planes[1].format; - sviews[1].image = info->src.planes[1].image; - } + /* Split depth and stencil */ + if (util_format_is_depth_and_stencil(sviews[0].format)) { + sviews[1] = sviews[0]; + sviews[0].format = util_format_get_depth_only(sviews[0].format); + sviews[1].format = util_format_stencil_only(sviews[1].format); + } else if (info->src.planes[1].format) { + sviews[1] = sviews[0]; + sviews[1].format = info->src.planes[1].format; + sviews[1].image = info->src.planes[1].image; + } - ctx->rsd = pan_blit_get_rsd(dev, sviews, &dview); + ctx->rsd = pan_blit_get_rsd(dev, sviews, &dview); - ASSERTED unsigned nlayers = info->src.end.layer - info->src.start.layer + 1; + ASSERTED unsigned nlayers = info->src.end.layer - info->src.start.layer + 1; - assert(nlayers == (info->dst.end.layer - info->dst.start.layer + 1)); + assert(nlayers == (info->dst.end.layer - info->dst.start.layer + 1)); - unsigned dst_w = u_minify(info->dst.planes[0].image->layout.width, info->dst.level); - unsigned dst_h = u_minify(info->dst.planes[0].image->layout.height, info->dst.level); - unsigned maxx = MIN2(MAX2(info->dst.start.x, info->dst.end.x), dst_w - 1); - unsigned maxy = MIN2(MAX2(info->dst.start.y, info->dst.end.y), dst_h - 1); - unsigned minx = MAX2(MIN3(info->dst.start.x, info->dst.end.x, maxx), 0); - unsigned miny = MAX2(MIN3(info->dst.start.y, info->dst.end.y, maxy), 0); + unsigned dst_w = + u_minify(info->dst.planes[0].image->layout.width, info->dst.level); + unsigned dst_h = + u_minify(info->dst.planes[0].image->layout.height, info->dst.level); + unsigned maxx = MIN2(MAX2(info->dst.start.x, info->dst.end.x), dst_w - 1); + unsigned maxy = MIN2(MAX2(info->dst.start.y, info->dst.end.y), dst_h - 1); + unsigned minx = MAX2(MIN3(info->dst.start.x, info->dst.end.x, maxx), 0); + unsigned miny = MAX2(MIN3(info->dst.start.y, info->dst.end.y, maxy), 0); - if (info->scissor.enable) { - minx = MAX2(minx, info->scissor.minx); - miny = MAX2(miny, info->scissor.miny); - maxx = MIN2(maxx, info->scissor.maxx); - maxy = MIN2(maxy, info->scissor.maxy); - } + if (info->scissor.enable) { + minx = MAX2(minx, info->scissor.minx); + miny = MAX2(miny, info->scissor.miny); + maxx = MIN2(maxx, info->scissor.maxx); + maxy = MIN2(maxy, info->scissor.maxy); + } - const struct pan_image_view *sview_ptrs[] = { &sviews[0], &sviews[1] }; - unsigned nviews = sviews[1].format ? 2 : 1; + const struct pan_image_view *sview_ptrs[] = {&sviews[0], &sviews[1]}; + unsigned nviews = sviews[1].format ? 
2 : 1; - ctx->textures = pan_blitter_emit_textures(blit_pool, nviews, sview_ptrs); - ctx->samplers = pan_blitter_emit_sampler(blit_pool, info->nearest); + ctx->textures = pan_blitter_emit_textures(blit_pool, nviews, sview_ptrs); + ctx->samplers = pan_blitter_emit_sampler(blit_pool, info->nearest); - ctx->vpd = pan_blitter_emit_viewport(blit_pool, - minx, miny, maxx, maxy); + ctx->vpd = pan_blitter_emit_viewport(blit_pool, minx, miny, maxx, maxy); - float dst_rect[] = { - info->dst.start.x, info->dst.start.y, 0.0, 1.0, - info->dst.end.x, info->dst.start.y, 0.0, 1.0, - info->dst.start.x, info->dst.end.y, 0.0, 1.0, - info->dst.end.x, info->dst.end.y, 0.0, 1.0, - }; + float dst_rect[] = { + info->dst.start.x, info->dst.start.y, 0.0, 1.0, + info->dst.end.x, info->dst.start.y, 0.0, 1.0, + info->dst.start.x, info->dst.end.y, 0.0, 1.0, + info->dst.end.x, info->dst.end.y, 0.0, 1.0, + }; - ctx->position = - pan_pool_upload_aligned(blit_pool, dst_rect, - sizeof(dst_rect), 64); + ctx->position = + pan_pool_upload_aligned(blit_pool, dst_rect, sizeof(dst_rect), 64); } struct panfrost_ptr -GENX(pan_blit)(struct pan_blit_context *ctx, - struct pan_pool *pool, - struct pan_scoreboard *scoreboard, - mali_ptr tsd, mali_ptr tiler) +GENX(pan_blit)(struct pan_blit_context *ctx, struct pan_pool *pool, + struct pan_scoreboard *scoreboard, mali_ptr tsd, mali_ptr tiler) { - if (ctx->dst.cur_layer < 0 || - (ctx->dst.last_layer >= ctx->dst.layer_offset && - ctx->dst.cur_layer > ctx->dst.last_layer) || - (ctx->dst.last_layer < ctx->dst.layer_offset && - ctx->dst.cur_layer < ctx->dst.last_layer)) - return (struct panfrost_ptr){ 0 }; + if (ctx->dst.cur_layer < 0 || + (ctx->dst.last_layer >= ctx->dst.layer_offset && + ctx->dst.cur_layer > ctx->dst.last_layer) || + (ctx->dst.last_layer < ctx->dst.layer_offset && + ctx->dst.cur_layer < ctx->dst.last_layer)) + return (struct panfrost_ptr){0}; - int32_t layer = ctx->dst.cur_layer - ctx->dst.layer_offset; - float src_z; - if (ctx->src.dim == MALI_TEXTURE_DIMENSION_3D) - src_z = (ctx->z_scale * layer) + ctx->src.z_offset; - else - src_z = ctx->src.layer_offset + layer; + int32_t layer = ctx->dst.cur_layer - ctx->dst.layer_offset; + float src_z; + if (ctx->src.dim == MALI_TEXTURE_DIMENSION_3D) + src_z = (ctx->z_scale * layer) + ctx->src.z_offset; + else + src_z = ctx->src.layer_offset + layer; - float src_rect[] = { - ctx->src.start.x, ctx->src.start.y, src_z, 1.0, - ctx->src.end.x, ctx->src.start.y, src_z, 1.0, - ctx->src.start.x, ctx->src.end.y, src_z, 1.0, - ctx->src.end.x, ctx->src.end.y, src_z, 1.0, - }; + float src_rect[] = { + ctx->src.start.x, ctx->src.start.y, src_z, 1.0, + ctx->src.end.x, ctx->src.start.y, src_z, 1.0, + ctx->src.start.x, ctx->src.end.y, src_z, 1.0, + ctx->src.end.x, ctx->src.end.y, src_z, 1.0, + }; - mali_ptr src_coords = - pan_pool_upload_aligned(pool, src_rect, - sizeof(src_rect), 64); + mali_ptr src_coords = + pan_pool_upload_aligned(pool, src_rect, sizeof(src_rect), 64); - struct panfrost_ptr job = { 0 }; - void *dcd = pan_blit_emit_tiler_job(pool, scoreboard, tiler, &job); + struct panfrost_ptr job = {0}; + void *dcd = pan_blit_emit_tiler_job(pool, scoreboard, tiler, &job); - pan_pack(dcd, DRAW, cfg) { - cfg.thread_storage = tsd; - cfg.state = ctx->rsd; + pan_pack(dcd, DRAW, cfg) { + cfg.thread_storage = tsd; + cfg.state = ctx->rsd; - cfg.position = ctx->position; - cfg.varyings = pan_blitter_emit_varying(pool); - cfg.varying_buffers = pan_blitter_emit_varying_buffer(pool, src_coords); - cfg.viewport = ctx->vpd; - cfg.textures = ctx->textures; - 
cfg.samplers = ctx->samplers; - } + cfg.position = ctx->position; + cfg.varyings = pan_blitter_emit_varying(pool); + cfg.varying_buffers = pan_blitter_emit_varying_buffer(pool, src_coords); + cfg.viewport = ctx->vpd; + cfg.textures = ctx->textures; + cfg.samplers = ctx->samplers; + } - return job; + return job; } #endif -static uint32_t pan_blit_shader_key_hash(const void *key) +static uint32_t +pan_blit_shader_key_hash(const void *key) { - return _mesa_hash_data(key, sizeof(struct pan_blit_shader_key)); + return _mesa_hash_data(key, sizeof(struct pan_blit_shader_key)); } -static bool pan_blit_shader_key_equal(const void *a, const void *b) +static bool +pan_blit_shader_key_equal(const void *a, const void *b) { - return !memcmp(a, b, sizeof(struct pan_blit_shader_key)); + return !memcmp(a, b, sizeof(struct pan_blit_shader_key)); } -static uint32_t pan_blit_blend_shader_key_hash(const void *key) +static uint32_t +pan_blit_blend_shader_key_hash(const void *key) { - return _mesa_hash_data(key, sizeof(struct pan_blit_blend_shader_key)); + return _mesa_hash_data(key, sizeof(struct pan_blit_blend_shader_key)); } -static bool pan_blit_blend_shader_key_equal(const void *a, const void *b) +static bool +pan_blit_blend_shader_key_equal(const void *a, const void *b) { - return !memcmp(a, b, sizeof(struct pan_blit_blend_shader_key)); + return !memcmp(a, b, sizeof(struct pan_blit_blend_shader_key)); } -static uint32_t pan_blit_rsd_key_hash(const void *key) +static uint32_t +pan_blit_rsd_key_hash(const void *key) { - return _mesa_hash_data(key, sizeof(struct pan_blit_rsd_key)); + return _mesa_hash_data(key, sizeof(struct pan_blit_rsd_key)); } -static bool pan_blit_rsd_key_equal(const void *a, const void *b) +static bool +pan_blit_rsd_key_equal(const void *a, const void *b) { - return !memcmp(a, b, sizeof(struct pan_blit_rsd_key)); + return !memcmp(a, b, sizeof(struct pan_blit_rsd_key)); } static void pan_blitter_prefill_blit_shader_cache(struct panfrost_device *dev) { - static const struct pan_blit_shader_key prefill[] = { - { - .surfaces[0] = { - .loc = FRAG_RESULT_DEPTH, - .type = nir_type_float32, - .dim = MALI_TEXTURE_DIMENSION_2D, - .src_samples = 1, - .dst_samples = 1, - }, - }, - { - .surfaces[1] = { - .loc = FRAG_RESULT_STENCIL, - .type = nir_type_uint32, - .dim = MALI_TEXTURE_DIMENSION_2D, - .src_samples = 1, - .dst_samples = 1, - }, - }, - { - .surfaces[0] = { - .loc = FRAG_RESULT_DATA0, - .type = nir_type_float32, - .dim = MALI_TEXTURE_DIMENSION_2D, - .src_samples = 1, - .dst_samples = 1, - }, - }, - }; + static const struct pan_blit_shader_key prefill[] = { + { + .surfaces[0] = + { + .loc = FRAG_RESULT_DEPTH, + .type = nir_type_float32, + .dim = MALI_TEXTURE_DIMENSION_2D, + .src_samples = 1, + .dst_samples = 1, + }, + }, + { + .surfaces[1] = + { + .loc = FRAG_RESULT_STENCIL, + .type = nir_type_uint32, + .dim = MALI_TEXTURE_DIMENSION_2D, + .src_samples = 1, + .dst_samples = 1, + }, + }, + { + .surfaces[0] = + { + .loc = FRAG_RESULT_DATA0, + .type = nir_type_float32, + .dim = MALI_TEXTURE_DIMENSION_2D, + .src_samples = 1, + .dst_samples = 1, + }, + }, + }; - for (unsigned i = 0; i < ARRAY_SIZE(prefill); i++) - pan_blitter_get_blit_shader(dev, &prefill[i]); + for (unsigned i = 0; i < ARRAY_SIZE(prefill); i++) + pan_blitter_get_blit_shader(dev, &prefill[i]); } void -GENX(pan_blitter_init)(struct panfrost_device *dev, - struct pan_pool *bin_pool, +GENX(pan_blitter_init)(struct panfrost_device *dev, struct pan_pool *bin_pool, struct pan_pool *desc_pool) { - dev->blitter.shaders.blit = - 
_mesa_hash_table_create(NULL, pan_blit_shader_key_hash, - pan_blit_shader_key_equal); - dev->blitter.shaders.blend = - _mesa_hash_table_create(NULL, pan_blit_blend_shader_key_hash, - pan_blit_blend_shader_key_equal); - dev->blitter.shaders.pool = bin_pool; - pthread_mutex_init(&dev->blitter.shaders.lock, NULL); - pan_blitter_prefill_blit_shader_cache(dev); + dev->blitter.shaders.blit = _mesa_hash_table_create( + NULL, pan_blit_shader_key_hash, pan_blit_shader_key_equal); + dev->blitter.shaders.blend = _mesa_hash_table_create( + NULL, pan_blit_blend_shader_key_hash, pan_blit_blend_shader_key_equal); + dev->blitter.shaders.pool = bin_pool; + pthread_mutex_init(&dev->blitter.shaders.lock, NULL); + pan_blitter_prefill_blit_shader_cache(dev); - dev->blitter.rsds.pool = desc_pool; - dev->blitter.rsds.rsds = - _mesa_hash_table_create(NULL, pan_blit_rsd_key_hash, - pan_blit_rsd_key_equal); - pthread_mutex_init(&dev->blitter.rsds.lock, NULL); + dev->blitter.rsds.pool = desc_pool; + dev->blitter.rsds.rsds = _mesa_hash_table_create(NULL, pan_blit_rsd_key_hash, + pan_blit_rsd_key_equal); + pthread_mutex_init(&dev->blitter.rsds.lock, NULL); } void GENX(pan_blitter_cleanup)(struct panfrost_device *dev) { - _mesa_hash_table_destroy(dev->blitter.shaders.blit, NULL); - _mesa_hash_table_destroy(dev->blitter.shaders.blend, NULL); - pthread_mutex_destroy(&dev->blitter.shaders.lock); - _mesa_hash_table_destroy(dev->blitter.rsds.rsds, NULL); - pthread_mutex_destroy(&dev->blitter.rsds.lock); + _mesa_hash_table_destroy(dev->blitter.shaders.blit, NULL); + _mesa_hash_table_destroy(dev->blitter.shaders.blend, NULL); + pthread_mutex_destroy(&dev->blitter.shaders.lock); + _mesa_hash_table_destroy(dev->blitter.rsds.rsds, NULL); + pthread_mutex_destroy(&dev->blitter.rsds.lock); } diff --git a/src/panfrost/lib/pan_blitter.h b/src/panfrost/lib/pan_blitter.h index cb71161f5c5..6381a90f574 100644 --- a/src/panfrost/lib/pan_blitter.h +++ b/src/panfrost/lib/pan_blitter.h @@ -27,12 +27,12 @@ #include "genxml/gen_macros.h" -#include "panfrost-job.h" +#include "util/format/u_format.h" #include "pan_cs.h" #include "pan_pool.h" #include "pan_texture.h" #include "pan_util.h" -#include "util/format/u_format.h" +#include "panfrost-job.h" struct pan_fb_info; struct pan_scoreboard; @@ -40,90 +40,84 @@ struct pan_pool; struct panfrost_device; struct pan_blit_info { - struct { - struct { - const struct pan_image *image; - enum pipe_format format; - } planes[2]; - unsigned level; - struct { - int32_t x, y, z; - unsigned layer; - } start, end; - } src, dst; - struct { - bool enable; - uint16_t minx, miny, maxx, maxy; - } scissor; - bool nearest; + struct { + struct { + const struct pan_image *image; + enum pipe_format format; + } planes[2]; + unsigned level; + struct { + int32_t x, y, z; + unsigned layer; + } start, end; + } src, dst; + struct { + bool enable; + uint16_t minx, miny, maxx, maxy; + } scissor; + bool nearest; }; struct pan_blit_context { - mali_ptr rsd, vpd; - mali_ptr textures; - mali_ptr samplers; - mali_ptr position; - struct { - enum mali_texture_dimension dim; - struct { - float x, y; - } start, end; - union { - unsigned layer_offset; - float z_offset; - }; - } src; - struct { - int32_t layer_offset; - int32_t cur_layer; - int32_t last_layer; - } dst; - float z_scale; + mali_ptr rsd, vpd; + mali_ptr textures; + mali_ptr samplers; + mali_ptr position; + struct { + enum mali_texture_dimension dim; + struct { + float x, y; + } start, end; + union { + unsigned layer_offset; + float z_offset; + }; + } src; + struct { + 
int32_t layer_offset; + int32_t cur_layer; + int32_t last_layer; + } dst; + float z_scale; }; -void -GENX(pan_blitter_init)(struct panfrost_device *dev, - struct pan_pool *bin_pool, - struct pan_pool *desc_pool); +void GENX(pan_blitter_init)(struct panfrost_device *dev, + struct pan_pool *bin_pool, + struct pan_pool *desc_pool); -void -GENX(pan_blitter_cleanup)(struct panfrost_device *dev); +void GENX(pan_blitter_cleanup)(struct panfrost_device *dev); -unsigned -GENX(pan_preload_fb)(struct pan_pool *desc_pool, - struct pan_scoreboard *scoreboard, - struct pan_fb_info *fb, - mali_ptr tsd, mali_ptr tiler, - struct panfrost_ptr *jobs); +unsigned GENX(pan_preload_fb)(struct pan_pool *desc_pool, + struct pan_scoreboard *scoreboard, + struct pan_fb_info *fb, mali_ptr tsd, + mali_ptr tiler, struct panfrost_ptr *jobs); -void -GENX(pan_blit_ctx_init)(struct panfrost_device *dev, - const struct pan_blit_info *info, - struct pan_pool *blit_pool, - struct pan_blit_context *ctx); +void GENX(pan_blit_ctx_init)(struct panfrost_device *dev, + const struct pan_blit_info *info, + struct pan_pool *blit_pool, + struct pan_blit_context *ctx); static inline bool pan_blit_next_surface(struct pan_blit_context *ctx) { - if (ctx->dst.last_layer < ctx->dst.layer_offset) { - if (ctx->dst.cur_layer <= ctx->dst.last_layer) - return false; + if (ctx->dst.last_layer < ctx->dst.layer_offset) { + if (ctx->dst.cur_layer <= ctx->dst.last_layer) + return false; - ctx->dst.cur_layer--; - } else { - if (ctx->dst.cur_layer >= ctx->dst.last_layer) - return false; + ctx->dst.cur_layer--; + } else { + if (ctx->dst.cur_layer >= ctx->dst.last_layer) + return false; - ctx->dst.cur_layer++; - } + ctx->dst.cur_layer++; + } - return true; + return true; } -struct panfrost_ptr -GENX(pan_blit)(struct pan_blit_context *ctx, - struct pan_pool *pool, - struct pan_scoreboard *scoreboard, - mali_ptr tsd, mali_ptr tiler); +struct panfrost_ptr GENX(pan_blit)(struct pan_blit_context *ctx, + struct pan_pool *pool, + struct pan_scoreboard *scoreboard, + mali_ptr tsd, mali_ptr tiler); #endif diff --git a/src/panfrost/lib/pan_bo.c b/src/panfrost/lib/pan_bo.c index b606d1b0359..145a039092a 100644 --- a/src/panfrost/lib/pan_bo.c +++ b/src/panfrost/lib/pan_bo.c @@ -24,10 +24,10 @@ * Alyssa Rosenzweig */ #include -#include #include -#include #include +#include +#include #include "drm-uapi/panfrost_drm.h" #include "pan_bo.h" @@ -56,53 +56,53 @@ */ static struct panfrost_bo * -panfrost_bo_alloc(struct panfrost_device *dev, size_t size, - uint32_t flags, const char *label) +panfrost_bo_alloc(struct panfrost_device *dev, size_t size, uint32_t flags, + const char *label) { - struct drm_panfrost_create_bo create_bo = { .size = size }; - struct panfrost_bo *bo; - int ret; + struct drm_panfrost_create_bo create_bo = {.size = size}; + struct panfrost_bo *bo; + int ret; - if (dev->kernel_version->version_major > 1 || - dev->kernel_version->version_minor >= 1) { - if (flags & PAN_BO_GROWABLE) - create_bo.flags |= PANFROST_BO_HEAP; - if (!(flags & PAN_BO_EXECUTE)) - create_bo.flags |= PANFROST_BO_NOEXEC; - } + if (dev->kernel_version->version_major > 1 || + dev->kernel_version->version_minor >= 1) { + if (flags & PAN_BO_GROWABLE) + create_bo.flags |= PANFROST_BO_HEAP; + if (!(flags & PAN_BO_EXECUTE)) + create_bo.flags |= PANFROST_BO_NOEXEC; + } - ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_CREATE_BO, &create_bo); - if (ret) { - fprintf(stderr, "DRM_IOCTL_PANFROST_CREATE_BO failed: %m\n"); - return NULL; - } + ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_CREATE_BO, 
&create_bo); + if (ret) { + fprintf(stderr, "DRM_IOCTL_PANFROST_CREATE_BO failed: %m\n"); + return NULL; + } - bo = pan_lookup_bo(dev, create_bo.handle); - assert(!memcmp(bo, &((struct panfrost_bo){}), sizeof(*bo))); + bo = pan_lookup_bo(dev, create_bo.handle); + assert(!memcmp(bo, &((struct panfrost_bo){}), sizeof(*bo))); - bo->size = create_bo.size; - bo->ptr.gpu = create_bo.offset; - bo->gem_handle = create_bo.handle; - bo->flags = flags; - bo->dev = dev; - bo->label = label; - return bo; + bo->size = create_bo.size; + bo->ptr.gpu = create_bo.offset; + bo->gem_handle = create_bo.handle; + bo->flags = flags; + bo->dev = dev; + bo->label = label; + return bo; } static void panfrost_bo_free(struct panfrost_bo *bo) { - struct drm_gem_close gem_close = { .handle = bo->gem_handle }; - int ret; + struct drm_gem_close gem_close = {.handle = bo->gem_handle}; + int ret; - ret = drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_CLOSE, &gem_close); - if (ret) { - fprintf(stderr, "DRM_IOCTL_GEM_CLOSE failed: %m\n"); - assert(0); - } + ret = drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_CLOSE, &gem_close); + if (ret) { + fprintf(stderr, "DRM_IOCTL_GEM_CLOSE failed: %m\n"); + assert(0); + } - /* BO will be freed with the sparse array, but zero to indicate free */ - memset(bo, 0, sizeof(*bo)); + /* BO will be freed with the sparse array, but zero to indicate free */ + memset(bo, 0, sizeof(*bo)); } /* Returns true if the BO is ready, false otherwise. @@ -113,44 +113,44 @@ panfrost_bo_free(struct panfrost_bo *bo) bool panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers) { - struct drm_panfrost_wait_bo req = { - .handle = bo->gem_handle, - .timeout_ns = timeout_ns, - }; - int ret; + struct drm_panfrost_wait_bo req = { + .handle = bo->gem_handle, + .timeout_ns = timeout_ns, + }; + int ret; - /* If the BO has been exported or imported we can't rely on the cached - * state, we need to call the WAIT_BO ioctl. - */ - if (!(bo->flags & PAN_BO_SHARED)) { - /* If ->gpu_access is 0, the BO is idle, no need to wait. */ - if (!bo->gpu_access) - return true; + /* If the BO has been exported or imported we can't rely on the cached + * state, we need to call the WAIT_BO ioctl. + */ + if (!(bo->flags & PAN_BO_SHARED)) { + /* If ->gpu_access is 0, the BO is idle, no need to wait. */ + if (!bo->gpu_access) + return true; - /* If the caller only wants to wait for writers and no - * writes are pending, we don't have to wait. - */ - if (!wait_readers && !(bo->gpu_access & PAN_BO_ACCESS_WRITE)) - return true; - } + /* If the caller only wants to wait for writers and no + * writes are pending, we don't have to wait. + */ + if (!wait_readers && !(bo->gpu_access & PAN_BO_ACCESS_WRITE)) + return true; + } - /* The ioctl returns >= 0 value when the BO we are waiting for is ready - * -1 otherwise. - */ - ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PANFROST_WAIT_BO, &req); - if (ret != -1) { - /* Set gpu_access to 0 so that the next call to bo_wait() - * doesn't have to call the WAIT_BO ioctl. - */ - bo->gpu_access = 0; - return true; - } + /* The ioctl returns >= 0 value when the BO we are waiting for is ready + * -1 otherwise. + */ + ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PANFROST_WAIT_BO, &req); + if (ret != -1) { + /* Set gpu_access to 0 so that the next call to bo_wait() + * doesn't have to call the WAIT_BO ioctl. + */ + bo->gpu_access = 0; + return true; + } - /* If errno is not ETIMEDOUT or EBUSY that means the handle we passed - * is invalid, which shouldn't happen here. 
- */ - assert(errno == ETIMEDOUT || errno == EBUSY); - return false; + /* If errno is not ETIMEDOUT or EBUSY that means the handle we passed + * is invalid, which shouldn't happen here. + */ + assert(errno == ETIMEDOUT || errno == EBUSY); + return false; } /* Helper to calculate the bucket index of a BO */ @@ -158,24 +158,23 @@ panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers) static unsigned pan_bucket_index(unsigned size) { - /* Round down to POT to compute a bucket index */ + /* Round down to POT to compute a bucket index */ - unsigned bucket_index = util_logbase2(size); + unsigned bucket_index = util_logbase2(size); - /* Clamp the bucket index; all huge allocations will be - * sorted into the largest bucket */ + /* Clamp the bucket index; all huge allocations will be + * sorted into the largest bucket */ - bucket_index = CLAMP(bucket_index, MIN_BO_CACHE_BUCKET, - MAX_BO_CACHE_BUCKET); + bucket_index = CLAMP(bucket_index, MIN_BO_CACHE_BUCKET, MAX_BO_CACHE_BUCKET); - /* Reindex from 0 */ - return (bucket_index - MIN_BO_CACHE_BUCKET); + /* Reindex from 0 */ + return (bucket_index - MIN_BO_CACHE_BUCKET); } static struct list_head * pan_bucket(struct panfrost_device *dev, unsigned size) { - return &dev->bo_cache.buckets[pan_bucket_index(size)]; + return &dev->bo_cache.buckets[pan_bucket_index(size)]; } /* Tries to fetch a BO of sufficient size with the appropriate flags from the @@ -184,74 +183,71 @@ pan_bucket(struct panfrost_device *dev, unsigned size) * BO. */ static struct panfrost_bo * -panfrost_bo_cache_fetch(struct panfrost_device *dev, - size_t size, uint32_t flags, const char *label, - bool dontwait) +panfrost_bo_cache_fetch(struct panfrost_device *dev, size_t size, + uint32_t flags, const char *label, bool dontwait) { - pthread_mutex_lock(&dev->bo_cache.lock); - struct list_head *bucket = pan_bucket(dev, size); - struct panfrost_bo *bo = NULL; + pthread_mutex_lock(&dev->bo_cache.lock); + struct list_head *bucket = pan_bucket(dev, size); + struct panfrost_bo *bo = NULL; - /* Iterate the bucket looking for something suitable */ - list_for_each_entry_safe(struct panfrost_bo, entry, bucket, - bucket_link) { - if (entry->size < size || entry->flags != flags) - continue; + /* Iterate the bucket looking for something suitable */ + list_for_each_entry_safe(struct panfrost_bo, entry, bucket, bucket_link) { + if (entry->size < size || entry->flags != flags) + continue; - /* If the oldest BO in the cache is busy, likely so is - * everything newer, so bail. */ - if (!panfrost_bo_wait(entry, dontwait ? 0 : INT64_MAX, - PAN_BO_ACCESS_RW)) - break; + /* If the oldest BO in the cache is busy, likely so is + * everything newer, so bail. */ + if (!panfrost_bo_wait(entry, dontwait ? 0 : INT64_MAX, PAN_BO_ACCESS_RW)) + break; - struct drm_panfrost_madvise madv = { - .handle = entry->gem_handle, - .madv = PANFROST_MADV_WILLNEED, - }; - int ret; + struct drm_panfrost_madvise madv = { + .handle = entry->gem_handle, + .madv = PANFROST_MADV_WILLNEED, + }; + int ret; - /* This one works, splice it out of the cache */ - list_del(&entry->bucket_link); - list_del(&entry->lru_link); + /* This one works, splice it out of the cache */ + list_del(&entry->bucket_link); + list_del(&entry->lru_link); - ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); - if (!ret && !madv.retained) { - panfrost_bo_free(entry); - continue; - } - /* Let's go! 
*/ - bo = entry; - bo->label = label; - break; - } - pthread_mutex_unlock(&dev->bo_cache.lock); + ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); + if (!ret && !madv.retained) { + panfrost_bo_free(entry); + continue; + } + /* Let's go! */ + bo = entry; + bo->label = label; + break; + } + pthread_mutex_unlock(&dev->bo_cache.lock); - return bo; + return bo; } static void panfrost_bo_cache_evict_stale_bos(struct panfrost_device *dev) { - struct timespec time; + struct timespec time; - clock_gettime(CLOCK_MONOTONIC, &time); - list_for_each_entry_safe(struct panfrost_bo, entry, - &dev->bo_cache.lru, lru_link) { - /* We want all entries that have been used more than 1 sec - * ago to be dropped, others can be kept. - * Note the <= 2 check and not <= 1. It's here to account for - * the fact that we're only testing ->tv_sec, not ->tv_nsec. - * That means we might keep entries that are between 1 and 2 - * seconds old, but we don't really care, as long as unused BOs - * are dropped at some point. - */ - if (time.tv_sec - entry->last_used <= 2) - break; + clock_gettime(CLOCK_MONOTONIC, &time); + list_for_each_entry_safe(struct panfrost_bo, entry, &dev->bo_cache.lru, + lru_link) { + /* We want all entries that have been used more than 1 sec + * ago to be dropped, others can be kept. + * Note the <= 2 check and not <= 1. It's here to account for + * the fact that we're only testing ->tv_sec, not ->tv_nsec. + * That means we might keep entries that are between 1 and 2 + * seconds old, but we don't really care, as long as unused BOs + * are dropped at some point. + */ + if (time.tv_sec - entry->last_used <= 2) + break; - list_del(&entry->bucket_link); - list_del(&entry->lru_link); - panfrost_bo_free(entry); - } + list_del(&entry->bucket_link); + list_del(&entry->lru_link); + panfrost_bo_free(entry); + } } /* Tries to add a BO to the cache. Returns if it was @@ -260,43 +256,43 @@ panfrost_bo_cache_evict_stale_bos(struct panfrost_device *dev) static bool panfrost_bo_cache_put(struct panfrost_bo *bo) { - struct panfrost_device *dev = bo->dev; + struct panfrost_device *dev = bo->dev; - if (bo->flags & PAN_BO_SHARED || dev->debug & PAN_DBG_NO_CACHE) - return false; + if (bo->flags & PAN_BO_SHARED || dev->debug & PAN_DBG_NO_CACHE) + return false; - /* Must be first */ - pthread_mutex_lock(&dev->bo_cache.lock); + /* Must be first */ + pthread_mutex_lock(&dev->bo_cache.lock); - struct list_head *bucket = pan_bucket(dev, MAX2(bo->size, 4096)); - struct drm_panfrost_madvise madv; - struct timespec time; + struct list_head *bucket = pan_bucket(dev, MAX2(bo->size, 4096)); + struct drm_panfrost_madvise madv; + struct timespec time; - madv.handle = bo->gem_handle; - madv.madv = PANFROST_MADV_DONTNEED; - madv.retained = 0; + madv.handle = bo->gem_handle; + madv.madv = PANFROST_MADV_DONTNEED; + madv.retained = 0; - drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); + drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); - /* Add us to the bucket */ - list_addtail(&bo->bucket_link, bucket); + /* Add us to the bucket */ + list_addtail(&bo->bucket_link, bucket); - /* Add us to the LRU list and update the last_used field. */ - list_addtail(&bo->lru_link, &dev->bo_cache.lru); - clock_gettime(CLOCK_MONOTONIC, &time); - bo->last_used = time.tv_sec; + /* Add us to the LRU list and update the last_used field. */ + list_addtail(&bo->lru_link, &dev->bo_cache.lru); + clock_gettime(CLOCK_MONOTONIC, &time); + bo->last_used = time.tv_sec; - /* Let's do some cleanup in the BO cache while we hold the - * lock. 
- */ - panfrost_bo_cache_evict_stale_bos(dev); + /* Let's do some cleanup in the BO cache while we hold the + * lock. + */ + panfrost_bo_cache_evict_stale_bos(dev); - /* Update the label to help debug BO cache memory usage issues */ - bo->label = "Unused (BO cache)"; + /* Update the label to help debug BO cache memory usage issues */ + bo->label = "Unused (BO cache)"; - /* Must be last */ - pthread_mutex_unlock(&dev->bo_cache.lock); - return true; + /* Must be last */ + pthread_mutex_unlock(&dev->bo_cache.lock); + return true; } /* Evicts all BOs from the cache. Called during context @@ -306,228 +302,226 @@ panfrost_bo_cache_put(struct panfrost_bo *bo) * OS) */ void -panfrost_bo_cache_evict_all( - struct panfrost_device *dev) +panfrost_bo_cache_evict_all(struct panfrost_device *dev) { - pthread_mutex_lock(&dev->bo_cache.lock); - for (unsigned i = 0; i < ARRAY_SIZE(dev->bo_cache.buckets); ++i) { - struct list_head *bucket = &dev->bo_cache.buckets[i]; + pthread_mutex_lock(&dev->bo_cache.lock); + for (unsigned i = 0; i < ARRAY_SIZE(dev->bo_cache.buckets); ++i) { + struct list_head *bucket = &dev->bo_cache.buckets[i]; - list_for_each_entry_safe(struct panfrost_bo, entry, bucket, - bucket_link) { - list_del(&entry->bucket_link); - list_del(&entry->lru_link); - panfrost_bo_free(entry); - } - } - pthread_mutex_unlock(&dev->bo_cache.lock); + list_for_each_entry_safe(struct panfrost_bo, entry, bucket, bucket_link) { + list_del(&entry->bucket_link); + list_del(&entry->lru_link); + panfrost_bo_free(entry); + } + } + pthread_mutex_unlock(&dev->bo_cache.lock); } void panfrost_bo_mmap(struct panfrost_bo *bo) { - struct drm_panfrost_mmap_bo mmap_bo = { .handle = bo->gem_handle }; - int ret; + struct drm_panfrost_mmap_bo mmap_bo = {.handle = bo->gem_handle}; + int ret; - if (bo->ptr.cpu) - return; + if (bo->ptr.cpu) + return; - ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PANFROST_MMAP_BO, &mmap_bo); - if (ret) { - fprintf(stderr, "DRM_IOCTL_PANFROST_MMAP_BO failed: %m\n"); - assert(0); - } + ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PANFROST_MMAP_BO, &mmap_bo); + if (ret) { + fprintf(stderr, "DRM_IOCTL_PANFROST_MMAP_BO failed: %m\n"); + assert(0); + } - bo->ptr.cpu = os_mmap(NULL, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED, - bo->dev->fd, mmap_bo.offset); - if (bo->ptr.cpu == MAP_FAILED) { - bo->ptr.cpu = NULL; - fprintf(stderr, - "mmap failed: result=%p size=0x%llx fd=%i offset=0x%llx %m\n", - bo->ptr.cpu, (long long)bo->size, bo->dev->fd, - (long long)mmap_bo.offset); - } + bo->ptr.cpu = os_mmap(NULL, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED, + bo->dev->fd, mmap_bo.offset); + if (bo->ptr.cpu == MAP_FAILED) { + bo->ptr.cpu = NULL; + fprintf(stderr, + "mmap failed: result=%p size=0x%llx fd=%i offset=0x%llx %m\n", + bo->ptr.cpu, (long long)bo->size, bo->dev->fd, + (long long)mmap_bo.offset); + } } static void panfrost_bo_munmap(struct panfrost_bo *bo) { - if (!bo->ptr.cpu) - return; + if (!bo->ptr.cpu) + return; - if (os_munmap((void *) (uintptr_t)bo->ptr.cpu, bo->size)) { - perror("munmap"); - abort(); - } + if (os_munmap((void *)(uintptr_t)bo->ptr.cpu, bo->size)) { + perror("munmap"); + abort(); + } - bo->ptr.cpu = NULL; + bo->ptr.cpu = NULL; } struct panfrost_bo * -panfrost_bo_create(struct panfrost_device *dev, size_t size, - uint32_t flags, const char *label) +panfrost_bo_create(struct panfrost_device *dev, size_t size, uint32_t flags, + const char *label) { - struct panfrost_bo *bo; + struct panfrost_bo *bo; - /* Kernel will fail (confusingly) with EPERM otherwise */ - assert(size > 0); + /* Kernel will 
fail (confusingly) with EPERM otherwise */ + assert(size > 0); - /* To maximize BO cache usage, don't allocate tiny BOs */ - size = ALIGN_POT(size, 4096); + /* To maximize BO cache usage, don't allocate tiny BOs */ + size = ALIGN_POT(size, 4096); - /* GROWABLE BOs cannot be mmapped */ - if (flags & PAN_BO_GROWABLE) - assert(flags & PAN_BO_INVISIBLE); + /* GROWABLE BOs cannot be mmapped */ + if (flags & PAN_BO_GROWABLE) + assert(flags & PAN_BO_INVISIBLE); - /* Ideally, we get a BO that's ready in the cache, or allocate a fresh - * BO. If allocation fails, we can try waiting for something in the - * cache. But if there's no nothing suitable, we should flush the cache - * to make space for the new allocation. - */ - bo = panfrost_bo_cache_fetch(dev, size, flags, label, true); - if (!bo) - bo = panfrost_bo_alloc(dev, size, flags, label); - if (!bo) - bo = panfrost_bo_cache_fetch(dev, size, flags, label, false); - if (!bo) { - panfrost_bo_cache_evict_all(dev); - bo = panfrost_bo_alloc(dev, size, flags, label); - } + /* Ideally, we get a BO that's ready in the cache, or allocate a fresh + * BO. If allocation fails, we can try waiting for something in the + * cache. But if there's no nothing suitable, we should flush the cache + * to make space for the new allocation. + */ + bo = panfrost_bo_cache_fetch(dev, size, flags, label, true); + if (!bo) + bo = panfrost_bo_alloc(dev, size, flags, label); + if (!bo) + bo = panfrost_bo_cache_fetch(dev, size, flags, label, false); + if (!bo) { + panfrost_bo_cache_evict_all(dev); + bo = panfrost_bo_alloc(dev, size, flags, label); + } - if (!bo) { - unreachable("BO creation failed. We don't handle that yet."); - return NULL; - } + if (!bo) { + unreachable("BO creation failed. We don't handle that yet."); + return NULL; + } - /* Only mmap now if we know we need to. For CPU-invisible buffers, we - * never map since we don't care about their contents; they're purely - * for GPU-internal use. But we do trace them anyway. */ + /* Only mmap now if we know we need to. For CPU-invisible buffers, we + * never map since we don't care about their contents; they're purely + * for GPU-internal use. But we do trace them anyway. 
*/ - if (!(flags & (PAN_BO_INVISIBLE | PAN_BO_DELAY_MMAP))) - panfrost_bo_mmap(bo); + if (!(flags & (PAN_BO_INVISIBLE | PAN_BO_DELAY_MMAP))) + panfrost_bo_mmap(bo); - p_atomic_set(&bo->refcnt, 1); + p_atomic_set(&bo->refcnt, 1); - if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) { - if (flags & PAN_BO_INVISIBLE) - pandecode_inject_mmap(bo->ptr.gpu, NULL, bo->size, NULL); - else if (!(flags & PAN_BO_DELAY_MMAP)) - pandecode_inject_mmap(bo->ptr.gpu, bo->ptr.cpu, bo->size, NULL); - } + if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) { + if (flags & PAN_BO_INVISIBLE) + pandecode_inject_mmap(bo->ptr.gpu, NULL, bo->size, NULL); + else if (!(flags & PAN_BO_DELAY_MMAP)) + pandecode_inject_mmap(bo->ptr.gpu, bo->ptr.cpu, bo->size, NULL); + } - return bo; + return bo; } void panfrost_bo_reference(struct panfrost_bo *bo) { - if (bo) { - ASSERTED int count = p_atomic_inc_return(&bo->refcnt); - assert(count != 1); - } + if (bo) { + ASSERTED int count = p_atomic_inc_return(&bo->refcnt); + assert(count != 1); + } } void panfrost_bo_unreference(struct panfrost_bo *bo) { - if (!bo) - return; + if (!bo) + return; - /* Don't return to cache if there are still references */ - if (p_atomic_dec_return(&bo->refcnt)) - return; + /* Don't return to cache if there are still references */ + if (p_atomic_dec_return(&bo->refcnt)) + return; - struct panfrost_device *dev = bo->dev; + struct panfrost_device *dev = bo->dev; - pthread_mutex_lock(&dev->bo_map_lock); + pthread_mutex_lock(&dev->bo_map_lock); - /* Someone might have imported this BO while we were waiting for the - * lock, let's make sure it's still not referenced before freeing it. - */ - if (p_atomic_read(&bo->refcnt) == 0) { - /* When the reference count goes to zero, we need to cleanup */ - panfrost_bo_munmap(bo); + /* Someone might have imported this BO while we were waiting for the + * lock, let's make sure it's still not referenced before freeing it. + */ + if (p_atomic_read(&bo->refcnt) == 0) { + /* When the reference count goes to zero, we need to cleanup */ + panfrost_bo_munmap(bo); - if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) - pandecode_inject_free(bo->ptr.gpu, bo->size); + if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) + pandecode_inject_free(bo->ptr.gpu, bo->size); - /* Rather than freeing the BO now, we'll cache the BO for later - * allocations if we're allowed to. - */ - if (!panfrost_bo_cache_put(bo)) - panfrost_bo_free(bo); - - } - pthread_mutex_unlock(&dev->bo_map_lock); + /* Rather than freeing the BO now, we'll cache the BO for later + * allocations if we're allowed to. 
+ */ + if (!panfrost_bo_cache_put(bo)) + panfrost_bo_free(bo); + } + pthread_mutex_unlock(&dev->bo_map_lock); } struct panfrost_bo * panfrost_bo_import(struct panfrost_device *dev, int fd) { - struct panfrost_bo *bo; - struct drm_panfrost_get_bo_offset get_bo_offset = {0,}; - ASSERTED int ret; - unsigned gem_handle; + struct panfrost_bo *bo; + struct drm_panfrost_get_bo_offset get_bo_offset = { + 0, + }; + ASSERTED int ret; + unsigned gem_handle; - ret = drmPrimeFDToHandle(dev->fd, fd, &gem_handle); - assert(!ret); + ret = drmPrimeFDToHandle(dev->fd, fd, &gem_handle); + assert(!ret); - pthread_mutex_lock(&dev->bo_map_lock); - bo = pan_lookup_bo(dev, gem_handle); + pthread_mutex_lock(&dev->bo_map_lock); + bo = pan_lookup_bo(dev, gem_handle); - if (!bo->dev) { - get_bo_offset.handle = gem_handle; - ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_GET_BO_OFFSET, &get_bo_offset); - assert(!ret); + if (!bo->dev) { + get_bo_offset.handle = gem_handle; + ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_GET_BO_OFFSET, &get_bo_offset); + assert(!ret); - bo->dev = dev; - bo->ptr.gpu = (mali_ptr) get_bo_offset.offset; - bo->size = lseek(fd, 0, SEEK_END); - /* Sometimes this can fail and return -1. size of -1 is not - * a nice thing for mmap to try mmap. Be more robust also - * for zero sized maps and fail nicely too - */ - if ((bo->size == 0) || (bo->size == (size_t)-1)) { - pthread_mutex_unlock(&dev->bo_map_lock); - return NULL; - } - bo->flags = PAN_BO_SHARED; - bo->gem_handle = gem_handle; - p_atomic_set(&bo->refcnt, 1); - } else { - /* bo->refcnt == 0 can happen if the BO - * was being released but panfrost_bo_import() acquired the - * lock before panfrost_bo_unreference(). In that case, refcnt - * is 0 and we can't use panfrost_bo_reference() directly, we - * have to re-initialize the refcnt(). - * Note that panfrost_bo_unreference() checks - * refcnt value just after acquiring the lock to - * make sure the object is not freed if panfrost_bo_import() - * acquired it in the meantime. - */ - if (p_atomic_read(&bo->refcnt) == 0) - p_atomic_set(&bo->refcnt, 1); - else - panfrost_bo_reference(bo); - } - pthread_mutex_unlock(&dev->bo_map_lock); + bo->dev = dev; + bo->ptr.gpu = (mali_ptr)get_bo_offset.offset; + bo->size = lseek(fd, 0, SEEK_END); + /* Sometimes this can fail and return -1. size of -1 is not + * a nice thing for mmap to try mmap. Be more robust also + * for zero sized maps and fail nicely too + */ + if ((bo->size == 0) || (bo->size == (size_t)-1)) { + pthread_mutex_unlock(&dev->bo_map_lock); + return NULL; + } + bo->flags = PAN_BO_SHARED; + bo->gem_handle = gem_handle; + p_atomic_set(&bo->refcnt, 1); + } else { + /* bo->refcnt == 0 can happen if the BO + * was being released but panfrost_bo_import() acquired the + * lock before panfrost_bo_unreference(). In that case, refcnt + * is 0 and we can't use panfrost_bo_reference() directly, we + * have to re-initialize the refcnt(). + * Note that panfrost_bo_unreference() checks + * refcnt value just after acquiring the lock to + * make sure the object is not freed if panfrost_bo_import() + * acquired it in the meantime. 
+ */ + if (p_atomic_read(&bo->refcnt) == 0) + p_atomic_set(&bo->refcnt, 1); + else + panfrost_bo_reference(bo); + } + pthread_mutex_unlock(&dev->bo_map_lock); - return bo; + return bo; } int panfrost_bo_export(struct panfrost_bo *bo) { - struct drm_prime_handle args = { - .handle = bo->gem_handle, - .flags = DRM_CLOEXEC, - }; + struct drm_prime_handle args = { + .handle = bo->gem_handle, + .flags = DRM_CLOEXEC, + }; - int ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &args); - if (ret == -1) - return -1; + int ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &args); + if (ret == -1) + return -1; - bo->flags |= PAN_BO_SHARED; - return args.fd; + bo->flags |= PAN_BO_SHARED; + return args.fd; } - diff --git a/src/panfrost/lib/pan_bo.h b/src/panfrost/lib/pan_bo.h index 7d19fba9dfc..4742fec5bd1 100644 --- a/src/panfrost/lib/pan_bo.h +++ b/src/panfrost/lib/pan_bo.h @@ -26,113 +26,106 @@ #ifndef __PAN_BO_H__ #define __PAN_BO_H__ +#include #include "util/list.h" #include "panfrost-job.h" -#include /* Flags for allocated memory */ /* This memory region is executable */ -#define PAN_BO_EXECUTE (1 << 0) +#define PAN_BO_EXECUTE (1 << 0) /* This memory region should be lazily allocated and grow-on-page-fault. Must * be used in conjunction with INVISIBLE */ -#define PAN_BO_GROWABLE (1 << 1) +#define PAN_BO_GROWABLE (1 << 1) /* This memory region should not be mapped to the CPU */ -#define PAN_BO_INVISIBLE (1 << 2) +#define PAN_BO_INVISIBLE (1 << 2) /* This region may not be used immediately and will not mmap on allocate * (semantically distinct from INVISIBLE, which cannot never be mmaped) */ -#define PAN_BO_DELAY_MMAP (1 << 3) +#define PAN_BO_DELAY_MMAP (1 << 3) /* BO is shared across processes (imported or exported) and therefore cannot be * cached locally */ -#define PAN_BO_SHARED (1 << 4) +#define PAN_BO_SHARED (1 << 4) /* GPU access flags */ /* BO is either shared (can be accessed by more than one GPU batch) or private * (reserved by a specific GPU job). */ -#define PAN_BO_ACCESS_PRIVATE (0 << 0) -#define PAN_BO_ACCESS_SHARED (1 << 0) +#define PAN_BO_ACCESS_PRIVATE (0 << 0) +#define PAN_BO_ACCESS_SHARED (1 << 0) /* BO is being read/written by the GPU */ -#define PAN_BO_ACCESS_READ (1 << 1) -#define PAN_BO_ACCESS_WRITE (1 << 2) -#define PAN_BO_ACCESS_RW (PAN_BO_ACCESS_READ | PAN_BO_ACCESS_WRITE) +#define PAN_BO_ACCESS_READ (1 << 1) +#define PAN_BO_ACCESS_WRITE (1 << 2) +#define PAN_BO_ACCESS_RW (PAN_BO_ACCESS_READ | PAN_BO_ACCESS_WRITE) /* BO is accessed by the vertex/tiler job. */ -#define PAN_BO_ACCESS_VERTEX_TILER (1 << 3) +#define PAN_BO_ACCESS_VERTEX_TILER (1 << 3) /* BO is accessed by the fragment job. */ -#define PAN_BO_ACCESS_FRAGMENT (1 << 4) +#define PAN_BO_ACCESS_FRAGMENT (1 << 4) typedef uint8_t pan_bo_access; struct panfrost_device; struct panfrost_ptr { - /* CPU address */ - void *cpu; + /* CPU address */ + void *cpu; - /* GPU address */ - mali_ptr gpu; + /* GPU address */ + mali_ptr gpu; }; struct panfrost_bo { - /* Must be first for casting */ - struct list_head bucket_link; + /* Must be first for casting */ + struct list_head bucket_link; - /* Used to link the BO to the BO cache LRU list. */ - struct list_head lru_link; + /* Used to link the BO to the BO cache LRU list. */ + struct list_head lru_link; - /* Store the time this BO was use last, so the BO cache logic can evict - * stale BOs. - */ - time_t last_used; + /* Store the time this BO was use last, so the BO cache logic can evict + * stale BOs. 
+ */ + time_t last_used; - /* Atomic reference count */ - int32_t refcnt; + /* Atomic reference count */ + int32_t refcnt; - struct panfrost_device *dev; + struct panfrost_device *dev; - /* Mapping for the entire object (all levels) */ - struct panfrost_ptr ptr; + /* Mapping for the entire object (all levels) */ + struct panfrost_ptr ptr; - /* Size of all entire trees */ - size_t size; + /* Size of all entire trees */ + size_t size; - int gem_handle; + int gem_handle; - uint32_t flags; + uint32_t flags; - /* Combination of PAN_BO_ACCESS_{READ,WRITE} flags encoding pending - * GPU accesses to this BO. Useful to avoid calling the WAIT_BO ioctl - * when the BO is idle. - */ - uint32_t gpu_access; + /* Combination of PAN_BO_ACCESS_{READ,WRITE} flags encoding pending + * GPU accesses to this BO. Useful to avoid calling the WAIT_BO ioctl + * when the BO is idle. + */ + uint32_t gpu_access; - /* Human readable description of the BO for debugging. */ - const char *label; + /* Human readable description of the BO for debugging. */ + const char *label; }; -bool -panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers); -void -panfrost_bo_reference(struct panfrost_bo *bo); -void -panfrost_bo_unreference(struct panfrost_bo *bo); -struct panfrost_bo * -panfrost_bo_create(struct panfrost_device *dev, size_t size, - uint32_t flags, const char *label); -void -panfrost_bo_mmap(struct panfrost_bo *bo); -struct panfrost_bo * -panfrost_bo_import(struct panfrost_device *dev, int fd); -int -panfrost_bo_export(struct panfrost_bo *bo); -void -panfrost_bo_cache_evict_all(struct panfrost_device *dev); +bool panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, + bool wait_readers); +void panfrost_bo_reference(struct panfrost_bo *bo); +void panfrost_bo_unreference(struct panfrost_bo *bo); +struct panfrost_bo *panfrost_bo_create(struct panfrost_device *dev, size_t size, + uint32_t flags, const char *label); +void panfrost_bo_mmap(struct panfrost_bo *bo); +struct panfrost_bo *panfrost_bo_import(struct panfrost_device *dev, int fd); +int panfrost_bo_export(struct panfrost_bo *bo); +void panfrost_bo_cache_evict_all(struct panfrost_device *dev); #endif /* __PAN_BO_H__ */ diff --git a/src/panfrost/lib/pan_clear.c b/src/panfrost/lib/pan_clear.c index 4b7a302cf09..b1a8533a8ee 100644 --- a/src/panfrost/lib/pan_clear.c +++ b/src/panfrost/lib/pan_clear.c @@ -26,11 +26,11 @@ #include "genxml/gen_macros.h" #include -#include "pan_util.h" -#include "pan_format.h" #include "gallium/auxiliary/util/u_pack_color.h" -#include "util/rounding.h" #include "util/format_srgb.h" +#include "util/rounding.h" +#include "pan_format.h" +#include "pan_util.h" /* Clear colours are packed as the internal format of the tilebuffer, looked up * in the blendable formats table given the render target format. 
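For context on the packing hunks below: each blendable tile-buffer layout gives every channel m integer and n fractional bits, and float_to_fixed() scales the normalized float by (2^m - 1) << n when dithering (so the fractional bits carry real precision), or rounds against 2^m - 1 and shifts left by n otherwise. The following is a minimal standalone sketch of that conversion, assuming an illustrative 4.4 layout per channel and plain rintf() in place of the driver's round-to-even helper; it is not part of the patch itself.

#include <math.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Same scheme as float_to_fixed() in the hunk below: m integer bits and
 * n fractional bits per channel. rintf() stands in for Mesa's
 * _mesa_roundevenf(); under the default rounding mode the results match
 * for these inputs. */
static uint32_t
to_fixed(float f, unsigned bits_int, unsigned bits_frac, bool dither)
{
   uint32_t m = (1u << bits_int) - 1;

   if (dither)
      return (uint32_t)rintf(f * (float)(m << bits_frac));

   return (uint32_t)rintf(f * (float)m) << bits_frac;
}

int
main(void)
{
   /* Hypothetical 4.4 layout for each of R, G, B, A, packed from bit 0
    * upward into the 32-bit tile-buffer word. */
   const float rgba[4] = {1.0f, 0.5f, 0.25f, 1.0f};
   uint32_t word = 0;

   for (unsigned c = 0; c < 4; ++c)
      word |= to_fixed(rgba[c], 4, 4, true) << (c * 8);

   /* Prints 0xf03c78f0: red 0xf0, green 0x78, blue 0x3c, alpha 0xf0. */
   printf("packed clear word: 0x%08x\n", word);
   return 0;
}

In the driver itself, pan_pack_color_32() in the first hunk below then replicates this 32-bit word four times to fill the packed clear-value slot.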
@@ -49,8 +49,8 @@ static void pan_pack_color_32(uint32_t *packed, uint32_t v) { - for (unsigned i = 0; i < 4; ++i) - packed[i] = v; + for (unsigned i = 0; i < 4; ++i) + packed[i] = v; } /* For m integer bits and n fractional bits, calculate the conversion factor, @@ -61,22 +61,22 @@ pan_pack_color_32(uint32_t *packed, uint32_t v) static inline uint32_t float_to_fixed(float f, unsigned bits_int, unsigned bits_frac, bool dither) { - uint32_t m = (1 << bits_int) - 1; + uint32_t m = (1 << bits_int) - 1; - if (dither) { - float factor = m << bits_frac; - return _mesa_roundevenf(f * factor); - } else { - uint32_t v = _mesa_roundevenf(f * (float) m); - return v << bits_frac; - } + if (dither) { + float factor = m << bits_frac; + return _mesa_roundevenf(f * factor); + } else { + uint32_t v = _mesa_roundevenf(f * (float)m); + return v << bits_frac; + } } struct mali_tib_layout { - unsigned int_r, frac_r; - unsigned int_g, frac_g; - unsigned int_b, frac_b; - unsigned int_a, frac_a; + unsigned int_r, frac_r; + unsigned int_g, frac_g; + unsigned int_b, frac_b; + unsigned int_a, frac_a; }; /* clang-format off */ @@ -93,76 +93,77 @@ static const struct mali_tib_layout tib_layouts[] = { /* Raw values are stored as-is but replicated for multisampling */ static void -pan_pack_raw(uint32_t *packed, const union pipe_color_union *color, enum pipe_format format) +pan_pack_raw(uint32_t *packed, const union pipe_color_union *color, + enum pipe_format format) { - union util_color out = { 0 }; - unsigned size = util_format_get_blocksize(format); - assert(size <= 16); + union util_color out = {0}; + unsigned size = util_format_get_blocksize(format); + assert(size <= 16); - util_pack_color(color->f, format, &out); + util_pack_color(color->f, format, &out); - if (size == 1) { - unsigned s = out.ui[0] | (out.ui[0] << 8); - pan_pack_color_32(packed, s | (s << 16)); - } else if (size == 2) - pan_pack_color_32(packed, out.ui[0] | (out.ui[0] << 16)); - else if (size <= 4) - pan_pack_color_32(packed, out.ui[0]); - else if (size <= 8) { - memcpy(packed + 0, out.ui, 8); - memcpy(packed + 2, out.ui, 8); - } else { - memcpy(packed, out.ui, 16); - } + if (size == 1) { + unsigned s = out.ui[0] | (out.ui[0] << 8); + pan_pack_color_32(packed, s | (s << 16)); + } else if (size == 2) + pan_pack_color_32(packed, out.ui[0] | (out.ui[0] << 16)); + else if (size <= 4) + pan_pack_color_32(packed, out.ui[0]); + else if (size <= 8) { + memcpy(packed + 0, out.ui, 8); + memcpy(packed + 2, out.ui, 8); + } else { + memcpy(packed, out.ui, 16); + } } void pan_pack_color(uint32_t *packed, const union pipe_color_union *color, enum pipe_format format, bool dithered) { - /* Set of blendable formats is common across versions. TODO: v9 */ - enum mali_color_buffer_internal_format internal = - panfrost_blendable_formats_v7[format].internal; + /* Set of blendable formats is common across versions. TODO: v9 */ + enum mali_color_buffer_internal_format internal = + panfrost_blendable_formats_v7[format].internal; - if (internal == MALI_COLOR_BUFFER_INTERNAL_FORMAT_RAW_VALUE) { - pan_pack_raw(packed, color, format); - return; - } + if (internal == MALI_COLOR_BUFFER_INTERNAL_FORMAT_RAW_VALUE) { + pan_pack_raw(packed, color, format); + return; + } - /* Saturate to [0, 1] by definition of UNORM. Prevents overflow. */ - float r = SATURATE(color->f[0]); - float g = SATURATE(color->f[1]); - float b = SATURATE(color->f[2]); - float a = SATURATE(color->f[3]); + /* Saturate to [0, 1] by definition of UNORM. Prevents overflow. 
*/ + float r = SATURATE(color->f[0]); + float g = SATURATE(color->f[1]); + float b = SATURATE(color->f[2]); + float a = SATURATE(color->f[3]); - /* Fill in alpha = 1.0 by default */ - if (!util_format_has_alpha(format)) - a = 1.0; + /* Fill in alpha = 1.0 by default */ + if (!util_format_has_alpha(format)) + a = 1.0; - /* Convert colourspace while we still have floats */ - if (util_format_is_srgb(format)) { - r = util_format_linear_to_srgb_float(r); - g = util_format_linear_to_srgb_float(g); - b = util_format_linear_to_srgb_float(b); - } + /* Convert colourspace while we still have floats */ + if (util_format_is_srgb(format)) { + r = util_format_linear_to_srgb_float(r); + g = util_format_linear_to_srgb_float(g); + b = util_format_linear_to_srgb_float(b); + } - /* Look up the layout of the tilebuffer */ - assert(internal < ARRAY_SIZE(tib_layouts)); - struct mali_tib_layout l = tib_layouts[internal]; + /* Look up the layout of the tilebuffer */ + assert(internal < ARRAY_SIZE(tib_layouts)); + struct mali_tib_layout l = tib_layouts[internal]; - unsigned count_r = l.int_r + l.frac_r; - unsigned count_g = l.int_g + l.frac_g + count_r; - unsigned count_b = l.int_b + l.frac_b + count_g; - ASSERTED unsigned count_a = l.int_a + l.frac_a + count_b; + unsigned count_r = l.int_r + l.frac_r; + unsigned count_g = l.int_g + l.frac_g + count_r; + unsigned count_b = l.int_b + l.frac_b + count_g; + ASSERTED unsigned count_a = l.int_a + l.frac_a + count_b; - /* Must fill the word */ - assert(count_a == 32); + /* Must fill the word */ + assert(count_a == 32); - /* Convert the transformed float colour to the given layout */ - uint32_t ur = float_to_fixed(r, l.int_r, l.frac_r, dithered) << 0; - uint32_t ug = float_to_fixed(g, l.int_g, l.frac_g, dithered) << count_r; - uint32_t ub = float_to_fixed(b, l.int_b, l.frac_b, dithered) << count_g; - uint32_t ua = float_to_fixed(a, l.int_a, l.frac_a, dithered) << count_b; + /* Convert the transformed float colour to the given layout */ + uint32_t ur = float_to_fixed(r, l.int_r, l.frac_r, dithered) << 0; + uint32_t ug = float_to_fixed(g, l.int_g, l.frac_g, dithered) << count_r; + uint32_t ub = float_to_fixed(b, l.int_b, l.frac_b, dithered) << count_g; + uint32_t ua = float_to_fixed(a, l.int_a, l.frac_a, dithered) << count_b; - pan_pack_color_32(packed, ur | ug | ub | ua); + pan_pack_color_32(packed, ur | ug | ub | ua); } diff --git a/src/panfrost/lib/pan_cs.c b/src/panfrost/lib/pan_cs.c index 87587b2c931..45b578dc981 100644 --- a/src/panfrost/lib/pan_cs.c +++ b/src/panfrost/lib/pan_cs.c @@ -27,7 +27,6 @@ #include "util/macros.h" - #include "pan_cs.h" #include "pan_encoder.h" #include "pan_texture.h" @@ -35,270 +34,285 @@ static unsigned mod_to_block_fmt(uint64_t mod) { - switch (mod) { - case DRM_FORMAT_MOD_LINEAR: - return MALI_BLOCK_FORMAT_LINEAR; - case DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED: - return MALI_BLOCK_FORMAT_TILED_U_INTERLEAVED; - default: + switch (mod) { + case DRM_FORMAT_MOD_LINEAR: + return MALI_BLOCK_FORMAT_LINEAR; + case DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED: + return MALI_BLOCK_FORMAT_TILED_U_INTERLEAVED; + default: #if PAN_ARCH >= 5 - if (drm_is_afbc(mod) && !(mod & AFBC_FORMAT_MOD_TILED)) - return MALI_BLOCK_FORMAT_AFBC; + if (drm_is_afbc(mod) && !(mod & AFBC_FORMAT_MOD_TILED)) + return MALI_BLOCK_FORMAT_AFBC; #endif #if PAN_ARCH >= 7 - if (drm_is_afbc(mod) && (mod & AFBC_FORMAT_MOD_TILED)) - return MALI_BLOCK_FORMAT_AFBC_TILED; + if (drm_is_afbc(mod) && (mod & AFBC_FORMAT_MOD_TILED)) + return MALI_BLOCK_FORMAT_AFBC_TILED; #endif - 
unreachable("Unsupported modifer"); - } + unreachable("Unsupported modifer"); + } } static enum mali_msaa mali_sampling_mode(const struct pan_image_view *view) { - if (view->image->layout.nr_samples > 1) { - assert(view->nr_samples == view->image->layout.nr_samples); - assert(view->image->layout.slices[0].surface_stride != 0); - return MALI_MSAA_LAYERED; - } + if (view->image->layout.nr_samples > 1) { + assert(view->nr_samples == view->image->layout.nr_samples); + assert(view->image->layout.slices[0].surface_stride != 0); + return MALI_MSAA_LAYERED; + } - if (view->nr_samples > view->image->layout.nr_samples) { - assert(view->image->layout.nr_samples == 1); - return MALI_MSAA_AVERAGE; - } + if (view->nr_samples > view->image->layout.nr_samples) { + assert(view->image->layout.nr_samples == 1); + return MALI_MSAA_AVERAGE; + } - assert(view->nr_samples == view->image->layout.nr_samples); - assert(view->nr_samples == 1); + assert(view->nr_samples == view->image->layout.nr_samples); + assert(view->nr_samples == 1); - return MALI_MSAA_SINGLE; + return MALI_MSAA_SINGLE; } #if PAN_ARCH >= 5 static inline enum mali_sample_pattern pan_sample_pattern(unsigned samples) { - switch (samples) { - case 1: return MALI_SAMPLE_PATTERN_SINGLE_SAMPLED; - case 4: return MALI_SAMPLE_PATTERN_ROTATED_4X_GRID; - case 8: return MALI_SAMPLE_PATTERN_D3D_8X_GRID; - case 16: return MALI_SAMPLE_PATTERN_D3D_16X_GRID; - default: unreachable("Unsupported sample count"); - } + switch (samples) { + case 1: + return MALI_SAMPLE_PATTERN_SINGLE_SAMPLED; + case 4: + return MALI_SAMPLE_PATTERN_ROTATED_4X_GRID; + case 8: + return MALI_SAMPLE_PATTERN_D3D_8X_GRID; + case 16: + return MALI_SAMPLE_PATTERN_D3D_16X_GRID; + default: + unreachable("Unsupported sample count"); + } } #endif int GENX(pan_select_crc_rt)(const struct pan_fb_info *fb, unsigned tile_size) { - /* Disable CRC when the tile size is not 16x16. In the hardware, CRC - * tiles are the same size as the tiles of the framebuffer. However, - * our code only handles 16x16 tiles. Therefore under the current - * implementation, we must disable CRC when 16x16 tiles are not used. - * - * This may hurt performance. However, smaller tile sizes are rare, and - * CRCs are more expensive at smaller tile sizes, reducing the benefit. - * Restricting CRC to 16x16 should work in practice. - */ - if (tile_size != 16 * 16) { - assert(tile_size < 16 * 16); - return -1; - } + /* Disable CRC when the tile size is not 16x16. In the hardware, CRC + * tiles are the same size as the tiles of the framebuffer. However, + * our code only handles 16x16 tiles. Therefore under the current + * implementation, we must disable CRC when 16x16 tiles are not used. + * + * This may hurt performance. However, smaller tile sizes are rare, and + * CRCs are more expensive at smaller tile sizes, reducing the benefit. + * Restricting CRC to 16x16 should work in practice. 
+ */ + if (tile_size != 16 * 16) { + assert(tile_size < 16 * 16); + return -1; + } #if PAN_ARCH <= 6 - if (fb->rt_count == 1 && fb->rts[0].view && !fb->rts[0].discard && - fb->rts[0].view->image->layout.crc) - return 0; + if (fb->rt_count == 1 && fb->rts[0].view && !fb->rts[0].discard && + fb->rts[0].view->image->layout.crc) + return 0; - return -1; + return -1; #else - bool best_rt_valid = false; - int best_rt = -1; + bool best_rt_valid = false; + int best_rt = -1; - for (unsigned i = 0; i < fb->rt_count; i++) { - if (!fb->rts[i].view || fb->rts[0].discard || - !fb->rts[i].view->image->layout.crc) - continue; + for (unsigned i = 0; i < fb->rt_count; i++) { + if (!fb->rts[i].view || fb->rts[0].discard || + !fb->rts[i].view->image->layout.crc) + continue; - bool valid = *(fb->rts[i].crc_valid); - bool full = !fb->extent.minx && !fb->extent.miny && - fb->extent.maxx == (fb->width - 1) && - fb->extent.maxy == (fb->height - 1); - if (!full && !valid) - continue; + bool valid = *(fb->rts[i].crc_valid); + bool full = !fb->extent.minx && !fb->extent.miny && + fb->extent.maxx == (fb->width - 1) && + fb->extent.maxy == (fb->height - 1); + if (!full && !valid) + continue; - if (best_rt < 0 || (valid && !best_rt_valid)) { - best_rt = i; - best_rt_valid = valid; - } + if (best_rt < 0 || (valid && !best_rt_valid)) { + best_rt = i; + best_rt_valid = valid; + } - if (valid) - break; - } + if (valid) + break; + } - return best_rt; + return best_rt; #endif } static enum mali_zs_format translate_zs_format(enum pipe_format in) { - switch (in) { - case PIPE_FORMAT_Z16_UNORM: return MALI_ZS_FORMAT_D16; - case PIPE_FORMAT_Z24_UNORM_S8_UINT: return MALI_ZS_FORMAT_D24S8; - case PIPE_FORMAT_Z24X8_UNORM: return MALI_ZS_FORMAT_D24X8; - case PIPE_FORMAT_Z32_FLOAT: return MALI_ZS_FORMAT_D32; + switch (in) { + case PIPE_FORMAT_Z16_UNORM: + return MALI_ZS_FORMAT_D16; + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + return MALI_ZS_FORMAT_D24S8; + case PIPE_FORMAT_Z24X8_UNORM: + return MALI_ZS_FORMAT_D24X8; + case PIPE_FORMAT_Z32_FLOAT: + return MALI_ZS_FORMAT_D32; #if PAN_ARCH <= 7 - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: return MALI_ZS_FORMAT_D32_S8X24; + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return MALI_ZS_FORMAT_D32_S8X24; #endif - default: unreachable("Unsupported depth/stencil format."); - } + default: + unreachable("Unsupported depth/stencil format."); + } } #if PAN_ARCH >= 5 static enum mali_s_format translate_s_format(enum pipe_format in) { - switch (in) { - case PIPE_FORMAT_S8_UINT: return MALI_S_FORMAT_S8; - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - case PIPE_FORMAT_X24S8_UINT: - return MALI_S_FORMAT_X24S8; + switch (in) { + case PIPE_FORMAT_S8_UINT: + return MALI_S_FORMAT_S8; + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_X24S8_UINT: + return MALI_S_FORMAT_X24S8; #if PAN_ARCH <= 7 - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - case PIPE_FORMAT_S8X24_UINT: - return MALI_S_FORMAT_S8X24; - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - return MALI_S_FORMAT_X32_S8X24; + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + case PIPE_FORMAT_S8X24_UINT: + return MALI_S_FORMAT_S8X24; + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return MALI_S_FORMAT_X32_S8X24; #endif - default: - unreachable("Unsupported stencil format."); - } + default: + unreachable("Unsupported stencil format."); + } } static void -pan_prepare_s(const struct pan_fb_info *fb, - struct MALI_ZS_CRC_EXTENSION *ext) +pan_prepare_s(const struct pan_fb_info *fb, struct MALI_ZS_CRC_EXTENSION *ext) { - const struct pan_image_view *s = fb->zs.view.s; + const struct pan_image_view *s = 
fb->zs.view.s; - if (!s) - return; + if (!s) + return; - unsigned level = s->first_level; + unsigned level = s->first_level; - ext->s_msaa = mali_sampling_mode(s); + ext->s_msaa = mali_sampling_mode(s); - struct pan_surface surf; - pan_iview_get_surface(s, 0, 0, 0, &surf); + struct pan_surface surf; + pan_iview_get_surface(s, 0, 0, 0, &surf); - assert(s->image->layout.modifier == DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED || - s->image->layout.modifier == DRM_FORMAT_MOD_LINEAR); - ext->s_writeback_base = surf.data; - ext->s_writeback_row_stride = s->image->layout.slices[level].row_stride; - ext->s_writeback_surface_stride = - (s->image->layout.nr_samples > 1) ? - s->image->layout.slices[level].surface_stride : 0; - ext->s_block_format = mod_to_block_fmt(s->image->layout.modifier); - ext->s_write_format = translate_s_format(s->format); + assert(s->image->layout.modifier == + DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED || + s->image->layout.modifier == DRM_FORMAT_MOD_LINEAR); + ext->s_writeback_base = surf.data; + ext->s_writeback_row_stride = s->image->layout.slices[level].row_stride; + ext->s_writeback_surface_stride = + (s->image->layout.nr_samples > 1) + ? s->image->layout.slices[level].surface_stride + : 0; + ext->s_block_format = mod_to_block_fmt(s->image->layout.modifier); + ext->s_write_format = translate_s_format(s->format); } static void -pan_prepare_zs(const struct pan_fb_info *fb, - struct MALI_ZS_CRC_EXTENSION *ext) +pan_prepare_zs(const struct pan_fb_info *fb, struct MALI_ZS_CRC_EXTENSION *ext) { - const struct pan_image_view *zs = fb->zs.view.zs; + const struct pan_image_view *zs = fb->zs.view.zs; - if (!zs) - return; + if (!zs) + return; - unsigned level = zs->first_level; + unsigned level = zs->first_level; - ext->zs_msaa = mali_sampling_mode(zs); + ext->zs_msaa = mali_sampling_mode(zs); - struct pan_surface surf; - pan_iview_get_surface(zs, 0, 0, 0, &surf); - UNUSED const struct pan_image_slice_layout *slice = &zs->image->layout.slices[level]; + struct pan_surface surf; + pan_iview_get_surface(zs, 0, 0, 0, &surf); + UNUSED const struct pan_image_slice_layout *slice = + &zs->image->layout.slices[level]; - if (drm_is_afbc(zs->image->layout.modifier)) { + if (drm_is_afbc(zs->image->layout.modifier)) { #if PAN_ARCH >= 9 - ext->zs_writeback_base = surf.afbc.header; - ext->zs_writeback_row_stride = slice->row_stride; - /* TODO: surface stride? */ - ext->zs_afbc_body_offset = surf.afbc.body - surf.afbc.header; + ext->zs_writeback_base = surf.afbc.header; + ext->zs_writeback_row_stride = slice->row_stride; + /* TODO: surface stride? */ + ext->zs_afbc_body_offset = surf.afbc.body - surf.afbc.header; - /* TODO: stencil AFBC? */ + /* TODO: stencil AFBC? 
*/ #else #if PAN_ARCH >= 6 - ext->zs_afbc_row_stride = pan_afbc_stride_blocks(zs->image->layout.modifier, slice->row_stride); + ext->zs_afbc_row_stride = + pan_afbc_stride_blocks(zs->image->layout.modifier, slice->row_stride); #else - ext->zs_block_format = MALI_BLOCK_FORMAT_AFBC; - ext->zs_afbc_body_size = 0x1000; - ext->zs_afbc_chunk_size = 9; - ext->zs_afbc_sparse = true; + ext->zs_block_format = MALI_BLOCK_FORMAT_AFBC; + ext->zs_afbc_body_size = 0x1000; + ext->zs_afbc_chunk_size = 9; + ext->zs_afbc_sparse = true; #endif - ext->zs_afbc_header = surf.afbc.header; - ext->zs_afbc_body = surf.afbc.body; + ext->zs_afbc_header = surf.afbc.header; + ext->zs_afbc_body = surf.afbc.body; #endif - } else { - assert(zs->image->layout.modifier == DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED || - zs->image->layout.modifier == DRM_FORMAT_MOD_LINEAR); + } else { + assert(zs->image->layout.modifier == + DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED || + zs->image->layout.modifier == DRM_FORMAT_MOD_LINEAR); - /* TODO: Z32F(S8) support, which is always linear */ + /* TODO: Z32F(S8) support, which is always linear */ - ext->zs_writeback_base = surf.data; - ext->zs_writeback_row_stride = - zs->image->layout.slices[level].row_stride; - ext->zs_writeback_surface_stride = - (zs->image->layout.nr_samples > 1) ? - zs->image->layout.slices[level].surface_stride : 0; - } + ext->zs_writeback_base = surf.data; + ext->zs_writeback_row_stride = zs->image->layout.slices[level].row_stride; + ext->zs_writeback_surface_stride = + (zs->image->layout.nr_samples > 1) + ? zs->image->layout.slices[level].surface_stride + : 0; + } - ext->zs_block_format = mod_to_block_fmt(zs->image->layout.modifier); - ext->zs_write_format = translate_zs_format(zs->format); - if (ext->zs_write_format == MALI_ZS_FORMAT_D24S8) - ext->s_writeback_base = ext->zs_writeback_base; + ext->zs_block_format = mod_to_block_fmt(zs->image->layout.modifier); + ext->zs_write_format = translate_zs_format(zs->format); + if (ext->zs_write_format == MALI_ZS_FORMAT_D24S8) + ext->s_writeback_base = ext->zs_writeback_base; } static void pan_prepare_crc(const struct pan_fb_info *fb, int rt_crc, struct MALI_ZS_CRC_EXTENSION *ext) { - if (rt_crc < 0) - return; + if (rt_crc < 0) + return; - assert(rt_crc < fb->rt_count); + assert(rt_crc < fb->rt_count); - const struct pan_image_view *rt = fb->rts[rt_crc].view; - const struct pan_image_slice_layout *slice = &rt->image->layout.slices[rt->first_level]; - ext->crc_base = rt->image->data.bo->ptr.gpu + rt->image->data.offset - + slice->crc.offset; - ext->crc_row_stride = slice->crc.stride; + const struct pan_image_view *rt = fb->rts[rt_crc].view; + const struct pan_image_slice_layout *slice = + &rt->image->layout.slices[rt->first_level]; + ext->crc_base = + rt->image->data.bo->ptr.gpu + rt->image->data.offset + slice->crc.offset; + ext->crc_row_stride = slice->crc.stride; #if PAN_ARCH >= 7 - ext->crc_render_target = rt_crc; + ext->crc_render_target = rt_crc; - if (fb->rts[rt_crc].clear) { - uint32_t clear_val = fb->rts[rt_crc].clear_value[0]; - ext->crc_clear_color = clear_val | 0xc000000000000000 | - (((uint64_t)clear_val & 0xffff) << 32); - } + if (fb->rts[rt_crc].clear) { + uint32_t clear_val = fb->rts[rt_crc].clear_value[0]; + ext->crc_clear_color = clear_val | 0xc000000000000000 | + (((uint64_t)clear_val & 0xffff) << 32); + } #endif } static void -pan_emit_zs_crc_ext(const struct pan_fb_info *fb, int rt_crc, - void *zs_crc_ext) +pan_emit_zs_crc_ext(const struct pan_fb_info *fb, int rt_crc, void *zs_crc_ext) { - 
pan_pack(zs_crc_ext, ZS_CRC_EXTENSION, cfg) { - pan_prepare_crc(fb, rt_crc, &cfg); - cfg.zs_clean_pixel_write_enable = fb->zs.clear.z || fb->zs.clear.s; - pan_prepare_zs(fb, &cfg); - pan_prepare_s(fb, &cfg); - } + pan_pack(zs_crc_ext, ZS_CRC_EXTENSION, cfg) { + pan_prepare_crc(fb, rt_crc, &cfg); + cfg.zs_clean_pixel_write_enable = fb->zs.clear.z || fb->zs.clear.s; + pan_prepare_zs(fb, &cfg); + pan_prepare_s(fb, &cfg); + } } /* Measure format as it appears in the tile buffer */ @@ -306,33 +320,33 @@ pan_emit_zs_crc_ext(const struct pan_fb_info *fb, int rt_crc, static unsigned pan_bytes_per_pixel_tib(enum pipe_format format) { - if (panfrost_blendable_formats_v7[format].internal) { - /* Blendable formats are always 32-bits in the tile buffer, - * extra bits are used as padding or to dither */ - return 4; - } else { - /* Non-blendable formats are raw, rounded up to the nearest - * power-of-two size */ - unsigned bytes = util_format_get_blocksize(format); - return util_next_power_of_two(bytes); - } + if (panfrost_blendable_formats_v7[format].internal) { + /* Blendable formats are always 32-bits in the tile buffer, + * extra bits are used as padding or to dither */ + return 4; + } else { + /* Non-blendable formats are raw, rounded up to the nearest + * power-of-two size */ + unsigned bytes = util_format_get_blocksize(format); + return util_next_power_of_two(bytes); + } } static unsigned pan_cbuf_bytes_per_pixel(const struct pan_fb_info *fb) { - unsigned sum = 0; + unsigned sum = 0; - for (int cb = 0; cb < fb->rt_count; ++cb) { - const struct pan_image_view *rt = fb->rts[cb].view; + for (int cb = 0; cb < fb->rt_count; ++cb) { + const struct pan_image_view *rt = fb->rts[cb].view; - if (!rt) - continue; + if (!rt) + continue; - sum += pan_bytes_per_pixel_tib(rt->format) * rt->nr_samples; - } + sum += pan_bytes_per_pixel_tib(rt->format) * rt->nr_samples; + } - return sum; + return sum; } /* @@ -346,10 +360,10 @@ pan_cbuf_bytes_per_pixel(const struct pan_fb_info *fb) static unsigned pan_select_max_tile_size(unsigned tile_buffer_bytes, unsigned bytes_per_pixel) { - assert(util_is_power_of_two_nonzero(tile_buffer_bytes)); - assert(tile_buffer_bytes >= 1024); + assert(util_is_power_of_two_nonzero(tile_buffer_bytes)); + assert(tile_buffer_bytes >= 1024); - return tile_buffer_bytes >> util_logbase2_ceil(bytes_per_pixel); + return tile_buffer_bytes >> util_logbase2_ceil(bytes_per_pixel); } static enum mali_color_format @@ -382,61 +396,63 @@ static void pan_rt_init_format(const struct pan_image_view *rt, struct MALI_RENDER_TARGET *cfg) { - /* Explode details on the format */ + /* Explode details on the format */ - const struct util_format_description *desc = - util_format_description(rt->format); + const struct util_format_description *desc = + util_format_description(rt->format); - /* The swizzle for rendering is inverted from texturing */ + /* The swizzle for rendering is inverted from texturing */ - unsigned char swizzle[4] = { - PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W, - }; + unsigned char swizzle[4] = { + PIPE_SWIZZLE_X, + PIPE_SWIZZLE_Y, + PIPE_SWIZZLE_Z, + PIPE_SWIZZLE_W, + }; - /* Fill in accordingly, defaulting to 8-bit UNORM */ + /* Fill in accordingly, defaulting to 8-bit UNORM */ - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) - cfg->srgb = true; + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) + cfg->srgb = true; - struct pan_blendable_format fmt = panfrost_blendable_formats_v7[rt->format]; + struct pan_blendable_format fmt = 
panfrost_blendable_formats_v7[rt->format]; - if (fmt.internal) { - cfg->internal_format = fmt.internal; - cfg->writeback_format = fmt.writeback; - panfrost_invert_swizzle(desc->swizzle, swizzle); - } else { - /* Construct RAW internal/writeback, where internal is - * specified logarithmically (round to next power-of-two). - * Offset specified from RAW8, where 8 = 2^3 */ + if (fmt.internal) { + cfg->internal_format = fmt.internal; + cfg->writeback_format = fmt.writeback; + panfrost_invert_swizzle(desc->swizzle, swizzle); + } else { + /* Construct RAW internal/writeback, where internal is + * specified logarithmically (round to next power-of-two). + * Offset specified from RAW8, where 8 = 2^3 */ - unsigned bits = desc->block.bits; - unsigned offset = util_logbase2_ceil(bits) - 3; - assert(offset <= 4); + unsigned bits = desc->block.bits; + unsigned offset = util_logbase2_ceil(bits) - 3; + assert(offset <= 4); - cfg->internal_format = - MALI_COLOR_BUFFER_INTERNAL_FORMAT_RAW8 + offset; + cfg->internal_format = MALI_COLOR_BUFFER_INTERNAL_FORMAT_RAW8 + offset; - cfg->writeback_format = pan_mfbd_raw_format(bits); - } + cfg->writeback_format = pan_mfbd_raw_format(bits); + } - cfg->swizzle = panfrost_translate_swizzle_4(swizzle); + cfg->swizzle = panfrost_translate_swizzle_4(swizzle); } #if PAN_ARCH >= 9 enum mali_afbc_compression_mode pan_afbc_compression_mode(enum pipe_format format) { - /* There's a special case for texturing the stencil part from a combined - * depth/stencil texture, handle it separately. - */ - if (format == PIPE_FORMAT_X24S8_UINT) - return MALI_AFBC_COMPRESSION_MODE_X24S8; + /* There's a special case for texturing the stencil part from a combined + * depth/stencil texture, handle it separately. + */ + if (format == PIPE_FORMAT_X24S8_UINT) + return MALI_AFBC_COMPRESSION_MODE_X24S8; - /* Otherwise, map canonical formats to the hardware enum. This only - * needs to handle the subset of formats returned by - * panfrost_afbc_format. - */ - /* clang-format off */ + /* Otherwise, map canonical formats to the hardware enum. This only + * needs to handle the subset of formats returned by + * panfrost_afbc_format. 
+ */ + /* clang-format off */ switch (panfrost_afbc_format(PAN_ARCH, format)) { case PAN_AFBC_MODE_R8: return MALI_AFBC_COMPRESSION_MODE_R8; case PAN_AFBC_MODE_R8G8: return MALI_AFBC_COMPRESSION_MODE_R8G8; @@ -450,194 +466,186 @@ pan_afbc_compression_mode(enum pipe_format format) case PAN_AFBC_MODE_S8: return MALI_AFBC_COMPRESSION_MODE_S8; case PAN_AFBC_MODE_INVALID: unreachable("Invalid AFBC format"); } - /* clang-format on */ + /* clang-format on */ - unreachable("all AFBC formats handled"); + unreachable("all AFBC formats handled"); } #endif static void -pan_prepare_rt(const struct pan_fb_info *fb, unsigned idx, - unsigned cbuf_offset, +pan_prepare_rt(const struct pan_fb_info *fb, unsigned idx, unsigned cbuf_offset, struct MALI_RENDER_TARGET *cfg) { - cfg->clean_pixel_write_enable = fb->rts[idx].clear; - cfg->internal_buffer_offset = cbuf_offset; - if (fb->rts[idx].clear) { - cfg->clear.color_0 = fb->rts[idx].clear_value[0]; - cfg->clear.color_1 = fb->rts[idx].clear_value[1]; - cfg->clear.color_2 = fb->rts[idx].clear_value[2]; - cfg->clear.color_3 = fb->rts[idx].clear_value[3]; - } + cfg->clean_pixel_write_enable = fb->rts[idx].clear; + cfg->internal_buffer_offset = cbuf_offset; + if (fb->rts[idx].clear) { + cfg->clear.color_0 = fb->rts[idx].clear_value[0]; + cfg->clear.color_1 = fb->rts[idx].clear_value[1]; + cfg->clear.color_2 = fb->rts[idx].clear_value[2]; + cfg->clear.color_3 = fb->rts[idx].clear_value[3]; + } - const struct pan_image_view *rt = fb->rts[idx].view; - if (!rt || fb->rts[idx].discard) { - cfg->internal_format = MALI_COLOR_BUFFER_INTERNAL_FORMAT_R8G8B8A8; - cfg->internal_buffer_offset = cbuf_offset; + const struct pan_image_view *rt = fb->rts[idx].view; + if (!rt || fb->rts[idx].discard) { + cfg->internal_format = MALI_COLOR_BUFFER_INTERNAL_FORMAT_R8G8B8A8; + cfg->internal_buffer_offset = cbuf_offset; #if PAN_ARCH >= 7 - cfg->writeback_block_format = MALI_BLOCK_FORMAT_TILED_U_INTERLEAVED; - cfg->dithering_enable = true; + cfg->writeback_block_format = MALI_BLOCK_FORMAT_TILED_U_INTERLEAVED; + cfg->dithering_enable = true; #endif - return; - } + return; + } - cfg->write_enable = true; - cfg->dithering_enable = true; + cfg->write_enable = true; + cfg->dithering_enable = true; - unsigned level = rt->first_level; - assert(rt->last_level == rt->first_level); - assert(rt->last_layer == rt->first_layer); + unsigned level = rt->first_level; + assert(rt->last_level == rt->first_level); + assert(rt->last_layer == rt->first_layer); - int row_stride = rt->image->layout.slices[level].row_stride; + int row_stride = rt->image->layout.slices[level].row_stride; - /* Only set layer_stride for layered MSAA rendering */ + /* Only set layer_stride for layered MSAA rendering */ - unsigned layer_stride = - (rt->image->layout.nr_samples > 1) ? - rt->image->layout.slices[level].surface_stride : 0; + unsigned layer_stride = (rt->image->layout.nr_samples > 1) + ? 
rt->image->layout.slices[level].surface_stride + : 0; - cfg->writeback_msaa = mali_sampling_mode(rt); + cfg->writeback_msaa = mali_sampling_mode(rt); - pan_rt_init_format(rt, cfg); + pan_rt_init_format(rt, cfg); - cfg->writeback_block_format = mod_to_block_fmt(rt->image->layout.modifier); + cfg->writeback_block_format = mod_to_block_fmt(rt->image->layout.modifier); - struct pan_surface surf; - pan_iview_get_surface(rt, 0, 0, 0, &surf); + struct pan_surface surf; + pan_iview_get_surface(rt, 0, 0, 0, &surf); - if (drm_is_afbc(rt->image->layout.modifier)) { + if (drm_is_afbc(rt->image->layout.modifier)) { #if PAN_ARCH >= 9 - if (rt->image->layout.modifier & AFBC_FORMAT_MOD_YTR) - cfg->afbc.yuv_transform = true; + if (rt->image->layout.modifier & AFBC_FORMAT_MOD_YTR) + cfg->afbc.yuv_transform = true; - cfg->afbc.wide_block = panfrost_afbc_is_wide(rt->image->layout.modifier); - cfg->afbc.header = surf.afbc.header; - cfg->afbc.body_offset = surf.afbc.body - surf.afbc.header; - assert(surf.afbc.body >= surf.afbc.header); + cfg->afbc.wide_block = panfrost_afbc_is_wide(rt->image->layout.modifier); + cfg->afbc.header = surf.afbc.header; + cfg->afbc.body_offset = surf.afbc.body - surf.afbc.header; + assert(surf.afbc.body >= surf.afbc.header); - cfg->afbc.compression_mode = pan_afbc_compression_mode(rt->format); - cfg->afbc.row_stride = row_stride; + cfg->afbc.compression_mode = pan_afbc_compression_mode(rt->format); + cfg->afbc.row_stride = row_stride; #else - const struct pan_image_slice_layout *slice = &rt->image->layout.slices[level]; + const struct pan_image_slice_layout *slice = + &rt->image->layout.slices[level]; #if PAN_ARCH >= 6 - cfg->afbc.row_stride = pan_afbc_stride_blocks(rt->image->layout.modifier, slice->row_stride); - cfg->afbc.afbc_wide_block_enable = - panfrost_afbc_is_wide(rt->image->layout.modifier); + cfg->afbc.row_stride = + pan_afbc_stride_blocks(rt->image->layout.modifier, slice->row_stride); + cfg->afbc.afbc_wide_block_enable = + panfrost_afbc_is_wide(rt->image->layout.modifier); #else - cfg->afbc.chunk_size = 9; - cfg->afbc.sparse = true; - cfg->afbc.body_size = slice->afbc.body_size; + cfg->afbc.chunk_size = 9; + cfg->afbc.sparse = true; + cfg->afbc.body_size = slice->afbc.body_size; #endif - cfg->afbc.header = surf.afbc.header; - cfg->afbc.body = surf.afbc.body; + cfg->afbc.header = surf.afbc.header; + cfg->afbc.body = surf.afbc.body; - if (rt->image->layout.modifier & AFBC_FORMAT_MOD_YTR) - cfg->afbc.yuv_transform_enable = true; + if (rt->image->layout.modifier & AFBC_FORMAT_MOD_YTR) + cfg->afbc.yuv_transform_enable = true; #endif - } else { - assert(rt->image->layout.modifier == DRM_FORMAT_MOD_LINEAR || - rt->image->layout.modifier == DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED); - cfg->rgb.base = surf.data; - cfg->rgb.row_stride = row_stride; - cfg->rgb.surface_stride = layer_stride; - } + } else { + assert(rt->image->layout.modifier == DRM_FORMAT_MOD_LINEAR || + rt->image->layout.modifier == + DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED); + cfg->rgb.base = surf.data; + cfg->rgb.row_stride = row_stride; + cfg->rgb.surface_stride = layer_stride; + } } #endif void -GENX(pan_emit_tls)(const struct pan_tls_info *info, - void *out) +GENX(pan_emit_tls)(const struct pan_tls_info *info, void *out) { - pan_pack(out, LOCAL_STORAGE, cfg) { - if (info->tls.size) { - unsigned shift = - panfrost_get_stack_shift(info->tls.size); + pan_pack(out, LOCAL_STORAGE, cfg) { + if (info->tls.size) { + unsigned shift = panfrost_get_stack_shift(info->tls.size); - cfg.tls_size = shift; + 
cfg.tls_size = shift; #if PAN_ARCH >= 9 - /* For now, always use packed TLS addressing. This is - * better for the cache and requires no fix up code in - * the shader. We may need to revisit this someday for - * OpenCL generic pointer support. - */ - cfg.tls_address_mode = MALI_ADDRESS_MODE_PACKED; + /* For now, always use packed TLS addressing. This is + * better for the cache and requires no fix up code in + * the shader. We may need to revisit this someday for + * OpenCL generic pointer support. + */ + cfg.tls_address_mode = MALI_ADDRESS_MODE_PACKED; - assert((info->tls.ptr & 4095) == 0); - cfg.tls_base_pointer = info->tls.ptr >> 8; + assert((info->tls.ptr & 4095) == 0); + cfg.tls_base_pointer = info->tls.ptr >> 8; #else - cfg.tls_base_pointer = info->tls.ptr; + cfg.tls_base_pointer = info->tls.ptr; #endif - } + } - if (info->wls.size) { - assert(!(info->wls.ptr & 4095)); - assert((info->wls.ptr & 0xffffffff00000000ULL) == ((info->wls.ptr + info->wls.size - 1) & 0xffffffff00000000ULL)); - cfg.wls_base_pointer = info->wls.ptr; - unsigned wls_size = pan_wls_adjust_size(info->wls.size); - cfg.wls_instances = info->wls.instances; - cfg.wls_size_scale = util_logbase2(wls_size) + 1; - } else { - cfg.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM; - } - } + if (info->wls.size) { + assert(!(info->wls.ptr & 4095)); + assert((info->wls.ptr & 0xffffffff00000000ULL) == + ((info->wls.ptr + info->wls.size - 1) & 0xffffffff00000000ULL)); + cfg.wls_base_pointer = info->wls.ptr; + unsigned wls_size = pan_wls_adjust_size(info->wls.size); + cfg.wls_instances = info->wls.instances; + cfg.wls_size_scale = util_logbase2(wls_size) + 1; + } else { + cfg.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM; + } + } } #if PAN_ARCH <= 5 static void pan_emit_midgard_tiler(const struct panfrost_device *dev, const struct pan_fb_info *fb, - const struct pan_tiler_context *tiler_ctx, - void *out) + const struct pan_tiler_context *tiler_ctx, void *out) { - bool hierarchy = !dev->model->quirks.no_hierarchical_tiling; + bool hierarchy = !dev->model->quirks.no_hierarchical_tiling; - assert(tiler_ctx->midgard.polygon_list->ptr.gpu); + assert(tiler_ctx->midgard.polygon_list->ptr.gpu); - pan_pack(out, TILER_CONTEXT, cfg) { - unsigned header_size; + pan_pack(out, TILER_CONTEXT, cfg) { + unsigned header_size; - if (tiler_ctx->midgard.disable) { - cfg.hierarchy_mask = - hierarchy ? - MALI_MIDGARD_TILER_DISABLED : - MALI_MIDGARD_TILER_USER; - header_size = MALI_MIDGARD_TILER_MINIMUM_HEADER_SIZE; - cfg.polygon_list_size = header_size + (hierarchy ? 0 : 4); - cfg.heap_start = tiler_ctx->midgard.polygon_list->ptr.gpu; - cfg.heap_end = tiler_ctx->midgard.polygon_list->ptr.gpu; - } else { - cfg.hierarchy_mask = - panfrost_choose_hierarchy_mask(fb->width, - fb->height, - 1, hierarchy); - header_size = panfrost_tiler_header_size(fb->width, - fb->height, - cfg.hierarchy_mask, - hierarchy); - cfg.polygon_list_size = - panfrost_tiler_full_size(fb->width, fb->height, - cfg.hierarchy_mask, - hierarchy); - cfg.heap_start = dev->tiler_heap->ptr.gpu; - cfg.heap_end = dev->tiler_heap->ptr.gpu + dev->tiler_heap->size; - } + if (tiler_ctx->midgard.disable) { + cfg.hierarchy_mask = + hierarchy ? MALI_MIDGARD_TILER_DISABLED : MALI_MIDGARD_TILER_USER; + header_size = MALI_MIDGARD_TILER_MINIMUM_HEADER_SIZE; + cfg.polygon_list_size = header_size + (hierarchy ? 
0 : 4); + cfg.heap_start = tiler_ctx->midgard.polygon_list->ptr.gpu; + cfg.heap_end = tiler_ctx->midgard.polygon_list->ptr.gpu; + } else { + cfg.hierarchy_mask = + panfrost_choose_hierarchy_mask(fb->width, fb->height, 1, hierarchy); + header_size = panfrost_tiler_header_size( + fb->width, fb->height, cfg.hierarchy_mask, hierarchy); + cfg.polygon_list_size = panfrost_tiler_full_size( + fb->width, fb->height, cfg.hierarchy_mask, hierarchy); + cfg.heap_start = dev->tiler_heap->ptr.gpu; + cfg.heap_end = dev->tiler_heap->ptr.gpu + dev->tiler_heap->size; + } - cfg.polygon_list = tiler_ctx->midgard.polygon_list->ptr.gpu; - cfg.polygon_list_body = cfg.polygon_list + header_size; - } + cfg.polygon_list = tiler_ctx->midgard.polygon_list->ptr.gpu; + cfg.polygon_list_body = cfg.polygon_list + header_size; + } } #endif #if PAN_ARCH >= 5 static void -pan_emit_rt(const struct pan_fb_info *fb, - unsigned idx, unsigned cbuf_offset, void *out) +pan_emit_rt(const struct pan_fb_info *fb, unsigned idx, unsigned cbuf_offset, + void *out) { - pan_pack(out, RENDER_TARGET, cfg) { - pan_prepare_rt(fb, idx, cbuf_offset, &cfg); - } + pan_pack(out, RENDER_TARGET, cfg) { + pan_prepare_rt(fb, idx, cbuf_offset, &cfg); + } } #if PAN_ARCH >= 6 @@ -650,12 +658,13 @@ pan_emit_rt(const struct pan_fb_info *fb, * ignore, this cannot affect correctness, only performance */ static enum mali_pre_post_frame_shader_mode -pan_fix_frame_shader_mode(enum mali_pre_post_frame_shader_mode mode, bool force_clean_tile) +pan_fix_frame_shader_mode(enum mali_pre_post_frame_shader_mode mode, + bool force_clean_tile) { - if (force_clean_tile && mode == MALI_PRE_POST_FRAME_SHADER_MODE_INTERSECT) - return MALI_PRE_POST_FRAME_SHADER_MODE_ALWAYS; - else - return mode; + if (force_clean_tile && mode == MALI_PRE_POST_FRAME_SHADER_MODE_INTERSECT) + return MALI_PRE_POST_FRAME_SHADER_MODE_ALWAYS; + else + return mode; } /* Regardless of clean_tile_write_enable, the hardware writes clean tiles if @@ -665,349 +674,345 @@ pan_fix_frame_shader_mode(enum mali_pre_post_frame_shader_mode mode, bool force_ static bool pan_force_clean_write_rt(const struct pan_image_view *rt, unsigned tile_size) { - if (!drm_is_afbc(rt->image->layout.modifier)) - return false; + if (!drm_is_afbc(rt->image->layout.modifier)) + return false; - unsigned superblock = panfrost_afbc_superblock_width(rt->image->layout.modifier); + unsigned superblock = + panfrost_afbc_superblock_width(rt->image->layout.modifier); - assert(superblock >= 16); - assert(tile_size <= 16*16); + assert(superblock >= 16); + assert(tile_size <= 16 * 16); - /* Tile size and superblock differ unless they are both 16x16 */ - return !(superblock == 16 && tile_size == 16*16); + /* Tile size and superblock differ unless they are both 16x16 */ + return !(superblock == 16 && tile_size == 16 * 16); } static bool pan_force_clean_write(const struct pan_fb_info *fb, unsigned tile_size) { - /* Maximum tile size */ - assert(tile_size <= 16*16); + /* Maximum tile size */ + assert(tile_size <= 16 * 16); - for (unsigned i = 0; i < fb->rt_count; ++i) { - if (fb->rts[i].view && !fb->rts[i].discard && - pan_force_clean_write_rt(fb->rts[i].view, tile_size)) - return true; - } + for (unsigned i = 0; i < fb->rt_count; ++i) { + if (fb->rts[i].view && !fb->rts[i].discard && + pan_force_clean_write_rt(fb->rts[i].view, tile_size)) + return true; + } - if (fb->zs.view.zs && !fb->zs.discard.z && - pan_force_clean_write_rt(fb->zs.view.zs, tile_size)) - return true; + if (fb->zs.view.zs && !fb->zs.discard.z && + 
pan_force_clean_write_rt(fb->zs.view.zs, tile_size)) + return true; - if (fb->zs.view.s && !fb->zs.discard.s && - pan_force_clean_write_rt(fb->zs.view.s, tile_size)) - return true; + if (fb->zs.view.s && !fb->zs.discard.s && + pan_force_clean_write_rt(fb->zs.view.s, tile_size)) + return true; - return false; + return false; } #endif unsigned GENX(pan_emit_fbd)(const struct panfrost_device *dev, - const struct pan_fb_info *fb, - const struct pan_tls_info *tls, - const struct pan_tiler_context *tiler_ctx, - void *out) + const struct pan_fb_info *fb, const struct pan_tls_info *tls, + const struct pan_tiler_context *tiler_ctx, void *out) { - unsigned tags = MALI_FBD_TAG_IS_MFBD; - void *fbd = out; - void *rtd = out + pan_size(FRAMEBUFFER); + unsigned tags = MALI_FBD_TAG_IS_MFBD; + void *fbd = out; + void *rtd = out + pan_size(FRAMEBUFFER); #if PAN_ARCH <= 5 - GENX(pan_emit_tls)(tls, - pan_section_ptr(fbd, FRAMEBUFFER, LOCAL_STORAGE)); + GENX(pan_emit_tls)(tls, pan_section_ptr(fbd, FRAMEBUFFER, LOCAL_STORAGE)); #endif - unsigned bytes_per_pixel = pan_cbuf_bytes_per_pixel(fb); - unsigned tile_size = pan_select_max_tile_size(dev->optimal_tib_size, - bytes_per_pixel); + unsigned bytes_per_pixel = pan_cbuf_bytes_per_pixel(fb); + unsigned tile_size = + pan_select_max_tile_size(dev->optimal_tib_size, bytes_per_pixel); - /* Clamp tile size to hardware limits */ - tile_size = MIN2(tile_size, 16 * 16); - assert(tile_size >= 4 * 4); + /* Clamp tile size to hardware limits */ + tile_size = MIN2(tile_size, 16 * 16); + assert(tile_size >= 4 * 4); - /* Colour buffer allocations must be 1K aligned. */ - unsigned cbuf_allocation = ALIGN_POT(bytes_per_pixel * tile_size, 1024); - assert(cbuf_allocation <= dev->optimal_tib_size && "tile too big"); + /* Colour buffer allocations must be 1K aligned. 
*/ + unsigned cbuf_allocation = ALIGN_POT(bytes_per_pixel * tile_size, 1024); + assert(cbuf_allocation <= dev->optimal_tib_size && "tile too big"); - int crc_rt = GENX(pan_select_crc_rt)(fb, tile_size); - bool has_zs_crc_ext = (fb->zs.view.zs || fb->zs.view.s || crc_rt >= 0); + int crc_rt = GENX(pan_select_crc_rt)(fb, tile_size); + bool has_zs_crc_ext = (fb->zs.view.zs || fb->zs.view.s || crc_rt >= 0); - pan_section_pack(fbd, FRAMEBUFFER, PARAMETERS, cfg) { + pan_section_pack(fbd, FRAMEBUFFER, PARAMETERS, cfg) { #if PAN_ARCH >= 6 - bool force_clean_write = pan_force_clean_write(fb, tile_size); + bool force_clean_write = pan_force_clean_write(fb, tile_size); - cfg.sample_locations = - panfrost_sample_positions(dev, pan_sample_pattern(fb->nr_samples)); - cfg.pre_frame_0 = pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[0], force_clean_write); - cfg.pre_frame_1 = pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[1], force_clean_write); - cfg.post_frame = pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[2], force_clean_write); - cfg.frame_shader_dcds = fb->bifrost.pre_post.dcds.gpu; - cfg.tiler = tiler_ctx->bifrost; + cfg.sample_locations = + panfrost_sample_positions(dev, pan_sample_pattern(fb->nr_samples)); + cfg.pre_frame_0 = pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[0], + force_clean_write); + cfg.pre_frame_1 = pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[1], + force_clean_write); + cfg.post_frame = pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[2], + force_clean_write); + cfg.frame_shader_dcds = fb->bifrost.pre_post.dcds.gpu; + cfg.tiler = tiler_ctx->bifrost; #endif - cfg.width = fb->width; - cfg.height = fb->height; - cfg.bound_max_x = fb->width - 1; - cfg.bound_max_y = fb->height - 1; + cfg.width = fb->width; + cfg.height = fb->height; + cfg.bound_max_x = fb->width - 1; + cfg.bound_max_y = fb->height - 1; - cfg.effective_tile_size = tile_size; - cfg.tie_break_rule = MALI_TIE_BREAK_RULE_MINUS_180_IN_0_OUT; - cfg.render_target_count = MAX2(fb->rt_count, 1); + cfg.effective_tile_size = tile_size; + cfg.tie_break_rule = MALI_TIE_BREAK_RULE_MINUS_180_IN_0_OUT; + cfg.render_target_count = MAX2(fb->rt_count, 1); - /* Default to 24 bit depth if there's no surface. */ - cfg.z_internal_format = - fb->zs.view.zs ? - panfrost_get_z_internal_format(fb->zs.view.zs->format) : - MALI_Z_INTERNAL_FORMAT_D24; + /* Default to 24 bit depth if there's no surface. */ + cfg.z_internal_format = + fb->zs.view.zs ? 
panfrost_get_z_internal_format(fb->zs.view.zs->format) + : MALI_Z_INTERNAL_FORMAT_D24; - cfg.z_clear = fb->zs.clear_value.depth; - cfg.s_clear = fb->zs.clear_value.stencil; - cfg.color_buffer_allocation = cbuf_allocation; - cfg.sample_count = fb->nr_samples; - cfg.sample_pattern = pan_sample_pattern(fb->nr_samples); - cfg.z_write_enable = (fb->zs.view.zs && !fb->zs.discard.z); - cfg.s_write_enable = (fb->zs.view.s && !fb->zs.discard.s); - cfg.has_zs_crc_extension = has_zs_crc_ext; + cfg.z_clear = fb->zs.clear_value.depth; + cfg.s_clear = fb->zs.clear_value.stencil; + cfg.color_buffer_allocation = cbuf_allocation; + cfg.sample_count = fb->nr_samples; + cfg.sample_pattern = pan_sample_pattern(fb->nr_samples); + cfg.z_write_enable = (fb->zs.view.zs && !fb->zs.discard.z); + cfg.s_write_enable = (fb->zs.view.s && !fb->zs.discard.s); + cfg.has_zs_crc_extension = has_zs_crc_ext; - if (crc_rt >= 0) { - bool *valid = fb->rts[crc_rt].crc_valid; - bool full = !fb->extent.minx && !fb->extent.miny && - fb->extent.maxx == (fb->width - 1) && - fb->extent.maxy == (fb->height - 1); + if (crc_rt >= 0) { + bool *valid = fb->rts[crc_rt].crc_valid; + bool full = !fb->extent.minx && !fb->extent.miny && + fb->extent.maxx == (fb->width - 1) && + fb->extent.maxy == (fb->height - 1); - cfg.crc_read_enable = *valid; + cfg.crc_read_enable = *valid; - /* If the data is currently invalid, still write CRC - * data if we are doing a full write, so that it is - * valid for next time. */ - cfg.crc_write_enable = *valid || full; + /* If the data is currently invalid, still write CRC + * data if we are doing a full write, so that it is + * valid for next time. */ + cfg.crc_write_enable = *valid || full; - *valid |= full; - } + *valid |= full; + } #if PAN_ARCH >= 9 - cfg.point_sprite_coord_origin_max_y = fb->sprite_coord_origin; - cfg.first_provoking_vertex = fb->first_provoking_vertex; + cfg.point_sprite_coord_origin_max_y = fb->sprite_coord_origin; + cfg.first_provoking_vertex = fb->first_provoking_vertex; #endif - } + } #if PAN_ARCH >= 6 - pan_section_pack(fbd, FRAMEBUFFER, PADDING, padding); + pan_section_pack(fbd, FRAMEBUFFER, PADDING, padding) + ; #else - pan_emit_midgard_tiler(dev, fb, tiler_ctx, - pan_section_ptr(fbd, FRAMEBUFFER, TILER)); + pan_emit_midgard_tiler(dev, fb, tiler_ctx, + pan_section_ptr(fbd, FRAMEBUFFER, TILER)); - /* All weights set to 0, nothing to do here */ - pan_section_pack(fbd, FRAMEBUFFER, TILER_WEIGHTS, w); + /* All weights set to 0, nothing to do here */ + pan_section_pack(fbd, FRAMEBUFFER, TILER_WEIGHTS, w) + ; #endif - if (has_zs_crc_ext) { - pan_emit_zs_crc_ext(fb, crc_rt, - out + pan_size(FRAMEBUFFER)); - rtd += pan_size(ZS_CRC_EXTENSION); - tags |= MALI_FBD_TAG_HAS_ZS_RT; - } + if (has_zs_crc_ext) { + pan_emit_zs_crc_ext(fb, crc_rt, out + pan_size(FRAMEBUFFER)); + rtd += pan_size(ZS_CRC_EXTENSION); + tags |= MALI_FBD_TAG_HAS_ZS_RT; + } - unsigned rt_count = MAX2(fb->rt_count, 1); - unsigned cbuf_offset = 0; - for (unsigned i = 0; i < rt_count; i++) { - pan_emit_rt(fb, i, cbuf_offset, rtd); - rtd += pan_size(RENDER_TARGET); - if (!fb->rts[i].view) - continue; + unsigned rt_count = MAX2(fb->rt_count, 1); + unsigned cbuf_offset = 0; + for (unsigned i = 0; i < rt_count; i++) { + pan_emit_rt(fb, i, cbuf_offset, rtd); + rtd += pan_size(RENDER_TARGET); + if (!fb->rts[i].view) + continue; - cbuf_offset += pan_bytes_per_pixel_tib(fb->rts[i].view->format) * - tile_size * fb->rts[i].view->image->layout.nr_samples; + cbuf_offset += pan_bytes_per_pixel_tib(fb->rts[i].view->format) * + tile_size * 
fb->rts[i].view->image->layout.nr_samples; - if (i != crc_rt) - *(fb->rts[i].crc_valid) = false; - } - tags |= MALI_POSITIVE(MAX2(fb->rt_count, 1)) << 2; + if (i != crc_rt) + *(fb->rts[i].crc_valid) = false; + } + tags |= MALI_POSITIVE(MAX2(fb->rt_count, 1)) << 2; - return tags; + return tags; } #else /* PAN_ARCH == 4 */ unsigned GENX(pan_emit_fbd)(const struct panfrost_device *dev, - const struct pan_fb_info *fb, - const struct pan_tls_info *tls, - const struct pan_tiler_context *tiler_ctx, - void *fbd) + const struct pan_fb_info *fb, const struct pan_tls_info *tls, + const struct pan_tiler_context *tiler_ctx, void *fbd) { - assert(fb->rt_count <= 1); + assert(fb->rt_count <= 1); - GENX(pan_emit_tls)(tls, - pan_section_ptr(fbd, FRAMEBUFFER, - LOCAL_STORAGE)); - pan_section_pack(fbd, FRAMEBUFFER, PARAMETERS, cfg) { - cfg.bound_max_x = fb->width - 1; - cfg.bound_max_y = fb->height - 1; - cfg.dithering_enable = true; - cfg.clean_pixel_write_enable = true; - cfg.tie_break_rule = MALI_TIE_BREAK_RULE_MINUS_180_IN_0_OUT; - if (fb->rts[0].clear) { - cfg.clear_color_0 = fb->rts[0].clear_value[0]; - cfg.clear_color_1 = fb->rts[0].clear_value[1]; - cfg.clear_color_2 = fb->rts[0].clear_value[2]; - cfg.clear_color_3 = fb->rts[0].clear_value[3]; - } + GENX(pan_emit_tls)(tls, pan_section_ptr(fbd, FRAMEBUFFER, LOCAL_STORAGE)); + pan_section_pack(fbd, FRAMEBUFFER, PARAMETERS, cfg) { + cfg.bound_max_x = fb->width - 1; + cfg.bound_max_y = fb->height - 1; + cfg.dithering_enable = true; + cfg.clean_pixel_write_enable = true; + cfg.tie_break_rule = MALI_TIE_BREAK_RULE_MINUS_180_IN_0_OUT; + if (fb->rts[0].clear) { + cfg.clear_color_0 = fb->rts[0].clear_value[0]; + cfg.clear_color_1 = fb->rts[0].clear_value[1]; + cfg.clear_color_2 = fb->rts[0].clear_value[2]; + cfg.clear_color_3 = fb->rts[0].clear_value[3]; + } - if (fb->zs.clear.z) - cfg.z_clear = fb->zs.clear_value.depth; + if (fb->zs.clear.z) + cfg.z_clear = fb->zs.clear_value.depth; - if (fb->zs.clear.s) - cfg.s_clear = fb->zs.clear_value.stencil; + if (fb->zs.clear.s) + cfg.s_clear = fb->zs.clear_value.stencil; - if (fb->rt_count && fb->rts[0].view) { - const struct pan_image_view *rt = fb->rts[0].view; + if (fb->rt_count && fb->rts[0].view) { + const struct pan_image_view *rt = fb->rts[0].view; - const struct util_format_description *desc = - util_format_description(rt->format); + const struct util_format_description *desc = + util_format_description(rt->format); - /* The swizzle for rendering is inverted from texturing */ - unsigned char swizzle[4]; - panfrost_invert_swizzle(desc->swizzle, swizzle); - cfg.swizzle = panfrost_translate_swizzle_4(swizzle); + /* The swizzle for rendering is inverted from texturing */ + unsigned char swizzle[4]; + panfrost_invert_swizzle(desc->swizzle, swizzle); + cfg.swizzle = panfrost_translate_swizzle_4(swizzle); - struct pan_blendable_format fmt = panfrost_blendable_formats_v7[rt->format]; - if (fmt.internal) { - cfg.internal_format = fmt.internal; - cfg.color_writeback_format = fmt.writeback; - } else { - unreachable("raw formats not finished for SFBD"); - } + struct pan_blendable_format fmt = + panfrost_blendable_formats_v7[rt->format]; + if (fmt.internal) { + cfg.internal_format = fmt.internal; + cfg.color_writeback_format = fmt.writeback; + } else { + unreachable("raw formats not finished for SFBD"); + } - unsigned level = rt->first_level; - struct pan_surface surf; + unsigned level = rt->first_level; + struct pan_surface surf; - pan_iview_get_surface(rt, 0, 0, 0, &surf); + pan_iview_get_surface(rt, 0, 0, 0, &surf); - 
cfg.color_write_enable = !fb->rts[0].discard; - cfg.color_writeback.base = surf.data; - cfg.color_writeback.row_stride = - rt->image->layout.slices[level].row_stride; + cfg.color_write_enable = !fb->rts[0].discard; + cfg.color_writeback.base = surf.data; + cfg.color_writeback.row_stride = + rt->image->layout.slices[level].row_stride; - cfg.color_block_format = mod_to_block_fmt(rt->image->layout.modifier); - assert(cfg.color_block_format == MALI_BLOCK_FORMAT_LINEAR || - cfg.color_block_format == MALI_BLOCK_FORMAT_TILED_U_INTERLEAVED); + cfg.color_block_format = mod_to_block_fmt(rt->image->layout.modifier); + assert(cfg.color_block_format == MALI_BLOCK_FORMAT_LINEAR || + cfg.color_block_format == + MALI_BLOCK_FORMAT_TILED_U_INTERLEAVED); - if (rt->image->layout.crc) { - const struct pan_image_slice_layout *slice = - &rt->image->layout.slices[level]; + if (rt->image->layout.crc) { + const struct pan_image_slice_layout *slice = + &rt->image->layout.slices[level]; - cfg.crc_buffer.row_stride = slice->crc.stride; - cfg.crc_buffer.base = rt->image->data.bo->ptr.gpu + - rt->image->data.offset + - slice->crc.offset; - } - } + cfg.crc_buffer.row_stride = slice->crc.stride; + cfg.crc_buffer.base = rt->image->data.bo->ptr.gpu + + rt->image->data.offset + slice->crc.offset; + } + } - if (fb->zs.view.zs) { - const struct pan_image_view *zs = fb->zs.view.zs; - unsigned level = zs->first_level; - struct pan_surface surf; + if (fb->zs.view.zs) { + const struct pan_image_view *zs = fb->zs.view.zs; + unsigned level = zs->first_level; + struct pan_surface surf; - pan_iview_get_surface(zs, 0, 0, 0, &surf); + pan_iview_get_surface(zs, 0, 0, 0, &surf); - cfg.zs_write_enable = !fb->zs.discard.z; - cfg.zs_writeback.base = surf.data; - cfg.zs_writeback.row_stride = - zs->image->layout.slices[level].row_stride; - cfg.zs_block_format = mod_to_block_fmt(zs->image->layout.modifier); - assert(cfg.zs_block_format == MALI_BLOCK_FORMAT_LINEAR || - cfg.zs_block_format == MALI_BLOCK_FORMAT_TILED_U_INTERLEAVED); + cfg.zs_write_enable = !fb->zs.discard.z; + cfg.zs_writeback.base = surf.data; + cfg.zs_writeback.row_stride = + zs->image->layout.slices[level].row_stride; + cfg.zs_block_format = mod_to_block_fmt(zs->image->layout.modifier); + assert(cfg.zs_block_format == MALI_BLOCK_FORMAT_LINEAR || + cfg.zs_block_format == MALI_BLOCK_FORMAT_TILED_U_INTERLEAVED); - cfg.zs_format = translate_zs_format(zs->format); - } + cfg.zs_format = translate_zs_format(zs->format); + } - cfg.sample_count = fb->nr_samples; + cfg.sample_count = fb->nr_samples; - if (fb->rt_count) - cfg.msaa = mali_sampling_mode(fb->rts[0].view); - } + if (fb->rt_count) + cfg.msaa = mali_sampling_mode(fb->rts[0].view); + } - pan_emit_midgard_tiler(dev, fb, tiler_ctx, - pan_section_ptr(fbd, FRAMEBUFFER, TILER)); + pan_emit_midgard_tiler(dev, fb, tiler_ctx, + pan_section_ptr(fbd, FRAMEBUFFER, TILER)); - /* All weights set to 0, nothing to do here */ - pan_section_pack(fbd, FRAMEBUFFER, TILER_WEIGHTS, w); + /* All weights set to 0, nothing to do here */ + pan_section_pack(fbd, FRAMEBUFFER, TILER_WEIGHTS, w) + ; - pan_section_pack(fbd, FRAMEBUFFER, PADDING_1, padding); - pan_section_pack(fbd, FRAMEBUFFER, PADDING_2, padding); - return 0; + pan_section_pack(fbd, FRAMEBUFFER, PADDING_1, padding) + ; + pan_section_pack(fbd, FRAMEBUFFER, PADDING_2, padding) + ; + return 0; } #endif #if PAN_ARCH >= 6 void -GENX(pan_emit_tiler_heap)(const struct panfrost_device *dev, - void *out) +GENX(pan_emit_tiler_heap)(const struct panfrost_device *dev, void *out) { - pan_pack(out, 
TILER_HEAP, heap) { - heap.size = dev->tiler_heap->size; - heap.base = dev->tiler_heap->ptr.gpu; - heap.bottom = dev->tiler_heap->ptr.gpu; - heap.top = dev->tiler_heap->ptr.gpu + dev->tiler_heap->size; - } + pan_pack(out, TILER_HEAP, heap) { + heap.size = dev->tiler_heap->size; + heap.base = dev->tiler_heap->ptr.gpu; + heap.bottom = dev->tiler_heap->ptr.gpu; + heap.top = dev->tiler_heap->ptr.gpu + dev->tiler_heap->size; + } } void -GENX(pan_emit_tiler_ctx)(const struct panfrost_device *dev, - unsigned fb_width, unsigned fb_height, - unsigned nr_samples, - bool first_provoking_vertex, - mali_ptr heap, - void *out) +GENX(pan_emit_tiler_ctx)(const struct panfrost_device *dev, unsigned fb_width, + unsigned fb_height, unsigned nr_samples, + bool first_provoking_vertex, mali_ptr heap, void *out) { - unsigned max_levels = dev->tiler_features.max_levels; - assert(max_levels >= 2); + unsigned max_levels = dev->tiler_features.max_levels; + assert(max_levels >= 2); - pan_pack(out, TILER_CONTEXT, tiler) { - /* TODO: Select hierarchy mask more effectively */ - tiler.hierarchy_mask = (max_levels >= 8) ? 0xFF : 0x28; + pan_pack(out, TILER_CONTEXT, tiler) { + /* TODO: Select hierarchy mask more effectively */ + tiler.hierarchy_mask = (max_levels >= 8) ? 0xFF : 0x28; - /* For large framebuffers, disable the smallest bin size to - * avoid pathological tiler memory usage. Required to avoid OOM - * on dEQP-GLES31.functional.fbo.no_attachments.maximums.all on - * Mali-G57. - */ - if (MAX2(fb_width, fb_height) >= 4096) - tiler.hierarchy_mask &= ~1; + /* For large framebuffers, disable the smallest bin size to + * avoid pathological tiler memory usage. Required to avoid OOM + * on dEQP-GLES31.functional.fbo.no_attachments.maximums.all on + * Mali-G57. + */ + if (MAX2(fb_width, fb_height) >= 4096) + tiler.hierarchy_mask &= ~1; - tiler.fb_width = fb_width; - tiler.fb_height = fb_height; - tiler.heap = heap; - tiler.sample_pattern = pan_sample_pattern(nr_samples); + tiler.fb_width = fb_width; + tiler.fb_height = fb_height; + tiler.heap = heap; + tiler.sample_pattern = pan_sample_pattern(nr_samples); #if PAN_ARCH >= 9 - tiler.first_provoking_vertex = first_provoking_vertex; + tiler.first_provoking_vertex = first_provoking_vertex; #endif - } + } } #endif void -GENX(pan_emit_fragment_job)(const struct pan_fb_info *fb, - mali_ptr fbd, +GENX(pan_emit_fragment_job)(const struct pan_fb_info *fb, mali_ptr fbd, void *out) { - pan_section_pack(out, FRAGMENT_JOB, HEADER, header) { - header.type = MALI_JOB_TYPE_FRAGMENT; - header.index = 1; - } + pan_section_pack(out, FRAGMENT_JOB, HEADER, header) { + header.type = MALI_JOB_TYPE_FRAGMENT; + header.index = 1; + } - pan_section_pack(out, FRAGMENT_JOB, PAYLOAD, payload) { - payload.bound_min_x = fb->extent.minx >> MALI_TILE_SHIFT; - payload.bound_min_y = fb->extent.miny >> MALI_TILE_SHIFT; - payload.bound_max_x = fb->extent.maxx >> MALI_TILE_SHIFT; - payload.bound_max_y = fb->extent.maxy >> MALI_TILE_SHIFT; - payload.framebuffer = fbd; + pan_section_pack(out, FRAGMENT_JOB, PAYLOAD, payload) { + payload.bound_min_x = fb->extent.minx >> MALI_TILE_SHIFT; + payload.bound_min_y = fb->extent.miny >> MALI_TILE_SHIFT; + payload.bound_max_x = fb->extent.maxx >> MALI_TILE_SHIFT; + payload.bound_max_y = fb->extent.maxy >> MALI_TILE_SHIFT; + payload.framebuffer = fbd; #if PAN_ARCH >= 5 - if (fb->tile_map.base) { - payload.has_tile_enable_map = true; - payload.tile_enable_map = fb->tile_map.base; - payload.tile_enable_map_row_stride = fb->tile_map.stride; - } + if (fb->tile_map.base) { + 
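A self-contained sketch of the hierarchy-mask heuristic packed in the TILER_CONTEXT above, assuming each bit of the mask enables one tiler bin-size level with bit 0 being the smallest bin (illustrative; the real limit comes from dev->tiler_features.max_levels):

static unsigned
pick_hierarchy_mask_sketch(unsigned max_levels, unsigned fb_width,
                           unsigned fb_height)
{
   /* TODO in the driver: select this more effectively */
   unsigned mask = (max_levels >= 8) ? 0xFF : 0x28;

   /* Drop the smallest bin on large framebuffers to bound tiler memory */
   unsigned max_dim = fb_width > fb_height ? fb_width : fb_height;
   if (max_dim >= 4096)
      mask &= ~1u;

   return mask;
}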
payload.has_tile_enable_map = true; + payload.tile_enable_map = fb->tile_map.base; + payload.tile_enable_map_row_stride = fb->tile_map.stride; + } #endif - } + } } diff --git a/src/panfrost/lib/pan_cs.h b/src/panfrost/lib/pan_cs.h index 8186102e5c0..c192ac52a5f 100644 --- a/src/panfrost/lib/pan_cs.h +++ b/src/panfrost/lib/pan_cs.h @@ -33,152 +33,140 @@ #include "pan_texture.h" struct pan_compute_dim { - uint32_t x, y, z; + uint32_t x, y, z; }; struct pan_fb_color_attachment { - const struct pan_image_view *view; - bool *crc_valid; - bool clear; - bool preload; - bool discard; - uint32_t clear_value[4]; + const struct pan_image_view *view; + bool *crc_valid; + bool clear; + bool preload; + bool discard; + uint32_t clear_value[4]; }; struct pan_fb_zs_attachment { - struct { - const struct pan_image_view *zs, *s; - } view; + struct { + const struct pan_image_view *zs, *s; + } view; - struct { - bool z, s; - } clear; + struct { + bool z, s; + } clear; - struct { - bool z, s; - } discard; + struct { + bool z, s; + } discard; - struct { - bool z, s; - } preload; + struct { + bool z, s; + } preload; - struct { - float depth; - uint8_t stencil; - } clear_value; + struct { + float depth; + uint8_t stencil; + } clear_value; }; struct pan_tiler_context { - union { - mali_ptr bifrost; - struct { - bool disable; - struct panfrost_bo *polygon_list; - } midgard; - }; + union { + mali_ptr bifrost; + struct { + bool disable; + struct panfrost_bo *polygon_list; + } midgard; + }; }; struct pan_tls_info { - struct { - mali_ptr ptr; - unsigned size; - } tls; + struct { + mali_ptr ptr; + unsigned size; + } tls; - struct { - unsigned instances; - mali_ptr ptr; - unsigned size; - } wls; + struct { + unsigned instances; + mali_ptr ptr; + unsigned size; + } wls; }; struct pan_fb_bifrost_info { - struct { - struct panfrost_ptr dcds; - unsigned modes[3]; - } pre_post; + struct { + struct panfrost_ptr dcds; + unsigned modes[3]; + } pre_post; }; struct pan_fb_info { - unsigned width, height; - struct { - /* Max values are inclusive */ - unsigned minx, miny, maxx, maxy; - } extent; - unsigned nr_samples; - unsigned rt_count; - struct pan_fb_color_attachment rts[8]; - struct pan_fb_zs_attachment zs; + unsigned width, height; + struct { + /* Max values are inclusive */ + unsigned minx, miny, maxx, maxy; + } extent; + unsigned nr_samples; + unsigned rt_count; + struct pan_fb_color_attachment rts[8]; + struct pan_fb_zs_attachment zs; - struct { - unsigned stride; - mali_ptr base; - } tile_map; + struct { + unsigned stride; + mali_ptr base; + } tile_map; - union { - struct pan_fb_bifrost_info bifrost; - }; + union { + struct pan_fb_bifrost_info bifrost; + }; - /* Only used on Valhall */ - bool sprite_coord_origin; - bool first_provoking_vertex; + /* Only used on Valhall */ + bool sprite_coord_origin; + bool first_provoking_vertex; }; static inline unsigned pan_wls_instances(const struct pan_compute_dim *dim) { - return util_next_power_of_two(dim->x) * - util_next_power_of_two(dim->y) * - util_next_power_of_two(dim->z); + return util_next_power_of_two(dim->x) * util_next_power_of_two(dim->y) * + util_next_power_of_two(dim->z); } static inline unsigned pan_wls_adjust_size(unsigned wls_size) { - return util_next_power_of_two(MAX2(wls_size, 128)); + return util_next_power_of_two(MAX2(wls_size, 128)); } static inline unsigned pan_wls_mem_size(const struct panfrost_device *dev, - const struct pan_compute_dim *dim, - unsigned wls_size) + const struct pan_compute_dim *dim, unsigned wls_size) { - unsigned instances = 
pan_wls_instances(dim); + unsigned instances = pan_wls_instances(dim); - return pan_wls_adjust_size(wls_size) * instances * dev->core_id_range; + return pan_wls_adjust_size(wls_size) * instances * dev->core_id_range; } #ifdef PAN_ARCH -void -GENX(pan_emit_tls)(const struct pan_tls_info *info, - void *out); +void GENX(pan_emit_tls)(const struct pan_tls_info *info, void *out); -int -GENX(pan_select_crc_rt)(const struct pan_fb_info *fb, unsigned tile_size); +int GENX(pan_select_crc_rt)(const struct pan_fb_info *fb, unsigned tile_size); -unsigned -GENX(pan_emit_fbd)(const struct panfrost_device *dev, - const struct pan_fb_info *fb, - const struct pan_tls_info *tls, - const struct pan_tiler_context *tiler_ctx, - void *out); +unsigned GENX(pan_emit_fbd)(const struct panfrost_device *dev, + const struct pan_fb_info *fb, + const struct pan_tls_info *tls, + const struct pan_tiler_context *tiler_ctx, + void *out); #if PAN_ARCH >= 6 -void -GENX(pan_emit_tiler_heap)(const struct panfrost_device *dev, - void *out); +void GENX(pan_emit_tiler_heap)(const struct panfrost_device *dev, void *out); -void -GENX(pan_emit_tiler_ctx)(const struct panfrost_device *dev, - unsigned fb_width, unsigned fb_height, - unsigned nr_samples, bool first_provoking_vertex, - mali_ptr heap, - void *out); +void GENX(pan_emit_tiler_ctx)(const struct panfrost_device *dev, + unsigned fb_width, unsigned fb_height, + unsigned nr_samples, bool first_provoking_vertex, + mali_ptr heap, void *out); #endif -void -GENX(pan_emit_fragment_job)(const struct pan_fb_info *fb, - mali_ptr fbd, - void *out); +void GENX(pan_emit_fragment_job)(const struct pan_fb_info *fb, mali_ptr fbd, + void *out); #endif /* ifdef PAN_ARCH */ #endif diff --git a/src/panfrost/lib/pan_device.h b/src/panfrost/lib/pan_device.h index 52c029bfe9f..e441f1e4bc3 100644 --- a/src/panfrost/lib/pan_device.h +++ b/src/panfrost/lib/pan_device.h @@ -32,10 +32,10 @@ #include #include "renderonly/renderonly.h" -#include "util/u_dynarray.h" #include "util/bitset.h" #include "util/list.h" #include "util/sparse_array.h" +#include "util/u_dynarray.h" #include "panfrost/util/pan_ir.h" #include "pan_pool.h" @@ -61,193 +61,185 @@ extern "C" { #define NR_BO_CACHE_BUCKETS (MAX_BO_CACHE_BUCKET - MIN_BO_CACHE_BUCKET + 1) struct pan_blitter { - struct { - struct pan_pool *pool; - struct hash_table *blit; - struct hash_table *blend; - pthread_mutex_t lock; - } shaders; - struct { - struct pan_pool *pool; - struct hash_table *rsds; - pthread_mutex_t lock; - } rsds; + struct { + struct pan_pool *pool; + struct hash_table *blit; + struct hash_table *blend; + pthread_mutex_t lock; + } shaders; + struct { + struct pan_pool *pool; + struct hash_table *rsds; + pthread_mutex_t lock; + } rsds; }; struct pan_blend_shaders { - struct hash_table *shaders; - pthread_mutex_t lock; + struct hash_table *shaders; + pthread_mutex_t lock; }; struct pan_indirect_dispatch { - struct panfrost_ubo_push push; - struct panfrost_bo *bin; - struct panfrost_bo *descs; + struct panfrost_ubo_push push; + struct panfrost_bo *bin; + struct panfrost_bo *descs; }; /** Implementation-defined tiler features */ struct panfrost_tiler_features { - /** Number of bytes per tiler bin */ - unsigned bin_size; + /** Number of bytes per tiler bin */ + unsigned bin_size; - /** Maximum number of levels that may be simultaneously enabled. - * Invariant: bitcount(hierarchy_mask) <= max_levels */ - unsigned max_levels; + /** Maximum number of levels that may be simultaneously enabled. 
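A runnable sketch of the workgroup-local-storage sizing math from pan_cs.h above: each dispatch dimension is rounded up to a power of two, the per-instance size is rounded up to a power of two of at least 128 bytes, and the total scales with the device's core ID range. For an 8x8x1 dispatch, 200 bytes of WLS and a core ID range of 10, this gives 256 * 64 * 10 = 163840 bytes.

#include <stdint.h>

static uint32_t
next_pow2_sketch(uint32_t x)
{
   uint32_t p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

static uint64_t
wls_total_bytes_sketch(uint32_t dim_x, uint32_t dim_y, uint32_t dim_z,
                       uint32_t wls_size, uint32_t core_id_range)
{
   uint64_t instances = (uint64_t)next_pow2_sketch(dim_x) *
                        next_pow2_sketch(dim_y) * next_pow2_sketch(dim_z);
   uint32_t adjusted = next_pow2_sketch(wls_size < 128 ? 128 : wls_size);

   return (uint64_t)adjusted * instances * core_id_range;
}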
+ * Invariant: bitcount(hierarchy_mask) <= max_levels */ + unsigned max_levels; }; struct panfrost_model { - /* GPU ID */ - uint32_t gpu_id; + /* GPU ID */ + uint32_t gpu_id; - /* Marketing name for the GPU, used as the GL_RENDERER */ - const char *name; + /* Marketing name for the GPU, used as the GL_RENDERER */ + const char *name; - /* Set of associated performance counters */ - const char *performance_counters; + /* Set of associated performance counters */ + const char *performance_counters; - /* Minimum GPU revision required for anisotropic filtering. ~0 and 0 - * means "no revisions support anisotropy" and "all revisions support - * anistropy" respectively -- so checking for anisotropy is simply - * comparing the reivsion. - */ - uint32_t min_rev_anisotropic; + /* Minimum GPU revision required for anisotropic filtering. ~0 and 0 + * means "no revisions support anisotropy" and "all revisions support + * anistropy" respectively -- so checking for anisotropy is simply + * comparing the reivsion. + */ + uint32_t min_rev_anisotropic; - /* Default tilebuffer size in bytes for the model. */ - unsigned tilebuffer_size; + /* Default tilebuffer size in bytes for the model. */ + unsigned tilebuffer_size; - struct { - /* The GPU lacks the capability for hierarchical tiling, without - * an "Advanced Tiling Unit", instead requiring a single bin - * size for the entire framebuffer be selected by the driver - */ - bool no_hierarchical_tiling; - } quirks; + struct { + /* The GPU lacks the capability for hierarchical tiling, without + * an "Advanced Tiling Unit", instead requiring a single bin + * size for the entire framebuffer be selected by the driver + */ + bool no_hierarchical_tiling; + } quirks; }; struct panfrost_device { - /* For ralloc */ - void *memctx; + /* For ralloc */ + void *memctx; - int fd; + int fd; - /* Properties of the GPU in use */ - unsigned arch; - unsigned gpu_id; - unsigned revision; + /* Properties of the GPU in use */ + unsigned arch; + unsigned gpu_id; + unsigned revision; - /* Number of shader cores */ - unsigned core_count; + /* Number of shader cores */ + unsigned core_count; - /* Range of core IDs, equal to the maximum core ID + 1. Satisfies - * core_id_range >= core_count. - */ - unsigned core_id_range; + /* Range of core IDs, equal to the maximum core ID + 1. Satisfies + * core_id_range >= core_count. + */ + unsigned core_id_range; - /* Maximum tilebuffer size in bytes for optimal performance. */ - unsigned optimal_tib_size; + /* Maximum tilebuffer size in bytes for optimal performance. 
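The min_rev_anisotropic convention described in the struct comment above reduces to a single comparison; a hedged sketch (the field and its semantics are as documented, the helper name is made up):

#include <stdbool.h>
#include <stdint.h>

static bool
supports_anisotropic_sketch(uint32_t gpu_revision,
                            uint32_t min_rev_anisotropic)
{
   /* ~0 => no revision qualifies; 0 => every revision qualifies */
   return gpu_revision >= min_rev_anisotropic;
}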
*/ + unsigned optimal_tib_size; - unsigned thread_tls_alloc; - struct panfrost_tiler_features tiler_features; - const struct panfrost_model *model; - bool has_afbc; + unsigned thread_tls_alloc; + struct panfrost_tiler_features tiler_features; + const struct panfrost_model *model; + bool has_afbc; - /* Table of formats, indexed by a PIPE format */ - const struct panfrost_format *formats; + /* Table of formats, indexed by a PIPE format */ + const struct panfrost_format *formats; - /* Bitmask of supported compressed texture formats */ - uint32_t compressed_formats; + /* Bitmask of supported compressed texture formats */ + uint32_t compressed_formats; - /* debug flags, see pan_util.h how to interpret */ - unsigned debug; + /* debug flags, see pan_util.h how to interpret */ + unsigned debug; - drmVersionPtr kernel_version; + drmVersionPtr kernel_version; - struct renderonly *ro; + struct renderonly *ro; - pthread_mutex_t bo_map_lock; - struct util_sparse_array bo_map; + pthread_mutex_t bo_map_lock; + struct util_sparse_array bo_map; - struct { - pthread_mutex_t lock; + struct { + pthread_mutex_t lock; - /* List containing all cached BOs sorted in LRU (Least - * Recently Used) order. This allows us to quickly evict BOs - * that are more than 1 second old. - */ - struct list_head lru; + /* List containing all cached BOs sorted in LRU (Least + * Recently Used) order. This allows us to quickly evict BOs + * that are more than 1 second old. + */ + struct list_head lru; - /* The BO cache is a set of buckets with power-of-two sizes - * ranging from 2^12 (4096, the page size) to - * 2^(12 + MAX_BO_CACHE_BUCKETS). - * Each bucket is a linked list of free panfrost_bo objects. */ + /* The BO cache is a set of buckets with power-of-two sizes + * ranging from 2^12 (4096, the page size) to + * 2^(12 + MAX_BO_CACHE_BUCKETS). + * Each bucket is a linked list of free panfrost_bo objects. */ - struct list_head buckets[NR_BO_CACHE_BUCKETS]; - } bo_cache; + struct list_head buckets[NR_BO_CACHE_BUCKETS]; + } bo_cache; - struct pan_blitter blitter; - struct pan_blend_shaders blend_shaders; - struct pan_indirect_dispatch indirect_dispatch; + struct pan_blitter blitter; + struct pan_blend_shaders blend_shaders; + struct pan_indirect_dispatch indirect_dispatch; - /* Tiler heap shared across all tiler jobs, allocated against the - * device since there's only a single tiler. Since this is invisible to - * the CPU, it's okay for multiple contexts to reference it - * simultaneously; by keeping on the device struct, we eliminate a - * costly per-context allocation. */ + /* Tiler heap shared across all tiler jobs, allocated against the + * device since there's only a single tiler. Since this is invisible to + * the CPU, it's okay for multiple contexts to reference it + * simultaneously; by keeping on the device struct, we eliminate a + * costly per-context allocation. */ - struct panfrost_bo *tiler_heap; + struct panfrost_bo *tiler_heap; - /* The tiler heap is shared by all contexts, and is written by tiler - * jobs and read by fragment job. We need to ensure that a - * vertex/tiler job chain from one context is not inserted between - * the vertex/tiler and fragment job of another context, otherwise - * we end up with tiler heap corruption. - */ - pthread_mutex_t submit_lock; + /* The tiler heap is shared by all contexts, and is written by tiler + * jobs and read by fragment job. 
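A sketch of mapping an allocation size to one of the power-of-two BO cache buckets described above, assuming the smallest bucket is 2^12 bytes as the comment states (the driver's own helper and its clamping behaviour may differ):

#include <stddef.h>

static unsigned
bo_cache_bucket_index_sketch(size_t size)
{
   unsigned log2_size = 12; /* smallest bucket: 4096-byte pages */

   while (((size_t)1 << log2_size) < size)
      log2_size++;

   return log2_size - 12; /* index into bo_cache.buckets[] */
}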
We need to ensure that a + * vertex/tiler job chain from one context is not inserted between + * the vertex/tiler and fragment job of another context, otherwise + * we end up with tiler heap corruption. + */ + pthread_mutex_t submit_lock; - /* Sample positions are preloaded into a write-once constant buffer, - * such that they can be referenced fore free later. Needed - * unconditionally on Bifrost, and useful for sharing with Midgard */ + /* Sample positions are preloaded into a write-once constant buffer, + * such that they can be referenced fore free later. Needed + * unconditionally on Bifrost, and useful for sharing with Midgard */ - struct panfrost_bo *sample_positions; + struct panfrost_bo *sample_positions; }; -void -panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev); +void panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev); -void -panfrost_close_device(struct panfrost_device *dev); +void panfrost_close_device(struct panfrost_device *dev); -bool -panfrost_supports_compressed_format(struct panfrost_device *dev, unsigned fmt); +bool panfrost_supports_compressed_format(struct panfrost_device *dev, + unsigned fmt); -void -panfrost_upload_sample_positions(struct panfrost_device *dev); +void panfrost_upload_sample_positions(struct panfrost_device *dev); -mali_ptr -panfrost_sample_positions(const struct panfrost_device *dev, - enum mali_sample_pattern pattern); -void -panfrost_query_sample_position( - enum mali_sample_pattern pattern, - unsigned sample_idx, - float *out); +mali_ptr panfrost_sample_positions(const struct panfrost_device *dev, + enum mali_sample_pattern pattern); +void panfrost_query_sample_position(enum mali_sample_pattern pattern, + unsigned sample_idx, float *out); -unsigned -panfrost_query_l2_slices(const struct panfrost_device *dev); +unsigned panfrost_query_l2_slices(const struct panfrost_device *dev); static inline struct panfrost_bo * pan_lookup_bo(struct panfrost_device *dev, uint32_t gem_handle) { - return (struct panfrost_bo *)util_sparse_array_get(&dev->bo_map, gem_handle); + return (struct panfrost_bo *)util_sparse_array_get(&dev->bo_map, gem_handle); } static inline bool pan_is_bifrost(const struct panfrost_device *dev) { - return dev->arch >= 6 && dev->arch <= 7; + return dev->arch >= 6 && dev->arch <= 7; } -const struct panfrost_model * panfrost_get_model(uint32_t gpu_id); +const struct panfrost_model *panfrost_get_model(uint32_t gpu_id); #if defined(__cplusplus) } // extern "C" diff --git a/src/panfrost/lib/pan_earlyzs.c b/src/panfrost/lib/pan_earlyzs.c index da27cf906ad..d3c82c1dc55 100644 --- a/src/panfrost/lib/pan_earlyzs.c +++ b/src/panfrost/lib/pan_earlyzs.c @@ -32,10 +32,10 @@ static enum pan_earlyzs best_early_mode(bool zs_always_passes) { - if (zs_always_passes) - return PAN_EARLYZS_WEAK_EARLY; - else - return PAN_EARLYZS_FORCE_EARLY; + if (zs_always_passes) + return PAN_EARLYZS_WEAK_EARLY; + else + return PAN_EARLYZS_FORCE_EARLY; } /* @@ -44,59 +44,56 @@ best_early_mode(bool zs_always_passes) * lookup table, synchronized with pan_earlyzs_get. */ static struct pan_earlyzs_state -analyze(const struct pan_shader_info *s, - bool writes_zs_or_oq, - bool alpha_to_coverage, - bool zs_always_passes) +analyze(const struct pan_shader_info *s, bool writes_zs_or_oq, + bool alpha_to_coverage, bool zs_always_passes) { - /* If the shader writes depth or stencil, all depth/stencil tests must - * be deferred until the value is known after the ZS_EMIT instruction, - * if present. 
ZS_EMIT must precede ATEST, so the value is known when - * ATEST executes, justifying the late test/update. - */ - bool shader_writes_zs = (s->fs.writes_depth || s->fs.writes_stencil); - bool late_update = shader_writes_zs; - bool late_kill = shader_writes_zs; + /* If the shader writes depth or stencil, all depth/stencil tests must + * be deferred until the value is known after the ZS_EMIT instruction, + * if present. ZS_EMIT must precede ATEST, so the value is known when + * ATEST executes, justifying the late test/update. + */ + bool shader_writes_zs = (s->fs.writes_depth || s->fs.writes_stencil); + bool late_update = shader_writes_zs; + bool late_kill = shader_writes_zs; - /* Late coverage updates are required if the coverage mask depends on - * the results of the shader. Discards are implemented as coverage mask - * updates and must be considered. Strictly, depth/stencil writes may - * also update the coverage mask, but these already force late updates. - */ - bool late_coverage = s->fs.writes_coverage || - s->fs.can_discard || - alpha_to_coverage; + /* Late coverage updates are required if the coverage mask depends on + * the results of the shader. Discards are implemented as coverage mask + * updates and must be considered. Strictly, depth/stencil writes may + * also update the coverage mask, but these already force late updates. + */ + bool late_coverage = + s->fs.writes_coverage || s->fs.can_discard || alpha_to_coverage; - /* Late coverage mask updates may affect the value written to the - * depth/stencil buffer (if a pixel is discarded entirely). However, - * they do not affect depth/stencil testing. So they may only matter if - * depth or stencil is written. - * - * That dependency does mean late coverage mask updates require late - * depth/stencil updates. - * - * Similarly, occlusion queries count samples that pass the - * depth/stencil tests, so occlusion queries with late coverage also - * require a late update. - */ - late_update |= (late_coverage && writes_zs_or_oq); + /* Late coverage mask updates may affect the value written to the + * depth/stencil buffer (if a pixel is discarded entirely). However, + * they do not affect depth/stencil testing. So they may only matter if + * depth or stencil is written. + * + * That dependency does mean late coverage mask updates require late + * depth/stencil updates. + * + * Similarly, occlusion queries count samples that pass the + * depth/stencil tests, so occlusion queries with late coverage also + * require a late update. + */ + late_update |= (late_coverage && writes_zs_or_oq); - /* Side effects require late depth/stencil tests to ensure the shader - * isn't killed before the side effects execute. - */ - late_kill |= s->writes_global; + /* Side effects require late depth/stencil tests to ensure the shader + * isn't killed before the side effects execute. + */ + late_kill |= s->writes_global; - /* Finally, the shader may override and force early fragment tests */ - late_update &= !s->fs.early_fragment_tests; - late_kill &= !s->fs.early_fragment_tests; + /* Finally, the shader may override and force early fragment tests */ + late_update &= !s->fs.early_fragment_tests; + late_kill &= !s->fs.early_fragment_tests; - /* Collect results */ - enum pan_earlyzs early_mode = best_early_mode(zs_always_passes); + /* Collect results */ + enum pan_earlyzs early_mode = best_early_mode(zs_always_passes); - return (struct pan_earlyzs_state) { - .update = late_update ? PAN_EARLYZS_FORCE_LATE : early_mode, - .kill = late_kill ? 
PAN_EARLYZS_FORCE_LATE : early_mode, - }; + return (struct pan_earlyzs_state){ + .update = late_update ? PAN_EARLYZS_FORCE_LATE : early_mode, + .kill = late_kill ? PAN_EARLYZS_FORCE_LATE : early_mode, + }; } /* @@ -106,14 +103,14 @@ analyze(const struct pan_shader_info *s, struct pan_earlyzs_lut pan_earlyzs_analyze(const struct pan_shader_info *s) { - struct pan_earlyzs_lut lut; + struct pan_earlyzs_lut lut; - for (unsigned v0 = 0; v0 < 2; ++v0) { - for (unsigned v1 = 0; v1 < 2; ++v1) { - for (unsigned v2 = 0; v2 < 2; ++v2) - lut.states[v0][v1][v2] = analyze(s, v0, v1, v2); - } - } + for (unsigned v0 = 0; v0 < 2; ++v0) { + for (unsigned v1 = 0; v1 < 2; ++v1) { + for (unsigned v2 = 0; v2 < 2; ++v2) + lut.states[v0][v1][v2] = analyze(s, v0, v1, v2); + } + } - return lut; + return lut; } diff --git a/src/panfrost/lib/pan_earlyzs.h b/src/panfrost/lib/pan_earlyzs.h index f0a0af496c9..fdc626b00aa 100644 --- a/src/panfrost/lib/pan_earlyzs.h +++ b/src/panfrost/lib/pan_earlyzs.h @@ -32,21 +32,21 @@ extern "C" { /* Matches hardware Pixel Kill enum on Bifrost and Valhall */ enum pan_earlyzs { - PAN_EARLYZS_FORCE_EARLY = 0, - PAN_EARLYZS_WEAK_EARLY = 2, - PAN_EARLYZS_FORCE_LATE = 3 + PAN_EARLYZS_FORCE_EARLY = 0, + PAN_EARLYZS_WEAK_EARLY = 2, + PAN_EARLYZS_FORCE_LATE = 3 }; /* Early-ZS pair. */ struct pan_earlyzs_state { - /* Z/S test and update */ - enum pan_earlyzs update : 2; + /* Z/S test and update */ + enum pan_earlyzs update : 2; - /* Pixel kill */ - enum pan_earlyzs kill : 2; + /* Pixel kill */ + enum pan_earlyzs kill : 2; - /* So it fits in a byte */ - unsigned padding : 4; + /* So it fits in a byte */ + unsigned padding : 4; }; /* Internal lookup table. Users should treat as an opaque structure and only @@ -54,7 +54,7 @@ struct pan_earlyzs_state { * for definition of the arrays. */ struct pan_earlyzs_lut { - struct pan_earlyzs_state states[2][2][2]; + struct pan_earlyzs_state states[2][2][2]; }; /* @@ -62,11 +62,10 @@ struct pan_earlyzs_lut { * defined inline in the header. 
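The decision rules in analyze() above boil down to a few booleans; here is a self-contained restatement with plain flags in place of the shader-info struct, showing that a discarding shader drawn while an occlusion query is active gets a late update but can keep an early pixel kill:

#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
   bool shader_writes_zs = false;      /* no depth/stencil export */
   bool late_coverage = true;          /* discard or alpha-to-coverage */
   bool writes_zs_or_oq = true;        /* occlusion query active */
   bool writes_global = false;         /* no side effects */
   bool early_fragment_tests = false;  /* no layout(early_fragment_tests) */

   bool late_update = shader_writes_zs || (late_coverage && writes_zs_or_oq);
   bool late_kill = shader_writes_zs || writes_global;

   late_update &= !early_fragment_tests;
   late_kill &= !early_fragment_tests;

   printf("update=%s kill=%s\n", late_update ? "late" : "early",
          late_kill ? "late" : "early");
   return 0;
}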
*/ static inline struct pan_earlyzs_state -pan_earlyzs_get(struct pan_earlyzs_lut lut, - bool writes_zs_or_oq, bool alpha_to_coverage, - bool zs_always_passes) +pan_earlyzs_get(struct pan_earlyzs_lut lut, bool writes_zs_or_oq, + bool alpha_to_coverage, bool zs_always_passes) { - return lut.states[writes_zs_or_oq][alpha_to_coverage][zs_always_passes]; + return lut.states[writes_zs_or_oq][alpha_to_coverage][zs_always_passes]; } struct pan_shader_info; diff --git a/src/panfrost/lib/pan_encoder.h b/src/panfrost/lib/pan_encoder.h index 68349996cde..8a3018b763c 100644 --- a/src/panfrost/lib/pan_encoder.h +++ b/src/panfrost/lib/pan_encoder.h @@ -30,23 +30,21 @@ #include "util/macros.h" #include +#include "genxml/gen_macros.h" #include "util/format/u_format.h" #include "pan_bo.h" -#include "genxml/gen_macros.h" #include "pan_device.h" /* Tiler structure size computation */ -unsigned -panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask, bool hierarchy); +unsigned panfrost_tiler_header_size(unsigned width, unsigned height, + unsigned mask, bool hierarchy); -unsigned -panfrost_tiler_full_size(unsigned width, unsigned height, unsigned mask, bool hierarchy); +unsigned panfrost_tiler_full_size(unsigned width, unsigned height, + unsigned mask, bool hierarchy); -unsigned -panfrost_choose_hierarchy_mask( - unsigned width, unsigned height, - unsigned vertex_count, bool hierarchy); +unsigned panfrost_choose_hierarchy_mask(unsigned width, unsigned height, + unsigned vertex_count, bool hierarchy); #if defined(PAN_ARCH) && PAN_ARCH <= 5 static inline unsigned @@ -54,36 +52,34 @@ panfrost_tiler_get_polygon_list_size(const struct panfrost_device *dev, unsigned fb_width, unsigned fb_height, bool has_draws) { - if (!has_draws) - return MALI_MIDGARD_TILER_MINIMUM_HEADER_SIZE + 4; + if (!has_draws) + return MALI_MIDGARD_TILER_MINIMUM_HEADER_SIZE + 4; - bool hierarchy = !dev->model->quirks.no_hierarchical_tiling; - unsigned hierarchy_mask = - panfrost_choose_hierarchy_mask(fb_width, fb_height, 1, hierarchy); + bool hierarchy = !dev->model->quirks.no_hierarchical_tiling; + unsigned hierarchy_mask = + panfrost_choose_hierarchy_mask(fb_width, fb_height, 1, hierarchy); - return panfrost_tiler_full_size(fb_width, fb_height, hierarchy_mask, hierarchy) + - panfrost_tiler_header_size(fb_width, fb_height, hierarchy_mask, hierarchy); + return panfrost_tiler_full_size(fb_width, fb_height, hierarchy_mask, + hierarchy) + + panfrost_tiler_header_size(fb_width, fb_height, hierarchy_mask, + hierarchy); } #endif /* Stack sizes */ -unsigned -panfrost_get_stack_shift(unsigned stack_size); +unsigned panfrost_get_stack_shift(unsigned stack_size); -unsigned -panfrost_get_total_stack_size( - unsigned thread_size, - unsigned threads_per_core, - unsigned core_id_range); +unsigned panfrost_get_total_stack_size(unsigned thread_size, + unsigned threads_per_core, + unsigned core_id_range); /* Attributes / instancing */ -unsigned -panfrost_padded_vertex_count(unsigned vertex_count); +unsigned panfrost_padded_vertex_count(unsigned vertex_count); -unsigned -panfrost_compute_magic_divisor(unsigned hw_divisor, unsigned *o_shift, unsigned *extra_flags); +unsigned panfrost_compute_magic_divisor(unsigned hw_divisor, unsigned *o_shift, + unsigned *extra_flags); #ifdef PAN_ARCH /* Records for gl_VertexID and gl_InstanceID use special encodings on Midgard */ @@ -91,41 +87,38 @@ panfrost_compute_magic_divisor(unsigned hw_divisor, unsigned *o_shift, unsigned #if PAN_ARCH <= 5 static inline void panfrost_vertex_id(unsigned padded_count, - 
struct mali_attribute_buffer_packed *attr, - bool instanced) + struct mali_attribute_buffer_packed *attr, bool instanced) { - pan_pack(attr, ATTRIBUTE_VERTEX_ID, cfg) { - if (instanced) { - cfg.divisor_r = __builtin_ctz(padded_count); - cfg.divisor_p = padded_count >> (cfg.divisor_r + 1); - } else { - /* Large values so the modulo is a no-op */ - cfg.divisor_r = 0x1F; - cfg.divisor_p = 0x4; - } - } + pan_pack(attr, ATTRIBUTE_VERTEX_ID, cfg) { + if (instanced) { + cfg.divisor_r = __builtin_ctz(padded_count); + cfg.divisor_p = padded_count >> (cfg.divisor_r + 1); + } else { + /* Large values so the modulo is a no-op */ + cfg.divisor_r = 0x1F; + cfg.divisor_p = 0x4; + } + } } static inline void panfrost_instance_id(unsigned padded_count, - struct mali_attribute_buffer_packed *attr, - bool instanced) + struct mali_attribute_buffer_packed *attr, bool instanced) { - pan_pack(attr, ATTRIBUTE_INSTANCE_ID, cfg) { - if (!instanced || padded_count <= 1) { - /* Divide by large number to force to 0 */ - cfg.divisor_p = ((1u << 31) - 1); - cfg.divisor_r = 0x1F; - cfg.divisor_e = 0x1; - } else if(util_is_power_of_two_or_zero(padded_count)) { - /* Can't underflow since padded_count >= 2 */ - cfg.divisor_r = __builtin_ctz(padded_count) - 1; - } else { - cfg.divisor_p = - panfrost_compute_magic_divisor(padded_count, - &cfg.divisor_r, &cfg.divisor_e); - } - } + pan_pack(attr, ATTRIBUTE_INSTANCE_ID, cfg) { + if (!instanced || padded_count <= 1) { + /* Divide by large number to force to 0 */ + cfg.divisor_p = ((1u << 31) - 1); + cfg.divisor_r = 0x1F; + cfg.divisor_e = 0x1; + } else if (util_is_power_of_two_or_zero(padded_count)) { + /* Can't underflow since padded_count >= 2 */ + cfg.divisor_r = __builtin_ctz(padded_count) - 1; + } else { + cfg.divisor_p = panfrost_compute_magic_divisor( + padded_count, &cfg.divisor_r, &cfg.divisor_e); + } + } } #endif /* PAN_ARCH <= 5 */ @@ -135,14 +128,18 @@ panfrost_instance_id(unsigned padded_count, static inline enum mali_func panfrost_flip_compare_func(enum mali_func f) { - switch (f) { - case MALI_FUNC_LESS: return MALI_FUNC_GREATER; - case MALI_FUNC_GREATER: return MALI_FUNC_LESS; - case MALI_FUNC_LEQUAL: return MALI_FUNC_GEQUAL; - case MALI_FUNC_GEQUAL: return MALI_FUNC_LEQUAL; - default: return f; - } - + switch (f) { + case MALI_FUNC_LESS: + return MALI_FUNC_GREATER; + case MALI_FUNC_GREATER: + return MALI_FUNC_LESS; + case MALI_FUNC_LEQUAL: + return MALI_FUNC_GEQUAL; + case MALI_FUNC_GEQUAL: + return MALI_FUNC_LEQUAL; + default: + return f; + } } #if PAN_ARCH <= 7 @@ -152,59 +149,59 @@ panfrost_flip_compare_func(enum mali_func f) * together in a dynamic bitfield, packed by this routine. */ static inline void -panfrost_pack_work_groups_compute( - struct mali_invocation_packed *out, - unsigned num_x, unsigned num_y, unsigned num_z, - unsigned size_x, unsigned size_y, unsigned size_z, - bool quirk_graphics, bool indirect_dispatch) +panfrost_pack_work_groups_compute(struct mali_invocation_packed *out, + unsigned num_x, unsigned num_y, + unsigned num_z, unsigned size_x, + unsigned size_y, unsigned size_z, + bool quirk_graphics, bool indirect_dispatch) { - /* The values needing packing, in order, and the corresponding shifts. - * Indicies into shift are off-by-one to make the logic easier */ + /* The values needing packing, in order, and the corresponding shifts. 
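The ATTRIBUTE_VERTEX_ID encoding in the instanced path above stores padded_count so that padded_count == (2 * divisor_p + 1) << divisor_r. A worked sketch (for padded_count = 12: divisor_r = 2, divisor_p = 1, and (2*1 + 1) << 2 == 12):

static void
encode_vertex_id_divisor_sketch(unsigned padded_count,
                                unsigned *divisor_r, unsigned *divisor_p)
{
   /* padded_count = odd * 2^r, so r is the number of trailing zeros and
    * p is (odd - 1) / 2; padded_count must be nonzero */
   *divisor_r = __builtin_ctz(padded_count);
   *divisor_p = padded_count >> (*divisor_r + 1);
}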
+ * Indicies into shift are off-by-one to make the logic easier */ - unsigned values[6] = { size_x, size_y, size_z, num_x, num_y, num_z }; - unsigned shifts[7] = { 0 }; - uint32_t packed = 0; + unsigned values[6] = {size_x, size_y, size_z, num_x, num_y, num_z}; + unsigned shifts[7] = {0}; + uint32_t packed = 0; - for (unsigned i = 0; i < 6; ++i) { - /* Must be positive, otherwise we underflow */ - assert(values[i] >= 1); + for (unsigned i = 0; i < 6; ++i) { + /* Must be positive, otherwise we underflow */ + assert(values[i] >= 1); - /* OR it in, shifting as required */ - packed |= ((values[i] - 1) << shifts[i]); + /* OR it in, shifting as required */ + packed |= ((values[i] - 1) << shifts[i]); - /* How many bits did we use? */ - unsigned bit_count = util_logbase2_ceil(values[i]); + /* How many bits did we use? */ + unsigned bit_count = util_logbase2_ceil(values[i]); - /* Set the next shift accordingly */ - shifts[i + 1] = shifts[i] + bit_count; - } + /* Set the next shift accordingly */ + shifts[i + 1] = shifts[i] + bit_count; + } - pan_pack(out, INVOCATION, cfg) { - cfg.invocations = packed; - cfg.size_y_shift = shifts[1]; - cfg.size_z_shift = shifts[2]; - cfg.workgroups_x_shift = shifts[3]; + pan_pack(out, INVOCATION, cfg) { + cfg.invocations = packed; + cfg.size_y_shift = shifts[1]; + cfg.size_z_shift = shifts[2]; + cfg.workgroups_x_shift = shifts[3]; - if (!indirect_dispatch) { - /* Leave zero for the dispatch shader */ - cfg.workgroups_y_shift = shifts[4]; - cfg.workgroups_z_shift = shifts[5]; - } + if (!indirect_dispatch) { + /* Leave zero for the dispatch shader */ + cfg.workgroups_y_shift = shifts[4]; + cfg.workgroups_z_shift = shifts[5]; + } - /* Quirk: for non-instanced graphics, the blob sets - * workgroups_z_shift = 32. This doesn't appear to matter to - * the hardware, but it's good to be bit-identical. */ + /* Quirk: for non-instanced graphics, the blob sets + * workgroups_z_shift = 32. This doesn't appear to matter to + * the hardware, but it's good to be bit-identical. */ - if (quirk_graphics && (num_z <= 1)) - cfg.workgroups_z_shift = 32; + if (quirk_graphics && (num_z <= 1)) + cfg.workgroups_z_shift = 32; - /* For graphics, set to the minimum efficient value. For - * compute, must equal the workgroup X shift for barriers to - * function correctly */ + /* For graphics, set to the minimum efficient value. For + * compute, must equal the workgroup X shift for barriers to + * function correctly */ - cfg.thread_group_split = quirk_graphics ? - MALI_SPLIT_MIN_EFFICIENT : cfg.workgroups_x_shift; - } + cfg.thread_group_split = + quirk_graphics ? 
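A standalone sketch of the invocation packing being emitted above: each of the six values is stored as (value - 1) in ceil(log2(value)) bits, and each field's shift starts where the previous one ended. For an 8x8x1 workgroup size with 4x4x1 workgroups, the resulting shifts are size_y = 3, size_z = 6, workgroups_x = 6, workgroups_y = 8, workgroups_z = 10.

#include <stdint.h>

static uint32_t
pack_invocations_sketch(const unsigned values[6], unsigned shifts[7])
{
   uint32_t packed = 0;

   shifts[0] = 0;
   for (unsigned i = 0; i < 6; ++i) {
      /* ceil(log2(values[i])), with a value of 1 taking zero bits */
      unsigned bits = 0;
      while ((1u << bits) < values[i])
         bits++;

      packed |= (uint32_t)(values[i] - 1) << shifts[i];
      shifts[i + 1] = shifts[i] + bits;
   }

   return packed;
}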
MALI_SPLIT_MIN_EFFICIENT : cfg.workgroups_x_shift; + } } #endif @@ -213,19 +210,19 @@ panfrost_pack_work_groups_compute( static inline enum mali_z_internal_format panfrost_get_z_internal_format(enum pipe_format fmt) { - switch (fmt) { - case PIPE_FORMAT_Z16_UNORM: - case PIPE_FORMAT_Z16_UNORM_S8_UINT: - return MALI_Z_INTERNAL_FORMAT_D16; - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - case PIPE_FORMAT_Z24X8_UNORM: - return MALI_Z_INTERNAL_FORMAT_D24; - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - return MALI_Z_INTERNAL_FORMAT_D32; - default: - unreachable("Unsupported depth/stencil format."); - } + switch (fmt) { + case PIPE_FORMAT_Z16_UNORM: + case PIPE_FORMAT_Z16_UNORM_S8_UINT: + return MALI_Z_INTERNAL_FORMAT_D16; + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_Z24X8_UNORM: + return MALI_Z_INTERNAL_FORMAT_D24; + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return MALI_Z_INTERNAL_FORMAT_D32; + default: + unreachable("Unsupported depth/stencil format."); + } } #endif @@ -236,13 +233,13 @@ static inline void panfrost_make_resource_table(struct panfrost_ptr base, unsigned index, mali_ptr address, unsigned resource_count) { - if (resource_count == 0) - return; + if (resource_count == 0) + return; - pan_pack(base.cpu + index * pan_size(RESOURCE), RESOURCE, cfg) { - cfg.address = address; - cfg.size = resource_count * pan_size(BUFFER); - } + pan_pack(base.cpu + index * pan_size(RESOURCE), RESOURCE, cfg) { + cfg.address = address; + cfg.size = resource_count * pan_size(BUFFER); + } } #endif diff --git a/src/panfrost/lib/pan_format.c b/src/panfrost/lib/pan_format.c index a2ffd1214d6..1b4007585b8 100644 --- a/src/panfrost/lib/pan_format.c +++ b/src/panfrost/lib/pan_format.c @@ -24,86 +24,90 @@ * Alyssa Rosenzweig */ -#include "genxml/gen_macros.h" #include "pan_format.h" +#include "genxml/gen_macros.h" #include "util/format/u_format.h" /* Convenience */ -#define MALI_BLEND_AU_R8G8B8A8 (MALI_RGBA8_TB << 12) -#define MALI_BLEND_PU_R8G8B8A8 (MALI_RGBA8_TB << 12) +#define MALI_BLEND_AU_R8G8B8A8 (MALI_RGBA8_TB << 12) +#define MALI_BLEND_PU_R8G8B8A8 (MALI_RGBA8_TB << 12) #define MALI_BLEND_AU_R10G10B10A2 (MALI_RGB10_A2_TB << 12) #define MALI_BLEND_PU_R10G10B10A2 (MALI_RGB10_A2_TB << 12) -#define MALI_BLEND_AU_R8G8B8A2 (MALI_RGB8_A2_AU << 12) -#define MALI_BLEND_PU_R8G8B8A2 (MALI_RGB8_A2_PU << 12) -#define MALI_BLEND_AU_R4G4B4A4 (MALI_RGBA4_AU << 12) -#define MALI_BLEND_PU_R4G4B4A4 (MALI_RGBA4_PU << 12) -#define MALI_BLEND_AU_R5G6B5A0 (MALI_R5G6B5_AU << 12) -#define MALI_BLEND_PU_R5G6B5A0 (MALI_R5G6B5_PU << 12) -#define MALI_BLEND_AU_R5G5B5A1 (MALI_RGB5_A1_AU << 12) -#define MALI_BLEND_PU_R5G5B5A1 (MALI_RGB5_A1_PU << 12) +#define MALI_BLEND_AU_R8G8B8A2 (MALI_RGB8_A2_AU << 12) +#define MALI_BLEND_PU_R8G8B8A2 (MALI_RGB8_A2_PU << 12) +#define MALI_BLEND_AU_R4G4B4A4 (MALI_RGBA4_AU << 12) +#define MALI_BLEND_PU_R4G4B4A4 (MALI_RGBA4_PU << 12) +#define MALI_BLEND_AU_R5G6B5A0 (MALI_R5G6B5_AU << 12) +#define MALI_BLEND_PU_R5G6B5A0 (MALI_R5G6B5_PU << 12) +#define MALI_BLEND_AU_R5G5B5A1 (MALI_RGB5_A1_AU << 12) +#define MALI_BLEND_PU_R5G5B5A1 (MALI_RGB5_A1_PU << 12) #if PAN_ARCH <= 6 -#define BFMT2(pipe, internal, writeback, srgb) \ - [PIPE_FORMAT_##pipe] = { \ - MALI_COLOR_BUFFER_INTERNAL_FORMAT_## internal, \ - MALI_COLOR_FORMAT_## writeback, \ - { MALI_BLEND_PU_ ## internal | (srgb ? (1 << 20) : 0) | \ - PAN_V6_SWIZZLE(R, G, B, A), \ - MALI_BLEND_AU_ ## internal | (srgb ? 
(1 << 20) : 0) | \ - PAN_V6_SWIZZLE(R, G, B, A), }, \ - } +#define BFMT2(pipe, internal, writeback, srgb) \ + [PIPE_FORMAT_##pipe] = { \ + MALI_COLOR_BUFFER_INTERNAL_FORMAT_##internal, \ + MALI_COLOR_FORMAT_##writeback, \ + { \ + MALI_BLEND_PU_##internal | (srgb ? (1 << 20) : 0) | \ + PAN_V6_SWIZZLE(R, G, B, A), \ + MALI_BLEND_AU_##internal | (srgb ? (1 << 20) : 0) | \ + PAN_V6_SWIZZLE(R, G, B, A), \ + }, \ + } #else -#define BFMT2(pipe, internal, writeback, srgb) \ - [PIPE_FORMAT_##pipe] = { \ - MALI_COLOR_BUFFER_INTERNAL_FORMAT_## internal, \ - MALI_COLOR_FORMAT_## writeback, \ - { MALI_BLEND_PU_ ## internal | (srgb ? (1 << 20) : 0), \ - MALI_BLEND_AU_ ## internal | (srgb ? (1 << 20) : 0), }, \ - } +#define BFMT2(pipe, internal, writeback, srgb) \ + [PIPE_FORMAT_##pipe] = { \ + MALI_COLOR_BUFFER_INTERNAL_FORMAT_##internal, \ + MALI_COLOR_FORMAT_##writeback, \ + { \ + MALI_BLEND_PU_##internal | (srgb ? (1 << 20) : 0), \ + MALI_BLEND_AU_##internal | (srgb ? (1 << 20) : 0), \ + }, \ + } #endif -#define BFMT(pipe, internal_and_writeback) \ - BFMT2(pipe, internal_and_writeback, internal_and_writeback, 0) +#define BFMT(pipe, internal_and_writeback) \ + BFMT2(pipe, internal_and_writeback, internal_and_writeback, 0) -#define BFMT_SRGB(pipe, writeback) \ - BFMT2(pipe ##_UNORM, R8G8B8A8, writeback, 0), \ - BFMT2(pipe ##_SRGB, R8G8B8A8, writeback, 1) +#define BFMT_SRGB(pipe, writeback) \ + BFMT2(pipe##_UNORM, R8G8B8A8, writeback, 0), \ + BFMT2(pipe##_SRGB, R8G8B8A8, writeback, 1) const struct pan_blendable_format -GENX(panfrost_blendable_formats)[PIPE_FORMAT_COUNT] = { - BFMT_SRGB(L8, R8), - BFMT_SRGB(L8A8, R8G8), - BFMT_SRGB(R8, R8), - BFMT_SRGB(R8G8, R8G8), - BFMT_SRGB(R8G8B8, R8G8B8), + GENX(panfrost_blendable_formats)[PIPE_FORMAT_COUNT] = { + BFMT_SRGB(L8, R8), + BFMT_SRGB(L8A8, R8G8), + BFMT_SRGB(R8, R8), + BFMT_SRGB(R8G8, R8G8), + BFMT_SRGB(R8G8B8, R8G8B8), - BFMT_SRGB(B8G8R8A8, R8G8B8A8), - BFMT_SRGB(B8G8R8X8, R8G8B8A8), - BFMT_SRGB(A8R8G8B8, R8G8B8A8), - BFMT_SRGB(X8R8G8B8, R8G8B8A8), - BFMT_SRGB(A8B8G8R8, R8G8B8A8), - BFMT_SRGB(X8B8G8R8, R8G8B8A8), - BFMT_SRGB(R8G8B8X8, R8G8B8A8), - BFMT_SRGB(R8G8B8A8, R8G8B8A8), + BFMT_SRGB(B8G8R8A8, R8G8B8A8), + BFMT_SRGB(B8G8R8X8, R8G8B8A8), + BFMT_SRGB(A8R8G8B8, R8G8B8A8), + BFMT_SRGB(X8R8G8B8, R8G8B8A8), + BFMT_SRGB(A8B8G8R8, R8G8B8A8), + BFMT_SRGB(X8B8G8R8, R8G8B8A8), + BFMT_SRGB(R8G8B8X8, R8G8B8A8), + BFMT_SRGB(R8G8B8A8, R8G8B8A8), - BFMT2(A8_UNORM, R8G8B8A8, R8, 0), - BFMT2(I8_UNORM, R8G8B8A8, R8, 0), - BFMT2(R5G6B5_UNORM, R5G6B5A0, R5G6B5, 0), - BFMT2(B5G6R5_UNORM, R5G6B5A0, R5G6B5, 0), + BFMT2(A8_UNORM, R8G8B8A8, R8, 0), + BFMT2(I8_UNORM, R8G8B8A8, R8, 0), + BFMT2(R5G6B5_UNORM, R5G6B5A0, R5G6B5, 0), + BFMT2(B5G6R5_UNORM, R5G6B5A0, R5G6B5, 0), - BFMT(A4B4G4R4_UNORM, R4G4B4A4), - BFMT(B4G4R4A4_UNORM, R4G4B4A4), - BFMT(R4G4B4A4_UNORM, R4G4B4A4), + BFMT(A4B4G4R4_UNORM, R4G4B4A4), + BFMT(B4G4R4A4_UNORM, R4G4B4A4), + BFMT(R4G4B4A4_UNORM, R4G4B4A4), - BFMT(R10G10B10A2_UNORM, R10G10B10A2), - BFMT(B10G10R10A2_UNORM, R10G10B10A2), - BFMT(R10G10B10X2_UNORM, R10G10B10A2), - BFMT(B10G10R10X2_UNORM, R10G10B10A2), + BFMT(R10G10B10A2_UNORM, R10G10B10A2), + BFMT(B10G10R10A2_UNORM, R10G10B10A2), + BFMT(R10G10B10X2_UNORM, R10G10B10A2), + BFMT(B10G10R10X2_UNORM, R10G10B10A2), - BFMT(B5G5R5A1_UNORM, R5G5B5A1), - BFMT(R5G5B5A1_UNORM, R5G5B5A1), - BFMT(B5G5R5X1_UNORM, R5G5B5A1), + BFMT(B5G5R5A1_UNORM, R5G5B5A1), + BFMT(R5G5B5A1_UNORM, R5G5B5A1), + BFMT(B5G5R5X1_UNORM, R5G5B5A1), }; /* Convenience */ @@ -145,13 +149,11 @@ 
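The MALI_BLEND_* convenience macros and the BFMT2 entries above all follow one bit layout; a sketch of the v7+ variant (which carries no per-entry swizzle), where the tile-buffer format enum sits at bit 12 and the sRGB flag at bit 20 (the enum value passed in is a placeholder here):

#include <stdbool.h>
#include <stdint.h>

static uint32_t
pack_blend_pixel_format_sketch(uint32_t tb_format_enum, bool srgb)
{
   return (tb_format_enum << 12) | ((uint32_t)srgb << 20);
}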
GENX(panfrost_blendable_formats)[PIPE_FORMAT_COUNT] = { #define V6_RRRR PAN_V6_SWIZZLE(R, R, R, R) #define V6_GGGG PAN_V6_SWIZZLE(G, G, G, G) -#define FMT(pipe, mali, swizzle, srgb, flags) \ - [PIPE_FORMAT_ ## pipe] = { \ - .hw = ( V6_ ## swizzle ) | \ - (( MALI_ ## mali ) << 12) | \ - ((( SRGB_ ## srgb)) << 20), \ - .bind = FLAGS_ ## flags, \ - } +#define FMT(pipe, mali, swizzle, srgb, flags) \ + [PIPE_FORMAT_##pipe] = { \ + .hw = (V6_##swizzle) | ((MALI_##mali) << 12) | (((SRGB_##srgb)) << 20), \ + .bind = FLAGS_##flags, \ + } #else #define MALI_RGB_COMPONENT_ORDER_R001 MALI_RGB_COMPONENT_ORDER_RGB1 @@ -160,13 +162,12 @@ GENX(panfrost_blendable_formats)[PIPE_FORMAT_COUNT] = { #define MALI_RGB_COMPONENT_ORDER_GBA1 MALI_RGB_COMPONENT_ORDER_1RGB #define MALI_RGB_COMPONENT_ORDER_ABG1 MALI_RGB_COMPONENT_ORDER_1BGR -#define FMT(pipe, mali, swizzle, srgb, flags) \ - [PIPE_FORMAT_ ## pipe] = { \ - .hw = ( MALI_RGB_COMPONENT_ORDER_ ## swizzle ) | \ - (( MALI_ ## mali ) << 12) | \ - ((( SRGB_ ## srgb)) << 20), \ - .bind = FLAGS_ ## flags, \ - } +#define FMT(pipe, mali, swizzle, srgb, flags) \ + [PIPE_FORMAT_##pipe] = { \ + .hw = (MALI_RGB_COMPONENT_ORDER_##swizzle) | ((MALI_##mali) << 12) | \ + (((SRGB_##srgb)) << 20), \ + .bind = FLAGS_##flags, \ + } #endif /* clang-format off */ @@ -613,36 +614,40 @@ const struct panfrost_format GENX(panfrost_pipe_format)[PIPE_FORMAT_COUNT] = { struct pan_decomposed_swizzle GENX(pan_decompose_swizzle)(enum mali_rgb_component_order order) { -#define CASE(case_, pre_, R_, G_, B_, A_) \ - case MALI_RGB_COMPONENT_ORDER_##case_: \ - return (struct pan_decomposed_swizzle) { \ - MALI_RGB_COMPONENT_ORDER_##pre_, { \ - PIPE_SWIZZLE_##R_, PIPE_SWIZZLE_##G_, \ - PIPE_SWIZZLE_##B_, PIPE_SWIZZLE_##A_, \ - }, \ - }; +#define CASE(case_, pre_, R_, G_, B_, A_) \ + case MALI_RGB_COMPONENT_ORDER_##case_: \ + return (struct pan_decomposed_swizzle){ \ + MALI_RGB_COMPONENT_ORDER_##pre_, \ + { \ + PIPE_SWIZZLE_##R_, \ + PIPE_SWIZZLE_##G_, \ + PIPE_SWIZZLE_##B_, \ + PIPE_SWIZZLE_##A_, \ + }, \ + }; - switch (order) { - CASE(RGBA, RGBA, X, Y, Z, W); - CASE(GRBA, RGBA, Y, X, Z, W); - CASE(BGRA, RGBA, Z, Y, X, W); - CASE(ARGB, RGBA, Y, Z, W, X); - CASE(AGRB, RGBA, Z, Y, W, X); - CASE(ABGR, RGBA, W, Z, Y, X); - CASE(RGB1, RGB1, X, Y, Z, W); - CASE(GRB1, RGB1, Y, X, Z, W); - CASE(BGR1, RGB1, Z, Y, X, W); - CASE(1RGB, RGB1, Y, Z, W, X); - CASE(1GRB, RGB1, Z, Y, W, X); - CASE(1BGR, RGB1, W, Z, Y, X); - CASE(RRRR, RRRR, X, Y, Z, W); - CASE(RRR1, RRR1, X, Y, Z, W); - CASE(RRRA, RRRA, X, Y, Z, W); - CASE(000A, 000A, X, Y, Z, W); - CASE(0001, 0001, X, Y, Z, W); - CASE(0000, 0000, X, Y, Z, W); - default: unreachable("Invalid case for texturing"); - } + switch (order) { + CASE(RGBA, RGBA, X, Y, Z, W); + CASE(GRBA, RGBA, Y, X, Z, W); + CASE(BGRA, RGBA, Z, Y, X, W); + CASE(ARGB, RGBA, Y, Z, W, X); + CASE(AGRB, RGBA, Z, Y, W, X); + CASE(ABGR, RGBA, W, Z, Y, X); + CASE(RGB1, RGB1, X, Y, Z, W); + CASE(GRB1, RGB1, Y, X, Z, W); + CASE(BGR1, RGB1, Z, Y, X, W); + CASE(1RGB, RGB1, Y, Z, W, X); + CASE(1GRB, RGB1, Z, Y, W, X); + CASE(1BGR, RGB1, W, Z, Y, X); + CASE(RRRR, RRRR, X, Y, Z, W); + CASE(RRR1, RRR1, X, Y, Z, W); + CASE(RRRA, RRRA, X, Y, Z, W); + CASE(000A, 000A, X, Y, Z, W); + CASE(0001, 0001, X, Y, Z, W); + CASE(0000, 0000, X, Y, Z, W); + default: + unreachable("Invalid case for texturing"); + } #undef CASE } diff --git a/src/panfrost/lib/pan_format.h b/src/panfrost/lib/pan_format.h index a723a31b4d3..babf6d637c4 100644 --- a/src/panfrost/lib/pan_format.h +++ b/src/panfrost/lib/pan_format.h @@ 
-37,61 +37,62 @@ typedef uint32_t mali_pixel_format; struct panfrost_format { - mali_pixel_format hw; - unsigned bind; + mali_pixel_format hw; + unsigned bind; }; struct pan_blendable_format { - /* enum mali_color_buffer_internal_format */ uint16_t internal; - /* enum mali_mfbd_color_format */ uint16_t writeback; + /* enum mali_color_buffer_internal_format */ uint16_t internal; + /* enum mali_mfbd_color_format */ uint16_t writeback; - /* Indexed by the dithered? flag. So _PU first, then _AU */ - mali_pixel_format bifrost[2]; + /* Indexed by the dithered? flag. So _PU first, then _AU */ + mali_pixel_format bifrost[2]; }; -extern const struct pan_blendable_format panfrost_blendable_formats_v6[PIPE_FORMAT_COUNT]; -extern const struct pan_blendable_format panfrost_blendable_formats_v7[PIPE_FORMAT_COUNT]; -extern const struct pan_blendable_format panfrost_blendable_formats_v9[PIPE_FORMAT_COUNT]; +extern const struct pan_blendable_format + panfrost_blendable_formats_v6[PIPE_FORMAT_COUNT]; +extern const struct pan_blendable_format + panfrost_blendable_formats_v7[PIPE_FORMAT_COUNT]; +extern const struct pan_blendable_format + panfrost_blendable_formats_v9[PIPE_FORMAT_COUNT]; extern const struct panfrost_format panfrost_pipe_format_v6[PIPE_FORMAT_COUNT]; extern const struct panfrost_format panfrost_pipe_format_v7[PIPE_FORMAT_COUNT]; extern const struct panfrost_format panfrost_pipe_format_v9[PIPE_FORMAT_COUNT]; /* Helpers to construct swizzles */ -#define PAN_V6_SWIZZLE(R, G, B, A) ( \ - ((MALI_CHANNEL_ ## R) << 0) | \ - ((MALI_CHANNEL_ ## G) << 3) | \ - ((MALI_CHANNEL_ ## B) << 6) | \ - ((MALI_CHANNEL_ ## A) << 9)) +#define PAN_V6_SWIZZLE(R, G, B, A) \ + (((MALI_CHANNEL_##R) << 0) | ((MALI_CHANNEL_##G) << 3) | \ + ((MALI_CHANNEL_##B) << 6) | ((MALI_CHANNEL_##A) << 9)) static inline unsigned panfrost_get_default_swizzle(unsigned components) { - switch (components) { - case 1: - return PAN_V6_SWIZZLE(R, 0, 0, 1); - case 2: - return PAN_V6_SWIZZLE(R, G, 0, 1); - case 3: - return PAN_V6_SWIZZLE(R, G, B, 1); - case 4: - return PAN_V6_SWIZZLE(R, G, B, A); - default: - unreachable("Invalid number of components"); - } + switch (components) { + case 1: + return PAN_V6_SWIZZLE(R, 0, 0, 1); + case 2: + return PAN_V6_SWIZZLE(R, G, 0, 1); + case 3: + return PAN_V6_SWIZZLE(R, G, B, 1); + case 4: + return PAN_V6_SWIZZLE(R, G, B, A); + default: + unreachable("Invalid number of components"); + } } #if PAN_ARCH == 7 struct pan_decomposed_swizzle { - /* Component ordering to apply first */ - enum mali_rgb_component_order pre; + /* Component ordering to apply first */ + enum mali_rgb_component_order pre; - /* Bijective swizzle applied after */ - unsigned char post[4]; + /* Bijective swizzle applied after */ + unsigned char post[4]; }; struct pan_decomposed_swizzle -GENX(pan_decompose_swizzle)(enum mali_rgb_component_order order); + GENX(pan_decompose_swizzle)(enum mali_rgb_component_order order); #endif #endif diff --git a/src/panfrost/lib/pan_indirect_dispatch.c b/src/panfrost/lib/pan_indirect_dispatch.c index 8a6ad81167d..8f7e75e50ba 100644 --- a/src/panfrost/lib/pan_indirect_dispatch.c +++ b/src/panfrost/lib/pan_indirect_dispatch.c @@ -22,144 +22,146 @@ * */ -#include -#include "pan_bo.h" -#include "pan_shader.h" -#include "pan_scoreboard.h" -#include "pan_encoder.h" #include "pan_indirect_dispatch.h" -#include "pan_pool.h" -#include "pan_util.h" +#include #include "compiler/nir/nir_builder.h" -#include "util/u_memory.h" #include "util/macros.h" +#include "util/u_memory.h" +#include "pan_bo.h" +#include 
"pan_encoder.h" +#include "pan_pool.h" +#include "pan_scoreboard.h" +#include "pan_shader.h" +#include "pan_util.h" -#define get_input_field(b, name) \ - nir_load_push_constant(b, \ - 1, sizeof(((struct pan_indirect_dispatch_info *)0)->name) * 8, \ - nir_imm_int(b, 0), \ - .base = offsetof(struct pan_indirect_dispatch_info, name)) +#define get_input_field(b, name) \ + nir_load_push_constant( \ + b, 1, sizeof(((struct pan_indirect_dispatch_info *)0)->name) * 8, \ + nir_imm_int(b, 0), \ + .base = offsetof(struct pan_indirect_dispatch_info, name)) static mali_ptr get_rsd(const struct panfrost_device *dev) { - return dev->indirect_dispatch.descs->ptr.gpu; + return dev->indirect_dispatch.descs->ptr.gpu; } static mali_ptr get_tls(const struct panfrost_device *dev) { - return dev->indirect_dispatch.descs->ptr.gpu + - pan_size(RENDERER_STATE); + return dev->indirect_dispatch.descs->ptr.gpu + pan_size(RENDERER_STATE); } static void pan_indirect_dispatch_init(struct panfrost_device *dev) { - nir_builder b = - nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, - GENX(pan_shader_get_compiler_options)(), - "%s", "indirect_dispatch"); - nir_ssa_def *zero = nir_imm_int(&b, 0); - nir_ssa_def *one = nir_imm_int(&b, 1); - nir_ssa_def *num_wg = nir_load_global(&b, get_input_field(&b, indirect_dim), 4, 3, 32); - nir_ssa_def *num_wg_x = nir_channel(&b, num_wg, 0); - nir_ssa_def *num_wg_y = nir_channel(&b, num_wg, 1); - nir_ssa_def *num_wg_z = nir_channel(&b, num_wg, 2); + nir_builder b = nir_builder_init_simple_shader( + MESA_SHADER_COMPUTE, GENX(pan_shader_get_compiler_options)(), "%s", + "indirect_dispatch"); + nir_ssa_def *zero = nir_imm_int(&b, 0); + nir_ssa_def *one = nir_imm_int(&b, 1); + nir_ssa_def *num_wg = + nir_load_global(&b, get_input_field(&b, indirect_dim), 4, 3, 32); + nir_ssa_def *num_wg_x = nir_channel(&b, num_wg, 0); + nir_ssa_def *num_wg_y = nir_channel(&b, num_wg, 1); + nir_ssa_def *num_wg_z = nir_channel(&b, num_wg, 2); - nir_ssa_def *job_hdr_ptr = get_input_field(&b, job); - nir_ssa_def *num_wg_flat = nir_imul(&b, num_wg_x, nir_imul(&b, num_wg_y, num_wg_z)); + nir_ssa_def *job_hdr_ptr = get_input_field(&b, job); + nir_ssa_def *num_wg_flat = + nir_imul(&b, num_wg_x, nir_imul(&b, num_wg_y, num_wg_z)); - nir_push_if(&b, nir_ieq(&b, num_wg_flat, zero)); - { - nir_ssa_def *type_ptr = nir_iadd(&b, job_hdr_ptr, nir_imm_int64(&b, 4 * 4)); - nir_ssa_def *ntype = nir_imm_intN_t(&b, (MALI_JOB_TYPE_NULL << 1) | 1, 8); - nir_store_global(&b, type_ptr, 1, ntype, 1); - } - nir_push_else(&b, NULL); - { - nir_ssa_def *job_dim_ptr = nir_iadd(&b, job_hdr_ptr, - nir_imm_int64(&b, pan_section_offset(COMPUTE_JOB, INVOCATION))); - nir_ssa_def *num_wg_x_m1 = nir_isub(&b, num_wg_x, one); - nir_ssa_def *num_wg_y_m1 = nir_isub(&b, num_wg_y, one); - nir_ssa_def *num_wg_z_m1 = nir_isub(&b, num_wg_z, one); - nir_ssa_def *job_dim = nir_load_global(&b, job_dim_ptr, 8, 2, 32); - nir_ssa_def *dims = nir_channel(&b, job_dim, 0); - nir_ssa_def *split = nir_channel(&b, job_dim, 1); - nir_ssa_def *num_wg_x_split = nir_iand_imm(&b, nir_ushr_imm(&b, split, 10), 0x3f); - nir_ssa_def *num_wg_y_split = nir_iadd(&b, num_wg_x_split, - nir_isub_imm(&b, 32, nir_uclz(&b, num_wg_x_m1))); - nir_ssa_def *num_wg_z_split = nir_iadd(&b, num_wg_y_split, - nir_isub_imm(&b, 32, nir_uclz(&b, num_wg_y_m1))); - split = nir_ior(&b, split, - nir_ior(&b, - nir_ishl(&b, num_wg_y_split, nir_imm_int(&b, 16)), - nir_ishl(&b, num_wg_z_split, nir_imm_int(&b, 22)))); - dims = nir_ior(&b, dims, - nir_ior(&b, nir_ishl(&b, num_wg_x_m1, num_wg_x_split), - 
nir_ior(&b, nir_ishl(&b, num_wg_y_m1, num_wg_y_split), - nir_ishl(&b, num_wg_z_m1, num_wg_z_split)))); + nir_push_if(&b, nir_ieq(&b, num_wg_flat, zero)); + { + nir_ssa_def *type_ptr = + nir_iadd(&b, job_hdr_ptr, nir_imm_int64(&b, 4 * 4)); + nir_ssa_def *ntype = nir_imm_intN_t(&b, (MALI_JOB_TYPE_NULL << 1) | 1, 8); + nir_store_global(&b, type_ptr, 1, ntype, 1); + } + nir_push_else(&b, NULL); + { + nir_ssa_def *job_dim_ptr = nir_iadd( + &b, job_hdr_ptr, + nir_imm_int64(&b, pan_section_offset(COMPUTE_JOB, INVOCATION))); + nir_ssa_def *num_wg_x_m1 = nir_isub(&b, num_wg_x, one); + nir_ssa_def *num_wg_y_m1 = nir_isub(&b, num_wg_y, one); + nir_ssa_def *num_wg_z_m1 = nir_isub(&b, num_wg_z, one); + nir_ssa_def *job_dim = nir_load_global(&b, job_dim_ptr, 8, 2, 32); + nir_ssa_def *dims = nir_channel(&b, job_dim, 0); + nir_ssa_def *split = nir_channel(&b, job_dim, 1); + nir_ssa_def *num_wg_x_split = + nir_iand_imm(&b, nir_ushr_imm(&b, split, 10), 0x3f); + nir_ssa_def *num_wg_y_split = nir_iadd( + &b, num_wg_x_split, nir_isub_imm(&b, 32, nir_uclz(&b, num_wg_x_m1))); + nir_ssa_def *num_wg_z_split = nir_iadd( + &b, num_wg_y_split, nir_isub_imm(&b, 32, nir_uclz(&b, num_wg_y_m1))); + split = + nir_ior(&b, split, + nir_ior(&b, nir_ishl(&b, num_wg_y_split, nir_imm_int(&b, 16)), + nir_ishl(&b, num_wg_z_split, nir_imm_int(&b, 22)))); + dims = + nir_ior(&b, dims, + nir_ior(&b, nir_ishl(&b, num_wg_x_m1, num_wg_x_split), + nir_ior(&b, nir_ishl(&b, num_wg_y_m1, num_wg_y_split), + nir_ishl(&b, num_wg_z_m1, num_wg_z_split)))); - nir_store_global(&b, job_dim_ptr, 8, nir_vec2(&b, dims, split), 3); + nir_store_global(&b, job_dim_ptr, 8, nir_vec2(&b, dims, split), 3); - nir_ssa_def *num_wg_x_ptr = get_input_field(&b, num_wg_sysval[0]); + nir_ssa_def *num_wg_x_ptr = get_input_field(&b, num_wg_sysval[0]); - nir_push_if(&b, nir_ine(&b, num_wg_x_ptr, nir_imm_int64(&b, 0))); - { - nir_store_global(&b, num_wg_x_ptr, 8, num_wg_x, 1); - nir_store_global(&b, get_input_field(&b, num_wg_sysval[1]), 8, num_wg_y, 1); - nir_store_global(&b, get_input_field(&b, num_wg_sysval[2]), 8, num_wg_z, 1); - } - nir_pop_if(&b, NULL); - } + nir_push_if(&b, nir_ine(&b, num_wg_x_ptr, nir_imm_int64(&b, 0))); + { + nir_store_global(&b, num_wg_x_ptr, 8, num_wg_x, 1); + nir_store_global(&b, get_input_field(&b, num_wg_sysval[1]), 8, + num_wg_y, 1); + nir_store_global(&b, get_input_field(&b, num_wg_sysval[2]), 8, + num_wg_z, 1); + } + nir_pop_if(&b, NULL); + } - nir_pop_if(&b, NULL); + nir_pop_if(&b, NULL); - struct panfrost_compile_inputs inputs = { - .gpu_id = dev->gpu_id, - .fixed_sysval_ubo = -1, - .no_ubo_to_push = true, - }; - struct pan_shader_info shader_info; - struct util_dynarray binary; + struct panfrost_compile_inputs inputs = { + .gpu_id = dev->gpu_id, + .fixed_sysval_ubo = -1, + .no_ubo_to_push = true, + }; + struct pan_shader_info shader_info; + struct util_dynarray binary; - util_dynarray_init(&binary, NULL); - GENX(pan_shader_compile)(b.shader, &inputs, &binary, &shader_info); + util_dynarray_init(&binary, NULL); + GENX(pan_shader_compile)(b.shader, &inputs, &binary, &shader_info); - ralloc_free(b.shader); + ralloc_free(b.shader); - assert(!shader_info.tls_size); - assert(!shader_info.wls_size); - assert(!shader_info.sysvals.sysval_count); + assert(!shader_info.tls_size); + assert(!shader_info.wls_size); + assert(!shader_info.sysvals.sysval_count); - shader_info.push.count = - DIV_ROUND_UP(sizeof(struct pan_indirect_dispatch_info), 4); + shader_info.push.count = + DIV_ROUND_UP(sizeof(struct pan_indirect_dispatch_info), 4); - 
dev->indirect_dispatch.bin = - panfrost_bo_create(dev, binary.size, PAN_BO_EXECUTE, - "Indirect dispatch shader"); + dev->indirect_dispatch.bin = panfrost_bo_create( + dev, binary.size, PAN_BO_EXECUTE, "Indirect dispatch shader"); - memcpy(dev->indirect_dispatch.bin->ptr.cpu, binary.data, binary.size); - util_dynarray_fini(&binary); + memcpy(dev->indirect_dispatch.bin->ptr.cpu, binary.data, binary.size); + util_dynarray_fini(&binary); - dev->indirect_dispatch.descs = - panfrost_bo_create(dev, - pan_size(RENDERER_STATE) + - pan_size(LOCAL_STORAGE), - 0, "Indirect dispatch descriptors"); + dev->indirect_dispatch.descs = panfrost_bo_create( + dev, pan_size(RENDERER_STATE) + pan_size(LOCAL_STORAGE), 0, + "Indirect dispatch descriptors"); - mali_ptr address = dev->indirect_dispatch.bin->ptr.gpu; + mali_ptr address = dev->indirect_dispatch.bin->ptr.gpu; - void *rsd = dev->indirect_dispatch.descs->ptr.cpu; - pan_pack(rsd, RENDERER_STATE, cfg) { - pan_shader_prepare_rsd(&shader_info, address, &cfg); - } + void *rsd = dev->indirect_dispatch.descs->ptr.cpu; + pan_pack(rsd, RENDERER_STATE, cfg) { + pan_shader_prepare_rsd(&shader_info, address, &cfg); + } - void *tsd = dev->indirect_dispatch.descs->ptr.cpu + - pan_size(RENDERER_STATE); - pan_pack(tsd, LOCAL_STORAGE, ls) { - ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM; - }; + void *tsd = dev->indirect_dispatch.descs->ptr.cpu + pan_size(RENDERER_STATE); + pan_pack(tsd, LOCAL_STORAGE, ls) { + ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM; + }; } unsigned @@ -167,38 +169,35 @@ GENX(pan_indirect_dispatch_emit)(struct pan_pool *pool, struct pan_scoreboard *scoreboard, const struct pan_indirect_dispatch_info *inputs) { - struct panfrost_device *dev = pool->dev; - struct panfrost_ptr job = - pan_pool_alloc_desc(pool, COMPUTE_JOB); - void *invocation = - pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION); + struct panfrost_device *dev = pool->dev; + struct panfrost_ptr job = pan_pool_alloc_desc(pool, COMPUTE_JOB); + void *invocation = pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION); - /* If we haven't compiled the indirect dispatch shader yet, do it now */ - if (!dev->indirect_dispatch.bin) - pan_indirect_dispatch_init(dev); + /* If we haven't compiled the indirect dispatch shader yet, do it now */ + if (!dev->indirect_dispatch.bin) + pan_indirect_dispatch_init(dev); - panfrost_pack_work_groups_compute(invocation, - 1, 1, 1, 1, 1, 1, - false, false); + panfrost_pack_work_groups_compute(invocation, 1, 1, 1, 1, 1, 1, false, + false); - pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) { - cfg.job_task_split = 2; - } + pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) { + cfg.job_task_split = 2; + } - pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) { - cfg.state = get_rsd(dev); - cfg.thread_storage = get_tls(pool->dev); - cfg.push_uniforms = - pan_pool_upload_aligned(pool, inputs, sizeof(*inputs), 16); - } + pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) { + cfg.state = get_rsd(dev); + cfg.thread_storage = get_tls(pool->dev); + cfg.push_uniforms = + pan_pool_upload_aligned(pool, inputs, sizeof(*inputs), 16); + } - return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE, - false, true, 0, 0, &job, false); + return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE, false, true, + 0, 0, &job, false); } void GENX(pan_indirect_dispatch_cleanup)(struct panfrost_device *dev) { - panfrost_bo_unreference(dev->indirect_dispatch.bin); - panfrost_bo_unreference(dev->indirect_dispatch.descs); + 
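A hedged usage sketch for pan_indirect_dispatch_emit(), assuming the caller already owns a pool and scoreboard and knows the GPU addresses of the compute job to patch and of the indirect-arguments buffer (the wrapper and its parameter names below are placeholders, not taken from this patch):

#include "pan_indirect_dispatch.h"

static unsigned
queue_indirect_patch_job_sketch(struct pan_pool *pool,
                                struct pan_scoreboard *scoreboard,
                                mali_ptr compute_job, mali_ptr indirect_args)
{
   struct pan_indirect_dispatch_info info = {
      .job = compute_job,
      .indirect_dim = indirect_args,
      /* num_wg_sysval left zeroed: no gl_NumWorkGroups write-back */
   };

   return GENX(pan_indirect_dispatch_emit)(pool, scoreboard, &info);
}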
panfrost_bo_unreference(dev->indirect_dispatch.bin); + panfrost_bo_unreference(dev->indirect_dispatch.descs); } diff --git a/src/panfrost/lib/pan_indirect_dispatch.h b/src/panfrost/lib/pan_indirect_dispatch.h index f39e5f9fce4..0dd86f04988 100644 --- a/src/panfrost/lib/pan_indirect_dispatch.h +++ b/src/panfrost/lib/pan_indirect_dispatch.h @@ -24,25 +24,23 @@ #ifndef __PAN_INDIRECT_DISPATCH_SHADERS_H__ #define __PAN_INDIRECT_DISPATCH_SHADERS_H__ -#include "pan_scoreboard.h" #include "genxml/gen_macros.h" +#include "pan_scoreboard.h" struct pan_device; struct pan_scoreboard; struct pan_pool; struct pan_indirect_dispatch_info { - mali_ptr job; - mali_ptr indirect_dim; - mali_ptr num_wg_sysval[3]; + mali_ptr job; + mali_ptr indirect_dim; + mali_ptr num_wg_sysval[3]; } PACKED; -unsigned -GENX(pan_indirect_dispatch_emit)(struct pan_pool *pool, - struct pan_scoreboard *scoreboard, - const struct pan_indirect_dispatch_info *dispatch_info); +unsigned GENX(pan_indirect_dispatch_emit)( + struct pan_pool *pool, struct pan_scoreboard *scoreboard, + const struct pan_indirect_dispatch_info *dispatch_info); -void -GENX(pan_indirect_dispatch_cleanup)(struct panfrost_device *dev); +void GENX(pan_indirect_dispatch_cleanup)(struct panfrost_device *dev); #endif diff --git a/src/panfrost/lib/pan_layout.c b/src/panfrost/lib/pan_layout.c index bcb5af97f4f..981779c9bc3 100644 --- a/src/panfrost/lib/pan_layout.c +++ b/src/panfrost/lib/pan_layout.c @@ -35,33 +35,27 @@ /* clang-format on */ uint64_t pan_best_modifiers[PAN_MODIFIER_COUNT] = { DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | - AFBC_FORMAT_MOD_TILED | - AFBC_FORMAT_MOD_SC | - AFBC_FORMAT_MOD_SPARSE | - AFBC_FORMAT_MOD_YTR), + AFBC_FORMAT_MOD_TILED | AFBC_FORMAT_MOD_SC | + AFBC_FORMAT_MOD_SPARSE | AFBC_FORMAT_MOD_YTR), DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | - AFBC_FORMAT_MOD_TILED | - AFBC_FORMAT_MOD_SC | + AFBC_FORMAT_MOD_TILED | AFBC_FORMAT_MOD_SC | AFBC_FORMAT_MOD_SPARSE), DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | - AFBC_FORMAT_MOD_SPARSE | - AFBC_FORMAT_MOD_YTR), + AFBC_FORMAT_MOD_SPARSE | AFBC_FORMAT_MOD_YTR), DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | AFBC_FORMAT_MOD_SPARSE), DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED, - DRM_FORMAT_MOD_LINEAR -}; + DRM_FORMAT_MOD_LINEAR}; /* Table of AFBC superblock sizes */ -static const struct pan_block_size -afbc_superblock_sizes[] = { - [AFBC_FORMAT_MOD_BLOCK_SIZE_16x16] = { 16, 16 }, - [AFBC_FORMAT_MOD_BLOCK_SIZE_32x8] = { 32, 8 }, - [AFBC_FORMAT_MOD_BLOCK_SIZE_64x4] = { 64, 4 }, +static const struct pan_block_size afbc_superblock_sizes[] = { + [AFBC_FORMAT_MOD_BLOCK_SIZE_16x16] = {16, 16}, + [AFBC_FORMAT_MOD_BLOCK_SIZE_32x8] = {32, 8}, + [AFBC_FORMAT_MOD_BLOCK_SIZE_64x4] = {64, 4}, }; /* clang-format off */ diff --git a/src/panfrost/lib/pan_pool.h b/src/panfrost/lib/pan_pool.h index 01c8348c41d..c499994399a 100644 --- a/src/panfrost/lib/pan_pool.h +++ b/src/panfrost/lib/pan_pool.h @@ -34,100 +34,99 @@ /* Represents grow-only memory. */ struct pan_pool { - /* Parent device for allocation */ - struct panfrost_device *dev; + /* Parent device for allocation */ + struct panfrost_device *dev; - /* Label for created BOs */ - const char *label; + /* Label for created BOs */ + const char *label; - /* BO flags to use in the pool */ - unsigned create_flags; + /* BO flags to use in the pool */ + unsigned create_flags; - /* Minimum size for allocated BOs. */ - size_t slab_size; + /* Minimum size for allocated BOs. 
*/ + size_t slab_size; }; static inline void pan_pool_init(struct pan_pool *pool, struct panfrost_device *dev, unsigned create_flags, size_t slab_size, const char *label) { - pool->dev = dev; - pool->create_flags = create_flags; - pool->slab_size = slab_size; - pool->label = label; + pool->dev = dev; + pool->create_flags = create_flags; + pool->slab_size = slab_size; + pool->label = label; } /* Represents a fat pointer for GPU-mapped memory, returned from the transient * allocator and not used for much else */ -struct panfrost_ptr -pan_pool_alloc_aligned(struct pan_pool *pool, size_t sz, unsigned alignment); +struct panfrost_ptr pan_pool_alloc_aligned(struct pan_pool *pool, size_t sz, + unsigned alignment); -#define PAN_POOL_ALLOCATOR(pool_subclass, alloc_func) \ -struct panfrost_ptr \ -pan_pool_alloc_aligned(struct pan_pool *p, size_t sz, unsigned alignment) \ -{ \ - pool_subclass *pool = container_of(p, pool_subclass, base); \ - return alloc_func(pool, sz, alignment); \ -} +#define PAN_POOL_ALLOCATOR(pool_subclass, alloc_func) \ + struct panfrost_ptr pan_pool_alloc_aligned(struct pan_pool *p, size_t sz, \ + unsigned alignment) \ + { \ + pool_subclass *pool = container_of(p, pool_subclass, base); \ + return alloc_func(pool, sz, alignment); \ + } static inline mali_ptr -pan_pool_upload_aligned(struct pan_pool *pool, const void *data, size_t sz, unsigned alignment) +pan_pool_upload_aligned(struct pan_pool *pool, const void *data, size_t sz, + unsigned alignment) { - struct panfrost_ptr transfer = pan_pool_alloc_aligned(pool, sz, alignment); - memcpy(transfer.cpu, data, sz); - return transfer.gpu; + struct panfrost_ptr transfer = pan_pool_alloc_aligned(pool, sz, alignment); + memcpy(transfer.cpu, data, sz); + return transfer.gpu; } static inline mali_ptr pan_pool_upload(struct pan_pool *pool, const void *data, size_t sz) { - return pan_pool_upload_aligned(pool, data, sz, sz); + return pan_pool_upload_aligned(pool, data, sz, sz); } struct pan_desc_alloc_info { - unsigned size; - unsigned align; - unsigned nelems; + unsigned size; + unsigned align; + unsigned nelems; }; -#define PAN_DESC_ARRAY(count, name) \ - { \ - .size = pan_size(name), \ - .align = pan_alignment(name), \ - .nelems = count, \ - } +#define PAN_DESC_ARRAY(count, name) \ + { \ + .size = pan_size(name), .align = pan_alignment(name), .nelems = count, \ + } #define PAN_DESC(name) PAN_DESC_ARRAY(1, name) -#define PAN_DESC_AGGREGATE(...) \ - (struct pan_desc_alloc_info[]) { \ - __VA_ARGS__, \ - { 0 }, \ - } +#define PAN_DESC_AGGREGATE(...) 
\ + (struct pan_desc_alloc_info[]) \ + { \ + __VA_ARGS__, {0}, \ + } static inline struct panfrost_ptr pan_pool_alloc_descs(struct pan_pool *pool, const struct pan_desc_alloc_info *descs) { - unsigned size = 0; - unsigned align = descs[0].align; + unsigned size = 0; + unsigned align = descs[0].align; - for (unsigned i = 0; descs[i].size; i++) { - assert(!(size & (descs[i].align - 1))); - size += descs[i].size * descs[i].nelems; - } + for (unsigned i = 0; descs[i].size; i++) { + assert(!(size & (descs[i].align - 1))); + size += descs[i].size * descs[i].nelems; + } - return pan_pool_alloc_aligned(pool, size, align); + return pan_pool_alloc_aligned(pool, size, align); } -#define pan_pool_alloc_desc(pool, name) \ - pan_pool_alloc_descs(pool, PAN_DESC_AGGREGATE(PAN_DESC(name))) +#define pan_pool_alloc_desc(pool, name) \ + pan_pool_alloc_descs(pool, PAN_DESC_AGGREGATE(PAN_DESC(name))) -#define pan_pool_alloc_desc_array(pool, count, name) \ - pan_pool_alloc_descs(pool, PAN_DESC_AGGREGATE(PAN_DESC_ARRAY(count, name))) +#define pan_pool_alloc_desc_array(pool, count, name) \ + pan_pool_alloc_descs(pool, PAN_DESC_AGGREGATE(PAN_DESC_ARRAY(count, name))) -#define pan_pool_alloc_desc_aggregate(pool, ...) \ - pan_pool_alloc_descs(pool, PAN_DESC_AGGREGATE(__VA_ARGS__)) +#define pan_pool_alloc_desc_aggregate(pool, ...) \ + pan_pool_alloc_descs(pool, PAN_DESC_AGGREGATE(__VA_ARGS__)) #endif diff --git a/src/panfrost/lib/pan_props.c b/src/panfrost/lib/pan_props.c index f7a5d22bbf5..7b698dec8e8 100644 --- a/src/panfrost/lib/pan_props.c +++ b/src/panfrost/lib/pan_props.c @@ -26,31 +26,30 @@ #include -#include "util/u_math.h" -#include "util/macros.h" -#include "util/hash_table.h" -#include "util/u_thread.h" #include "drm-uapi/panfrost_drm.h" -#include "pan_encoder.h" -#include "pan_device.h" +#include "util/hash_table.h" +#include "util/macros.h" +#include "util/u_math.h" +#include "util/u_thread.h" #include "pan_bo.h" +#include "pan_device.h" +#include "pan_encoder.h" #include "pan_texture.h" -#include "wrap.h" #include "pan_util.h" +#include "wrap.h" /* Fixed "minimum revisions" */ -#define NO_ANISO (~0) +#define NO_ANISO (~0) #define HAS_ANISO (0) -#define MODEL(gpu_id_, shortname, counters_, min_rev_anisotropic_, tib_size_, quirks_) \ - { \ - .gpu_id = gpu_id_, \ - .name = "Mali-" shortname " (Panfrost)", \ - .performance_counters = counters_, \ - .min_rev_anisotropic = min_rev_anisotropic_, \ - .tilebuffer_size = tib_size_, \ - .quirks = quirks_, \ - } +#define MODEL(gpu_id_, shortname, counters_, min_rev_anisotropic_, tib_size_, \ + quirks_) \ + { \ + .gpu_id = gpu_id_, .name = "Mali-" shortname " (Panfrost)", \ + .performance_counters = counters_, \ + .min_rev_anisotropic = min_rev_anisotropic_, \ + .tilebuffer_size = tib_size_, .quirks = quirks_, \ + } /* Table of supported Mali GPUs */ /* clang-format off */ @@ -85,92 +84,90 @@ const struct panfrost_model panfrost_model_list[] = { const struct panfrost_model * panfrost_get_model(uint32_t gpu_id) { - for (unsigned i = 0; i < ARRAY_SIZE(panfrost_model_list); ++i) { - if (panfrost_model_list[i].gpu_id == gpu_id) - return &panfrost_model_list[i]; - } + for (unsigned i = 0; i < ARRAY_SIZE(panfrost_model_list); ++i) { + if (panfrost_model_list[i].gpu_id == gpu_id) + return &panfrost_model_list[i]; + } - return NULL; + return NULL; } /* Abstraction over the raw drm_panfrost_get_param ioctl for fetching * information about devices */ static __u64 -panfrost_query_raw( - int fd, - enum drm_panfrost_param param, - bool required, - unsigned default_value) 
+panfrost_query_raw(int fd, enum drm_panfrost_param param, bool required, + unsigned default_value) { - struct drm_panfrost_get_param get_param = {0,}; - ASSERTED int ret; + struct drm_panfrost_get_param get_param = { + 0, + }; + ASSERTED int ret; - get_param.param = param; - ret = drmIoctl(fd, DRM_IOCTL_PANFROST_GET_PARAM, &get_param); + get_param.param = param; + ret = drmIoctl(fd, DRM_IOCTL_PANFROST_GET_PARAM, &get_param); - if (ret) { - assert(!required); - return default_value; - } + if (ret) { + assert(!required); + return default_value; + } - return get_param.value; + return get_param.value; } static unsigned panfrost_query_gpu_version(int fd) { - return panfrost_query_raw(fd, DRM_PANFROST_PARAM_GPU_PROD_ID, true, 0); + return panfrost_query_raw(fd, DRM_PANFROST_PARAM_GPU_PROD_ID, true, 0); } static unsigned panfrost_query_gpu_revision(int fd) { - return panfrost_query_raw(fd, DRM_PANFROST_PARAM_GPU_REVISION, true, 0); + return panfrost_query_raw(fd, DRM_PANFROST_PARAM_GPU_REVISION, true, 0); } unsigned panfrost_query_l2_slices(const struct panfrost_device *dev) { - /* Query MEM_FEATURES register */ - uint32_t mem_features = - panfrost_query_raw(dev->fd, DRM_PANFROST_PARAM_MEM_FEATURES, - true, 0); + /* Query MEM_FEATURES register */ + uint32_t mem_features = + panfrost_query_raw(dev->fd, DRM_PANFROST_PARAM_MEM_FEATURES, true, 0); - /* L2_SLICES is MEM_FEATURES[11:8] minus(1) */ - return ((mem_features >> 8) & 0xF) + 1; + /* L2_SLICES is MEM_FEATURES[11:8] minus(1) */ + return ((mem_features >> 8) & 0xF) + 1; } static struct panfrost_tiler_features panfrost_query_tiler_features(int fd) { - /* Default value (2^9 bytes and 8 levels) to match old behaviour */ - uint32_t raw = panfrost_query_raw(fd, DRM_PANFROST_PARAM_TILER_FEATURES, - false, 0x809); + /* Default value (2^9 bytes and 8 levels) to match old behaviour */ + uint32_t raw = + panfrost_query_raw(fd, DRM_PANFROST_PARAM_TILER_FEATURES, false, 0x809); - /* Bin size is log2 in the first byte, max levels in the second byte */ - return (struct panfrost_tiler_features) { - .bin_size = (1 << (raw & BITFIELD_MASK(5))), - .max_levels = (raw >> 8) & BITFIELD_MASK(4), - }; + /* Bin size is log2 in the first byte, max levels in the second byte */ + return (struct panfrost_tiler_features){ + .bin_size = (1 << (raw & BITFIELD_MASK(5))), + .max_levels = (raw >> 8) & BITFIELD_MASK(4), + }; } static unsigned panfrost_query_core_count(int fd, unsigned *core_id_range) { - /* On older kernels, worst-case to 16 cores */ + /* On older kernels, worst-case to 16 cores */ - unsigned mask = panfrost_query_raw(fd, - DRM_PANFROST_PARAM_SHADER_PRESENT, false, 0xffff); + unsigned mask = + panfrost_query_raw(fd, DRM_PANFROST_PARAM_SHADER_PRESENT, false, 0xffff); - /* Some cores might be absent. In some cases, we care - * about the range of core IDs (that is, the greatest core ID + 1). If - * the core mask is contiguous, this equals the core count. - */ - *core_id_range = util_last_bit(mask); + /* Some cores might be absent. In some cases, we care + * about the range of core IDs (that is, the greatest core ID + 1). If + * the core mask is contiguous, this equals the core count. 
+ */ + *core_id_range = util_last_bit(mask); - /* The actual core count skips overs the gaps */ - return util_bitcount(mask); + /* The actual core count skips overs the gaps */ + return util_bitcount(mask); } /* Architectural maximums, since this register may be not implemented @@ -180,57 +177,52 @@ panfrost_query_core_count(int fd, unsigned *core_id_range) static unsigned panfrost_max_thread_count(unsigned arch) { - switch (arch) { - /* Midgard */ - case 4: - case 5: - return 256; + switch (arch) { + /* Midgard */ + case 4: + case 5: + return 256; - /* Bifrost, first generation */ - case 6: - return 384; + /* Bifrost, first generation */ + case 6: + return 384; - /* Bifrost, second generation (G31 is 512 but it doesn't matter) */ - case 7: - return 768; + /* Bifrost, second generation (G31 is 512 but it doesn't matter) */ + case 7: + return 768; - /* Valhall (for completeness) */ - default: - return 1024; - } + /* Valhall (for completeness) */ + default: + return 1024; + } } static unsigned panfrost_query_thread_tls_alloc(int fd, unsigned major) { - unsigned tls = panfrost_query_raw(fd, - DRM_PANFROST_PARAM_THREAD_TLS_ALLOC, false, 0); + unsigned tls = + panfrost_query_raw(fd, DRM_PANFROST_PARAM_THREAD_TLS_ALLOC, false, 0); - return (tls > 0) ? tls : panfrost_max_thread_count(major); + return (tls > 0) ? tls : panfrost_max_thread_count(major); } static uint32_t panfrost_query_compressed_formats(int fd) { - /* If unspecified, assume ASTC/ETC only. Factory default for Juno, and - * should exist on any Mali configuration. All hardware should report - * these texture formats but the kernel might not be new enough. */ + /* If unspecified, assume ASTC/ETC only. Factory default for Juno, and + * should exist on any Mali configuration. All hardware should report + * these texture formats but the kernel might not be new enough. */ - uint32_t default_set = - (1 << MALI_ETC2_RGB8) | - (1 << MALI_ETC2_R11_UNORM) | - (1 << MALI_ETC2_RGBA8) | - (1 << MALI_ETC2_RG11_UNORM) | - (1 << MALI_ETC2_R11_SNORM) | - (1 << MALI_ETC2_RG11_SNORM) | - (1 << MALI_ETC2_RGB8A1) | - (1 << MALI_ASTC_3D_LDR) | - (1 << MALI_ASTC_3D_HDR) | - (1 << MALI_ASTC_2D_LDR) | - (1 << MALI_ASTC_2D_HDR); + uint32_t default_set = (1 << MALI_ETC2_RGB8) | (1 << MALI_ETC2_R11_UNORM) | + (1 << MALI_ETC2_RGBA8) | (1 << MALI_ETC2_RG11_UNORM) | + (1 << MALI_ETC2_R11_SNORM) | + (1 << MALI_ETC2_RG11_SNORM) | + (1 << MALI_ETC2_RGB8A1) | (1 << MALI_ASTC_3D_LDR) | + (1 << MALI_ASTC_3D_HDR) | (1 << MALI_ASTC_2D_LDR) | + (1 << MALI_ASTC_2D_HDR); - return panfrost_query_raw(fd, DRM_PANFROST_PARAM_TEXTURE_FEATURES0, - false, default_set); + return panfrost_query_raw(fd, DRM_PANFROST_PARAM_TEXTURE_FEATURES0, false, + default_set); } /* DRM_PANFROST_PARAM_TEXTURE_FEATURES0 will return a bitmask of supported @@ -239,13 +231,13 @@ panfrost_query_compressed_formats(int fd) bool panfrost_supports_compressed_format(struct panfrost_device *dev, unsigned fmt) { - if (MALI_EXTRACT_TYPE(fmt) != MALI_FORMAT_COMPRESSED) - return true; + if (MALI_EXTRACT_TYPE(fmt) != MALI_FORMAT_COMPRESSED) + return true; - unsigned idx = fmt & ~MALI_FORMAT_COMPRESSED; - assert(idx < 32); + unsigned idx = fmt & ~MALI_FORMAT_COMPRESSED; + assert(idx < 32); - return dev->compressed_formats & (1 << idx); + return dev->compressed_formats & (1 << idx); } /* Check for AFBC hardware support. AFBC is introduced in v5. 
Implementations @@ -254,11 +246,10 @@ panfrost_supports_compressed_format(struct panfrost_device *dev, unsigned fmt) static bool panfrost_query_afbc(int fd, unsigned arch) { - unsigned reg = panfrost_query_raw(fd, - DRM_PANFROST_PARAM_AFBC_FEATURES, - false, 0); + unsigned reg = + panfrost_query_raw(fd, DRM_PANFROST_PARAM_AFBC_FEATURES, false, 0); - return (arch >= 5) && (reg == 0); + return (arch >= 5) && (reg == 0); } /* @@ -271,83 +262,83 @@ panfrost_query_afbc(int fd, unsigned arch) static unsigned panfrost_query_optimal_tib_size(const struct panfrost_device *dev) { - /* Preconditions ensure the returned value is a multiple of 1 KiB, the - * granularity of the colour buffer allocation field. - */ - assert(dev->model->tilebuffer_size >= 2048); - assert(util_is_power_of_two_nonzero(dev->model->tilebuffer_size)); + /* Preconditions ensure the returned value is a multiple of 1 KiB, the + * granularity of the colour buffer allocation field. + */ + assert(dev->model->tilebuffer_size >= 2048); + assert(util_is_power_of_two_nonzero(dev->model->tilebuffer_size)); - return dev->model->tilebuffer_size / 2; + return dev->model->tilebuffer_size / 2; } void panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev) { - dev->fd = fd; - dev->memctx = memctx; - dev->gpu_id = panfrost_query_gpu_version(fd); - dev->arch = pan_arch(dev->gpu_id); - dev->kernel_version = drmGetVersion(fd); - dev->revision = panfrost_query_gpu_revision(fd); - dev->model = panfrost_get_model(dev->gpu_id); + dev->fd = fd; + dev->memctx = memctx; + dev->gpu_id = panfrost_query_gpu_version(fd); + dev->arch = pan_arch(dev->gpu_id); + dev->kernel_version = drmGetVersion(fd); + dev->revision = panfrost_query_gpu_revision(fd); + dev->model = panfrost_get_model(dev->gpu_id); - /* If we don't recognize the model, bail early */ - if (!dev->model) - return; + /* If we don't recognize the model, bail early */ + if (!dev->model) + return; - dev->core_count = panfrost_query_core_count(fd, &dev->core_id_range); - dev->thread_tls_alloc = panfrost_query_thread_tls_alloc(fd, dev->arch); - dev->optimal_tib_size = panfrost_query_optimal_tib_size(dev); - dev->compressed_formats = panfrost_query_compressed_formats(fd); - dev->tiler_features = panfrost_query_tiler_features(fd); - dev->has_afbc = panfrost_query_afbc(fd, dev->arch); + dev->core_count = panfrost_query_core_count(fd, &dev->core_id_range); + dev->thread_tls_alloc = panfrost_query_thread_tls_alloc(fd, dev->arch); + dev->optimal_tib_size = panfrost_query_optimal_tib_size(dev); + dev->compressed_formats = panfrost_query_compressed_formats(fd); + dev->tiler_features = panfrost_query_tiler_features(fd); + dev->has_afbc = panfrost_query_afbc(fd, dev->arch); - if (dev->arch <= 6) - dev->formats = panfrost_pipe_format_v6; - else if (dev->arch <= 7) - dev->formats = panfrost_pipe_format_v7; - else - dev->formats = panfrost_pipe_format_v9; + if (dev->arch <= 6) + dev->formats = panfrost_pipe_format_v6; + else if (dev->arch <= 7) + dev->formats = panfrost_pipe_format_v7; + else + dev->formats = panfrost_pipe_format_v9; - util_sparse_array_init(&dev->bo_map, sizeof(struct panfrost_bo), 512); + util_sparse_array_init(&dev->bo_map, sizeof(struct panfrost_bo), 512); - pthread_mutex_init(&dev->bo_cache.lock, NULL); - list_inithead(&dev->bo_cache.lru); + pthread_mutex_init(&dev->bo_cache.lock, NULL); + list_inithead(&dev->bo_cache.lru); - for (unsigned i = 0; i < ARRAY_SIZE(dev->bo_cache.buckets); ++i) - list_inithead(&dev->bo_cache.buckets[i]); + for (unsigned i = 0; i < 
ARRAY_SIZE(dev->bo_cache.buckets); ++i) + list_inithead(&dev->bo_cache.buckets[i]); - /* Initialize pandecode before we start allocating */ - if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) - pandecode_initialize(!(dev->debug & PAN_DBG_TRACE)); + /* Initialize pandecode before we start allocating */ + if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) + pandecode_initialize(!(dev->debug & PAN_DBG_TRACE)); - /* Tiler heap is internally required by the tiler, which can only be - * active for a single job chain at once, so a single heap can be - * shared across batches/contextes */ + /* Tiler heap is internally required by the tiler, which can only be + * active for a single job chain at once, so a single heap can be + * shared across batches/contextes */ - dev->tiler_heap = panfrost_bo_create(dev, 128 * 1024 * 1024, - PAN_BO_INVISIBLE | PAN_BO_GROWABLE, "Tiler heap"); + dev->tiler_heap = panfrost_bo_create( + dev, 128 * 1024 * 1024, PAN_BO_INVISIBLE | PAN_BO_GROWABLE, "Tiler heap"); - pthread_mutex_init(&dev->submit_lock, NULL); + pthread_mutex_init(&dev->submit_lock, NULL); - /* Done once on init */ - panfrost_upload_sample_positions(dev); + /* Done once on init */ + panfrost_upload_sample_positions(dev); } void panfrost_close_device(struct panfrost_device *dev) { - /* If we don't recognize the model, the rest of the device won't exist, - * we will have early-exited the device open. - */ - if (dev->model) { - pthread_mutex_destroy(&dev->submit_lock); - panfrost_bo_unreference(dev->tiler_heap); - panfrost_bo_cache_evict_all(dev); - pthread_mutex_destroy(&dev->bo_cache.lock); - util_sparse_array_finish(&dev->bo_map); - } + /* If we don't recognize the model, the rest of the device won't exist, + * we will have early-exited the device open. + */ + if (dev->model) { + pthread_mutex_destroy(&dev->submit_lock); + panfrost_bo_unreference(dev->tiler_heap); + panfrost_bo_cache_evict_all(dev); + pthread_mutex_destroy(&dev->bo_cache.lock); + util_sparse_array_finish(&dev->bo_map); + } - drmFreeVersion(dev->kernel_version); - close(dev->fd); + drmFreeVersion(dev->kernel_version); + close(dev->fd); } diff --git a/src/panfrost/lib/pan_samples.c b/src/panfrost/lib/pan_samples.c index 2a9de8798cf..18205f2e3e6 100644 --- a/src/panfrost/lib/pan_samples.c +++ b/src/panfrost/lib/pan_samples.c @@ -21,8 +21,8 @@ * SOFTWARE. */ -#include "pan_device.h" #include "pan_bo.h" +#include "pan_device.h" /* Sample positions are specified partially in hardware, partially in software * on Mali. On Midgard, sample positions are completely fixed but need to be @@ -40,25 +40,25 @@ */ struct mali_sample_position { - uint16_t x, y; + uint16_t x, y; } __attribute__((packed)); struct mali_sample_positions { - struct mali_sample_position positions[32]; - struct mali_sample_position origin; - struct mali_sample_position padding[64 - (32 + 1)]; + struct mali_sample_position positions[32]; + struct mali_sample_position origin; + struct mali_sample_position padding[64 - (32 + 1)]; } __attribute__((packed)); /* SAMPLE16 constructs a single sample in terms of 1/16's of the grid, centered * at the origin. SAMPLE4/8 swap the units for legibility. 
*/ -#define SAMPLE16(x, y) { \ - (((x) + 8) * (256 / 16)), \ - (((y) + 8) * (256 / 16)) \ -} +#define SAMPLE16(x, y) \ + { \ + (((x) + 8) * (256 / 16)), (((y) + 8) * (256 / 16)) \ + } -#define SAMPLE8(x, y) SAMPLE16((x) * 2, (y) * 2) -#define SAMPLE4(x, y) SAMPLE16((x) * 4, (y) * 4) +#define SAMPLE8(x, y) SAMPLE16((x)*2, (y)*2) +#define SAMPLE4(x, y) SAMPLE16((x)*4, (y)*4) /* clang-format off */ const struct mali_sample_positions sample_position_lut[] = { @@ -129,34 +129,32 @@ const struct mali_sample_positions sample_position_lut[] = { mali_ptr panfrost_sample_positions(const struct panfrost_device *dev, - enum mali_sample_pattern pattern) + enum mali_sample_pattern pattern) { - assert(pattern < ARRAY_SIZE(sample_position_lut)); - unsigned offset = (pattern * sizeof(sample_position_lut[0])); - return dev->sample_positions->ptr.gpu + offset; + assert(pattern < ARRAY_SIZE(sample_position_lut)); + unsigned offset = (pattern * sizeof(sample_position_lut[0])); + return dev->sample_positions->ptr.gpu + offset; } void panfrost_upload_sample_positions(struct panfrost_device *dev) { - STATIC_ASSERT(sizeof(sample_position_lut) < 4096); - dev->sample_positions = panfrost_bo_create(dev, 4096, 0, "Sample positions"); + STATIC_ASSERT(sizeof(sample_position_lut) < 4096); + dev->sample_positions = panfrost_bo_create(dev, 4096, 0, "Sample positions"); - memcpy(dev->sample_positions->ptr.cpu, sample_position_lut, - sizeof(sample_position_lut)); + memcpy(dev->sample_positions->ptr.cpu, sample_position_lut, + sizeof(sample_position_lut)); } /* CPU side LUT query, to implement glGetMultisamplefv */ void -panfrost_query_sample_position( - enum mali_sample_pattern pattern, - unsigned sample_idx, - float *out) +panfrost_query_sample_position(enum mali_sample_pattern pattern, + unsigned sample_idx, float *out) { - struct mali_sample_position pos = - sample_position_lut[pattern].positions[sample_idx]; + struct mali_sample_position pos = + sample_position_lut[pattern].positions[sample_idx]; - out[0] = DECODE_FIXED_16(pos.x); - out[1] = DECODE_FIXED_16(pos.y); + out[0] = DECODE_FIXED_16(pos.x); + out[1] = DECODE_FIXED_16(pos.y); } diff --git a/src/panfrost/lib/pan_scoreboard.h b/src/panfrost/lib/pan_scoreboard.h index f6476c66651..4cd4c46fb48 100644 --- a/src/panfrost/lib/pan_scoreboard.h +++ b/src/panfrost/lib/pan_scoreboard.h @@ -31,27 +31,27 @@ #include "pan_pool.h" struct pan_scoreboard { - /* The first job in the batch */ - mali_ptr first_job; + /* The first job in the batch */ + mali_ptr first_job; - /* The number of jobs in the primary batch, essentially */ - unsigned job_index; + /* The number of jobs in the primary batch, essentially */ + unsigned job_index; - /* A CPU-side pointer to the previous job for next_job linking */ - struct mali_job_header_packed *prev_job; + /* A CPU-side pointer to the previous job for next_job linking */ + struct mali_job_header_packed *prev_job; - /* A CPU-side pointer to the first tiler job for dep updates when - * injecting a reload tiler job. - */ - struct mali_job_header_packed *first_tiler; - uint32_t first_tiler_dep1; + /* A CPU-side pointer to the first tiler job for dep updates when + * injecting a reload tiler job. + */ + struct mali_job_header_packed *first_tiler; + uint32_t first_tiler_dep1; - /* The dependency for tiler jobs (i.e. the index of the last emitted - * tiler job, or zero if none have been emitted) */ - unsigned tiler_dep; + /* The dependency for tiler jobs (i.e. 
the index of the last emitted + * tiler job, or zero if none have been emitted) */ + unsigned tiler_dep; - /* The job index of the WRITE_VALUE job (before it has been created) */ - unsigned write_value_index; + /* The job index of the WRITE_VALUE job (before it has been created) */ + unsigned write_value_index; }; #ifdef PAN_ARCH @@ -132,16 +132,16 @@ static bool panfrost_job_uses_tiling(enum mali_job_type type) { #if PAN_ARCH >= 9 - if (type == MALI_JOB_TYPE_MALLOC_VERTEX) - return true; + if (type == MALI_JOB_TYPE_MALLOC_VERTEX) + return true; #endif #if PAN_ARCH >= 6 - if (type == MALI_JOB_TYPE_INDEXED_VERTEX) - return true; + if (type == MALI_JOB_TYPE_INDEXED_VERTEX) + return true; #endif - return (type == MALI_JOB_TYPE_TILER); + return (type == MALI_JOB_TYPE_TILER); } /* Generates, uploads, and queues a a new job. All fields are written in order @@ -154,83 +154,80 @@ panfrost_job_uses_tiling(enum mali_job_type type) * not wallpapering and set this, dragons will eat you. */ static inline unsigned -panfrost_add_job(struct pan_pool *pool, - struct pan_scoreboard *scoreboard, - enum mali_job_type type, - bool barrier, bool suppress_prefetch, +panfrost_add_job(struct pan_pool *pool, struct pan_scoreboard *scoreboard, + enum mali_job_type type, bool barrier, bool suppress_prefetch, unsigned local_dep, unsigned global_dep, - const struct panfrost_ptr *job, - bool inject) + const struct panfrost_ptr *job, bool inject) { - if (panfrost_job_uses_tiling(type)) { - /* Tiler jobs must be chained, and on Midgard, the first tiler - * job must depend on the write value job, whose index we - * reserve now */ + if (panfrost_job_uses_tiling(type)) { + /* Tiler jobs must be chained, and on Midgard, the first tiler + * job must depend on the write value job, whose index we + * reserve now */ - if (PAN_ARCH <= 5 && !scoreboard->write_value_index) - scoreboard->write_value_index = ++scoreboard->job_index; + if (PAN_ARCH <= 5 && !scoreboard->write_value_index) + scoreboard->write_value_index = ++scoreboard->job_index; - if (scoreboard->tiler_dep && !inject) - global_dep = scoreboard->tiler_dep; - else if (PAN_ARCH <= 5) - global_dep = scoreboard->write_value_index; - } + if (scoreboard->tiler_dep && !inject) + global_dep = scoreboard->tiler_dep; + else if (PAN_ARCH <= 5) + global_dep = scoreboard->write_value_index; + } - /* Assign the index */ - unsigned index = ++scoreboard->job_index; + /* Assign the index */ + unsigned index = ++scoreboard->job_index; - pan_pack(job->cpu, JOB_HEADER, header) { - header.type = type; - header.barrier = barrier; - header.suppress_prefetch = suppress_prefetch; - header.index = index; - header.dependency_1 = local_dep; - header.dependency_2 = global_dep; + pan_pack(job->cpu, JOB_HEADER, header) { + header.type = type; + header.barrier = barrier; + header.suppress_prefetch = suppress_prefetch; + header.index = index; + header.dependency_1 = local_dep; + header.dependency_2 = global_dep; - if (inject) - header.next = scoreboard->first_job; - } + if (inject) + header.next = scoreboard->first_job; + } - if (inject) { - assert(type == MALI_JOB_TYPE_TILER && "only for blit shaders"); + if (inject) { + assert(type == MALI_JOB_TYPE_TILER && "only for blit shaders"); - if (scoreboard->first_tiler) { - /* Manual update of the dep2 field. This is bad, - * don't copy this pattern. - */ - scoreboard->first_tiler->opaque[5] = - scoreboard->first_tiler_dep1 | (index << 16); - } + if (scoreboard->first_tiler) { + /* Manual update of the dep2 field. This is bad, + * don't copy this pattern. 
+ */ + scoreboard->first_tiler->opaque[5] = + scoreboard->first_tiler_dep1 | (index << 16); + } - scoreboard->first_tiler = (void *)job->cpu; - scoreboard->first_tiler_dep1 = local_dep; - scoreboard->first_job = job->gpu; - return index; - } + scoreboard->first_tiler = (void *)job->cpu; + scoreboard->first_tiler_dep1 = local_dep; + scoreboard->first_job = job->gpu; + return index; + } - /* Form a chain */ - if (panfrost_job_uses_tiling(type)) { - if (!scoreboard->first_tiler) { - scoreboard->first_tiler = (void *)job->cpu; - scoreboard->first_tiler_dep1 = local_dep; - } - scoreboard->tiler_dep = index; - } + /* Form a chain */ + if (panfrost_job_uses_tiling(type)) { + if (!scoreboard->first_tiler) { + scoreboard->first_tiler = (void *)job->cpu; + scoreboard->first_tiler_dep1 = local_dep; + } + scoreboard->tiler_dep = index; + } - if (scoreboard->prev_job) { - /* Manual update of the next pointer. This is bad, don't copy - * this pattern. - * TODO: Find a way to defer last job header emission until we - * have a new job to queue or the batch is ready for execution. - */ - scoreboard->prev_job->opaque[6] = job->gpu; - scoreboard->prev_job->opaque[7] = job->gpu >> 32; - } else { - scoreboard->first_job = job->gpu; - } + if (scoreboard->prev_job) { + /* Manual update of the next pointer. This is bad, don't copy + * this pattern. + * TODO: Find a way to defer last job header emission until we + * have a new job to queue or the batch is ready for execution. + */ + scoreboard->prev_job->opaque[6] = job->gpu; + scoreboard->prev_job->opaque[7] = job->gpu >> 32; + } else { + scoreboard->first_job = job->gpu; + } - scoreboard->prev_job = (struct mali_job_header_packed *)job->cpu; - return index; + scoreboard->prev_job = (struct mali_job_header_packed *)job->cpu; + return index; } /* Generates a write value job, used to initialize the tiler structures. Note @@ -241,30 +238,30 @@ panfrost_scoreboard_initialize_tiler(struct pan_pool *pool, struct pan_scoreboard *scoreboard, mali_ptr polygon_list) { - struct panfrost_ptr transfer = { 0 }; + struct panfrost_ptr transfer = {0}; - /* Check if we even need tiling */ - if (PAN_ARCH >= 6 || !scoreboard->first_tiler) - return transfer; + /* Check if we even need tiling */ + if (PAN_ARCH >= 6 || !scoreboard->first_tiler) + return transfer; - /* Okay, we do. Let's generate it. We'll need the job's polygon list - * regardless of size. */ + /* Okay, we do. Let's generate it. We'll need the job's polygon list + * regardless of size. 
*/ - transfer = pan_pool_alloc_desc(pool, WRITE_VALUE_JOB); + transfer = pan_pool_alloc_desc(pool, WRITE_VALUE_JOB); - pan_section_pack(transfer.cpu, WRITE_VALUE_JOB, HEADER, header) { - header.type = MALI_JOB_TYPE_WRITE_VALUE; - header.index = scoreboard->write_value_index; - header.next = scoreboard->first_job; - } + pan_section_pack(transfer.cpu, WRITE_VALUE_JOB, HEADER, header) { + header.type = MALI_JOB_TYPE_WRITE_VALUE; + header.index = scoreboard->write_value_index; + header.next = scoreboard->first_job; + } - pan_section_pack(transfer.cpu, WRITE_VALUE_JOB, PAYLOAD, payload) { - payload.address = polygon_list; - payload.type = MALI_WRITE_VALUE_TYPE_ZERO; - } + pan_section_pack(transfer.cpu, WRITE_VALUE_JOB, PAYLOAD, payload) { + payload.address = polygon_list; + payload.type = MALI_WRITE_VALUE_TYPE_ZERO; + } - scoreboard->first_job = transfer.gpu; - return transfer; + scoreboard->first_job = transfer.gpu; + return transfer; } #endif /* PAN_ARCH */ diff --git a/src/panfrost/lib/pan_scratch.c b/src/panfrost/lib/pan_scratch.c index 91d8bd65564..9e687ba173e 100644 --- a/src/panfrost/lib/pan_scratch.c +++ b/src/panfrost/lib/pan_scratch.c @@ -24,8 +24,8 @@ * Alyssa Rosenzweig */ -#include "util/u_math.h" #include "util/macros.h" +#include "util/u_math.h" #include "pan_encoder.h" /* Midgard has a small register file, so shaders with high register pressure @@ -66,22 +66,21 @@ unsigned panfrost_get_stack_shift(unsigned stack_size) { - if (stack_size) - return util_logbase2_ceil(DIV_ROUND_UP(stack_size, 16)); - else - return 0; + if (stack_size) + return util_logbase2_ceil(DIV_ROUND_UP(stack_size, 16)); + else + return 0; } /* Computes the aligned stack size given the shift and thread count. */ unsigned -panfrost_get_total_stack_size( - unsigned thread_size, - unsigned threads_per_core, - unsigned core_id_range) +panfrost_get_total_stack_size(unsigned thread_size, unsigned threads_per_core, + unsigned core_id_range) { - unsigned size_per_thread = (thread_size == 0) ? 0 : - util_next_power_of_two(ALIGN_POT(thread_size, 16)); + unsigned size_per_thread = + (thread_size == 0) ? 0 + : util_next_power_of_two(ALIGN_POT(thread_size, 16)); - return size_per_thread * threads_per_core * core_id_range; + return size_per_thread * threads_per_core * core_id_range; } diff --git a/src/panfrost/lib/pan_shader.c b/src/panfrost/lib/pan_shader.c index 73c00befe76..b956183ce7b 100644 --- a/src/panfrost/lib/pan_shader.c +++ b/src/panfrost/lib/pan_shader.c @@ -22,8 +22,8 @@ * SOFTWARE. 
*/ -#include "pan_device.h" #include "pan_shader.h" +#include "pan_device.h" #include "pan_format.h" #if PAN_ARCH <= 5 @@ -36,9 +36,9 @@ const nir_shader_compiler_options * GENX(pan_shader_get_compiler_options)(void) { #if PAN_ARCH >= 6 - return &bifrost_nir_options; + return &bifrost_nir_options; #else - return &midgard_nir_options; + return &midgard_nir_options; #endif } @@ -46,177 +46,172 @@ GENX(pan_shader_get_compiler_options)(void) static enum mali_register_file_format bifrost_blend_type_from_nir(nir_alu_type nir_type) { - switch(nir_type) { - case 0: /* Render target not in use */ - return 0; - case nir_type_float16: - return MALI_REGISTER_FILE_FORMAT_F16; - case nir_type_float32: - return MALI_REGISTER_FILE_FORMAT_F32; - case nir_type_int32: - return MALI_REGISTER_FILE_FORMAT_I32; - case nir_type_uint32: - return MALI_REGISTER_FILE_FORMAT_U32; - case nir_type_int16: - return MALI_REGISTER_FILE_FORMAT_I16; - case nir_type_uint16: - return MALI_REGISTER_FILE_FORMAT_U16; - default: - unreachable("Unsupported blend shader type for NIR alu type"); - return 0; - } + switch (nir_type) { + case 0: /* Render target not in use */ + return 0; + case nir_type_float16: + return MALI_REGISTER_FILE_FORMAT_F16; + case nir_type_float32: + return MALI_REGISTER_FILE_FORMAT_F32; + case nir_type_int32: + return MALI_REGISTER_FILE_FORMAT_I32; + case nir_type_uint32: + return MALI_REGISTER_FILE_FORMAT_U32; + case nir_type_int16: + return MALI_REGISTER_FILE_FORMAT_I16; + case nir_type_uint16: + return MALI_REGISTER_FILE_FORMAT_U16; + default: + unreachable("Unsupported blend shader type for NIR alu type"); + return 0; + } } #if PAN_ARCH <= 7 enum mali_register_file_format GENX(pan_fixup_blend_type)(nir_alu_type T_size, enum pipe_format format) { - const struct util_format_description *desc = util_format_description(format); - unsigned size = nir_alu_type_get_type_size(T_size); - nir_alu_type T_format = pan_unpacked_type_for_format(desc); - nir_alu_type T = nir_alu_type_get_base_type(T_format) | size; + const struct util_format_description *desc = util_format_description(format); + unsigned size = nir_alu_type_get_type_size(T_size); + nir_alu_type T_format = pan_unpacked_type_for_format(desc); + nir_alu_type T = nir_alu_type_get_base_type(T_format) | size; - return bifrost_blend_type_from_nir(T); + return bifrost_blend_type_from_nir(T); } #endif #endif void -GENX(pan_shader_compile)(nir_shader *s, - struct panfrost_compile_inputs *inputs, +GENX(pan_shader_compile)(nir_shader *s, struct panfrost_compile_inputs *inputs, struct util_dynarray *binary, struct pan_shader_info *info) { - memset(info, 0, sizeof(*info)); + memset(info, 0, sizeof(*info)); #if PAN_ARCH >= 6 - bifrost_compile_shader_nir(s, inputs, binary, info); + bifrost_compile_shader_nir(s, inputs, binary, info); #else - for (unsigned i = 0; i < ARRAY_SIZE(inputs->rt_formats); i++) { - enum pipe_format fmt = inputs->rt_formats[i]; - unsigned wb_fmt = panfrost_blendable_formats_v6[fmt].writeback; + for (unsigned i = 0; i < ARRAY_SIZE(inputs->rt_formats); i++) { + enum pipe_format fmt = inputs->rt_formats[i]; + unsigned wb_fmt = panfrost_blendable_formats_v6[fmt].writeback; - if (wb_fmt < MALI_COLOR_FORMAT_R8) - inputs->raw_fmt_mask |= BITFIELD_BIT(i); - } + if (wb_fmt < MALI_COLOR_FORMAT_R8) + inputs->raw_fmt_mask |= BITFIELD_BIT(i); + } - midgard_compile_shader_nir(s, inputs, binary, info); + midgard_compile_shader_nir(s, inputs, binary, info); #endif - info->stage = s->info.stage; - info->contains_barrier = s->info.uses_memory_barrier || - 
s->info.uses_control_barrier; - info->separable = s->info.separate_shader; + info->stage = s->info.stage; + info->contains_barrier = + s->info.uses_memory_barrier || s->info.uses_control_barrier; + info->separable = s->info.separate_shader; - switch (info->stage) { - case MESA_SHADER_VERTEX: - info->attributes_read = s->info.inputs_read; - info->attributes_read_count = util_bitcount64(info->attributes_read); - info->attribute_count = info->attributes_read_count; + switch (info->stage) { + case MESA_SHADER_VERTEX: + info->attributes_read = s->info.inputs_read; + info->attributes_read_count = util_bitcount64(info->attributes_read); + info->attribute_count = info->attributes_read_count; #if PAN_ARCH <= 5 - bool vertex_id = BITSET_TEST(s->info.system_values_read, - SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); - if (vertex_id) - info->attribute_count = MAX2(info->attribute_count, PAN_VERTEX_ID + 1); + bool vertex_id = BITSET_TEST(s->info.system_values_read, + SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); + if (vertex_id) + info->attribute_count = MAX2(info->attribute_count, PAN_VERTEX_ID + 1); - bool instance_id = BITSET_TEST(s->info.system_values_read, - SYSTEM_VALUE_INSTANCE_ID); - if (instance_id) - info->attribute_count = MAX2(info->attribute_count, PAN_INSTANCE_ID + 1); + bool instance_id = + BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID); + if (instance_id) + info->attribute_count = + MAX2(info->attribute_count, PAN_INSTANCE_ID + 1); #endif - info->vs.writes_point_size = - s->info.outputs_written & (1 << VARYING_SLOT_PSIZ); + info->vs.writes_point_size = + s->info.outputs_written & (1 << VARYING_SLOT_PSIZ); #if PAN_ARCH >= 9 - info->varyings.output_count = - util_last_bit(s->info.outputs_written >> VARYING_SLOT_VAR0); + info->varyings.output_count = + util_last_bit(s->info.outputs_written >> VARYING_SLOT_VAR0); #endif - break; - case MESA_SHADER_FRAGMENT: - if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) - info->fs.writes_depth = true; - if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) - info->fs.writes_stencil = true; - if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) - info->fs.writes_coverage = true; + break; + case MESA_SHADER_FRAGMENT: + if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) + info->fs.writes_depth = true; + if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) + info->fs.writes_stencil = true; + if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) + info->fs.writes_coverage = true; - info->fs.outputs_read = s->info.outputs_read >> FRAG_RESULT_DATA0; - info->fs.outputs_written = s->info.outputs_written >> FRAG_RESULT_DATA0; - info->fs.sample_shading = s->info.fs.uses_sample_shading; - info->fs.untyped_color_outputs = s->info.fs.untyped_color_outputs; + info->fs.outputs_read = s->info.outputs_read >> FRAG_RESULT_DATA0; + info->fs.outputs_written = s->info.outputs_written >> FRAG_RESULT_DATA0; + info->fs.sample_shading = s->info.fs.uses_sample_shading; + info->fs.untyped_color_outputs = s->info.fs.untyped_color_outputs; - info->fs.can_discard = s->info.fs.uses_discard; - info->fs.early_fragment_tests = s->info.fs.early_fragment_tests; + info->fs.can_discard = s->info.fs.uses_discard; + info->fs.early_fragment_tests = s->info.fs.early_fragment_tests; - /* List of reasons we need to execute frag shaders when things - * are masked off */ + /* List of reasons we need to execute frag shaders when things + * are masked off */ - info->fs.sidefx = s->info.writes_memory || - 
s->info.fs.uses_discard || - s->info.fs.uses_demote; + info->fs.sidefx = s->info.writes_memory || s->info.fs.uses_discard || + s->info.fs.uses_demote; - /* With suitable ZSA/blend, is early-z possible? */ - info->fs.can_early_z = - !info->fs.sidefx && - !info->fs.writes_depth && - !info->fs.writes_stencil && - !info->fs.writes_coverage; + /* With suitable ZSA/blend, is early-z possible? */ + info->fs.can_early_z = !info->fs.sidefx && !info->fs.writes_depth && + !info->fs.writes_stencil && + !info->fs.writes_coverage; - /* Similiarly with suitable state, is FPK possible? */ - info->fs.can_fpk = - !info->fs.writes_depth && - !info->fs.writes_stencil && - !info->fs.writes_coverage && - !info->fs.can_discard && - !info->fs.outputs_read; + /* Similiarly with suitable state, is FPK possible? */ + info->fs.can_fpk = !info->fs.writes_depth && !info->fs.writes_stencil && + !info->fs.writes_coverage && !info->fs.can_discard && + !info->fs.outputs_read; - /* Requires the same hardware guarantees, so grouped as one bit - * in the hardware. - */ - info->contains_barrier |= s->info.fs.needs_quad_helper_invocations; + /* Requires the same hardware guarantees, so grouped as one bit + * in the hardware. + */ + info->contains_barrier |= s->info.fs.needs_quad_helper_invocations; - info->fs.reads_frag_coord = - (s->info.inputs_read & (1 << VARYING_SLOT_POS)) || - BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD); - info->fs.reads_point_coord = - s->info.inputs_read & (1 << VARYING_SLOT_PNTC); - info->fs.reads_face = - (s->info.inputs_read & (1 << VARYING_SLOT_FACE)) || - BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRONT_FACE); + info->fs.reads_frag_coord = + (s->info.inputs_read & (1 << VARYING_SLOT_POS)) || + BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD); + info->fs.reads_point_coord = + s->info.inputs_read & (1 << VARYING_SLOT_PNTC); + info->fs.reads_face = + (s->info.inputs_read & (1 << VARYING_SLOT_FACE)) || + BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRONT_FACE); #if PAN_ARCH >= 9 - info->varyings.output_count = - util_last_bit(s->info.outputs_read >> VARYING_SLOT_VAR0); + info->varyings.output_count = + util_last_bit(s->info.outputs_read >> VARYING_SLOT_VAR0); #endif - break; - default: - /* Everything else treated as compute */ - info->wls_size = s->info.shared_size; - break; - } + break; + default: + /* Everything else treated as compute */ + info->wls_size = s->info.shared_size; + break; + } - info->outputs_written = s->info.outputs_written; + info->outputs_written = s->info.outputs_written; - /* Sysvals have dedicated UBO */ - info->ubo_count = s->info.num_ubos; - if (info->sysvals.sysval_count && inputs->fixed_sysval_ubo < 0) - info->ubo_count++; + /* Sysvals have dedicated UBO */ + info->ubo_count = s->info.num_ubos; + if (info->sysvals.sysval_count && inputs->fixed_sysval_ubo < 0) + info->ubo_count++; - info->attribute_count += BITSET_LAST_BIT(s->info.images_used); - info->writes_global = s->info.writes_memory; + info->attribute_count += BITSET_LAST_BIT(s->info.images_used); + info->writes_global = s->info.writes_memory; - info->sampler_count = info->texture_count = BITSET_LAST_BIT(s->info.textures_used); + info->sampler_count = info->texture_count = + BITSET_LAST_BIT(s->info.textures_used); - unsigned execution_mode = s->info.float_controls_execution_mode; - info->ftz_fp16 = nir_is_denorm_flush_to_zero(execution_mode, 16); - info->ftz_fp32 = nir_is_denorm_flush_to_zero(execution_mode, 32); + unsigned execution_mode = 
s->info.float_controls_execution_mode; + info->ftz_fp16 = nir_is_denorm_flush_to_zero(execution_mode, 16); + info->ftz_fp32 = nir_is_denorm_flush_to_zero(execution_mode, 32); #if PAN_ARCH >= 6 - /* This is "redundant" information, but is needed in a draw-time hot path */ - for (unsigned i = 0; i < ARRAY_SIZE(info->bifrost.blend); ++i) { - info->bifrost.blend[i].format = - bifrost_blend_type_from_nir(info->bifrost.blend[i].type); - } + /* This is "redundant" information, but is needed in a draw-time hot path */ + for (unsigned i = 0; i < ARRAY_SIZE(info->bifrost.blend); ++i) { + info->bifrost.blend[i].format = + bifrost_blend_type_from_nir(info->bifrost.blend[i].type); + } #endif } diff --git a/src/panfrost/lib/pan_shader.h b/src/panfrost/lib/pan_shader.h index 223f52e4af4..406db3d37ce 100644 --- a/src/panfrost/lib/pan_shader.h +++ b/src/panfrost/lib/pan_shader.h @@ -29,38 +29,36 @@ #include "panfrost/util/pan_ir.h" #include "panfrost/util/pan_lower_framebuffer.h" -#include "pan_device.h" #include "genxml/gen_macros.h" +#include "pan_device.h" struct panfrost_device; #ifdef PAN_ARCH -const nir_shader_compiler_options * -GENX(pan_shader_get_compiler_options)(void); +const nir_shader_compiler_options *GENX(pan_shader_get_compiler_options)(void); -void -GENX(pan_shader_compile)(nir_shader *nir, - struct panfrost_compile_inputs *inputs, - struct util_dynarray *binary, - struct pan_shader_info *info); +void GENX(pan_shader_compile)(nir_shader *nir, + struct panfrost_compile_inputs *inputs, + struct util_dynarray *binary, + struct pan_shader_info *info); #if PAN_ARCH >= 6 && PAN_ARCH <= 7 enum mali_register_file_format -GENX(pan_fixup_blend_type)(nir_alu_type T_size, enum pipe_format format); + GENX(pan_fixup_blend_type)(nir_alu_type T_size, enum pipe_format format); #endif #if PAN_ARCH >= 9 static inline enum mali_shader_stage pan_shader_stage(const struct pan_shader_info *info) { - switch (info->stage) { - case MESA_SHADER_VERTEX: - return MALI_SHADER_STAGE_VERTEX; - case MESA_SHADER_FRAGMENT: - return MALI_SHADER_STAGE_FRAGMENT; - default: - return MALI_SHADER_STAGE_COMPUTE; - } + switch (info->stage) { + case MESA_SHADER_VERTEX: + return MALI_SHADER_STAGE_VERTEX; + case MESA_SHADER_FRAGMENT: + return MALI_SHADER_STAGE_FRAGMENT; + default: + return MALI_SHADER_STAGE_COMPUTE; + } } #endif @@ -68,17 +66,17 @@ pan_shader_stage(const struct pan_shader_info *info) static inline enum mali_shader_register_allocation pan_register_allocation(unsigned work_reg_count) { - return (work_reg_count <= 32) ? - MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD : - MALI_SHADER_REGISTER_ALLOCATION_64_PER_THREAD; + return (work_reg_count <= 32) + ? MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD + : MALI_SHADER_REGISTER_ALLOCATION_64_PER_THREAD; } #endif static inline enum mali_depth_source pan_depth_source(const struct pan_shader_info *info) { - return info->fs.writes_depth ? MALI_DEPTH_SOURCE_SHADER : - MALI_DEPTH_SOURCE_FIXED_FUNCTION; + return info->fs.writes_depth ? 
MALI_DEPTH_SOURCE_SHADER + : MALI_DEPTH_SOURCE_FIXED_FUNCTION; } #if PAN_ARCH <= 7 @@ -87,24 +85,22 @@ static inline void pan_shader_prepare_midgard_rsd(const struct pan_shader_info *info, struct MALI_RENDERER_STATE *rsd) { - assert((info->push.count & 3) == 0); + assert((info->push.count & 3) == 0); - rsd->properties.uniform_count = info->push.count / 4; - rsd->properties.shader_has_side_effects = info->writes_global; - rsd->properties.fp_mode = MALI_FP_MODE_GL_INF_NAN_ALLOWED; + rsd->properties.uniform_count = info->push.count / 4; + rsd->properties.shader_has_side_effects = info->writes_global; + rsd->properties.fp_mode = MALI_FP_MODE_GL_INF_NAN_ALLOWED; - /* For fragment shaders, work register count, early-z, reads at draw-time */ + /* For fragment shaders, work register count, early-z, reads at draw-time */ - if (info->stage != MESA_SHADER_FRAGMENT) { - rsd->properties.work_register_count = info->work_reg_count; - } else { - rsd->properties.shader_reads_tilebuffer = - info->fs.outputs_read; + if (info->stage != MESA_SHADER_FRAGMENT) { + rsd->properties.work_register_count = info->work_reg_count; + } else { + rsd->properties.shader_reads_tilebuffer = info->fs.outputs_read; - /* However, forcing early-z in the shader overrides draw-time */ - rsd->properties.force_early_z = - info->fs.early_fragment_tests; - } + /* However, forcing early-z in the shader overrides draw-time */ + rsd->properties.force_early_z = info->fs.early_fragment_tests; + } } #else @@ -112,37 +108,36 @@ pan_shader_prepare_midgard_rsd(const struct pan_shader_info *info, #define pan_preloads(reg) (preload & BITFIELD64_BIT(reg)) static void -pan_make_preload(gl_shader_stage stage, - uint64_t preload, +pan_make_preload(gl_shader_stage stage, uint64_t preload, struct MALI_PRELOAD *out) { - switch (stage) { - case MESA_SHADER_VERTEX: - out->vertex.position_result_address_lo = pan_preloads(58); - out->vertex.position_result_address_hi = pan_preloads(59); - out->vertex.vertex_id = pan_preloads(61); - out->vertex.instance_id = pan_preloads(62); - break; + switch (stage) { + case MESA_SHADER_VERTEX: + out->vertex.position_result_address_lo = pan_preloads(58); + out->vertex.position_result_address_hi = pan_preloads(59); + out->vertex.vertex_id = pan_preloads(61); + out->vertex.instance_id = pan_preloads(62); + break; - case MESA_SHADER_FRAGMENT: - out->fragment.primitive_id = pan_preloads(57); - out->fragment.primitive_flags = pan_preloads(58); - out->fragment.fragment_position = pan_preloads(59); - out->fragment.sample_mask_id = pan_preloads(61); - out->fragment.coverage = true; - break; + case MESA_SHADER_FRAGMENT: + out->fragment.primitive_id = pan_preloads(57); + out->fragment.primitive_flags = pan_preloads(58); + out->fragment.fragment_position = pan_preloads(59); + out->fragment.sample_mask_id = pan_preloads(61); + out->fragment.coverage = true; + break; - default: - out->compute.local_invocation_xy = pan_preloads(55); - out->compute.local_invocation_z = pan_preloads(56); - out->compute.work_group_x = pan_preloads(57); - out->compute.work_group_y = pan_preloads(58); - out->compute.work_group_z = pan_preloads(59); - out->compute.global_invocation_x = pan_preloads(60); - out->compute.global_invocation_y = pan_preloads(61); - out->compute.global_invocation_z = pan_preloads(62); - break; - } + default: + out->compute.local_invocation_xy = pan_preloads(55); + out->compute.local_invocation_z = pan_preloads(56); + out->compute.work_group_x = pan_preloads(57); + out->compute.work_group_y = pan_preloads(58); + 
out->compute.work_group_z = pan_preloads(59); + out->compute.global_invocation_x = pan_preloads(60); + out->compute.global_invocation_y = pan_preloads(61); + out->compute.global_invocation_z = pan_preloads(62); + break; + } } #if PAN_ARCH == 7 @@ -150,25 +145,25 @@ static inline void pan_pack_message_preload(struct MALI_MESSAGE_PRELOAD *cfg, const struct bifrost_message_preload *msg) { - enum mali_message_preload_register_format regfmt = msg->fp16 ? - MALI_MESSAGE_PRELOAD_REGISTER_FORMAT_F16 : - MALI_MESSAGE_PRELOAD_REGISTER_FORMAT_F32; + enum mali_message_preload_register_format regfmt = + msg->fp16 ? MALI_MESSAGE_PRELOAD_REGISTER_FORMAT_F16 + : MALI_MESSAGE_PRELOAD_REGISTER_FORMAT_F32; - if (msg->enabled && msg->texture) { - cfg->type = MALI_MESSAGE_TYPE_VAR_TEX; - cfg->var_tex.varying_index = msg->varying_index; - cfg->var_tex.texture_index = msg->texture_index; - cfg->var_tex.register_format = regfmt; - cfg->var_tex.skip = msg->skip; - cfg->var_tex.zero_lod = msg->zero_lod; - } else if (msg->enabled) { - cfg->type = MALI_MESSAGE_TYPE_LD_VAR; - cfg->ld_var.varying_index = msg->varying_index; - cfg->ld_var.register_format = regfmt; - cfg->ld_var.num_components = msg->num_components; - } else { - cfg->type = MALI_MESSAGE_TYPE_DISABLED; - } + if (msg->enabled && msg->texture) { + cfg->type = MALI_MESSAGE_TYPE_VAR_TEX; + cfg->var_tex.varying_index = msg->varying_index; + cfg->var_tex.texture_index = msg->texture_index; + cfg->var_tex.register_format = regfmt; + cfg->var_tex.skip = msg->skip; + cfg->var_tex.zero_lod = msg->zero_lod; + } else if (msg->enabled) { + cfg->type = MALI_MESSAGE_TYPE_LD_VAR; + cfg->ld_var.varying_index = msg->varying_index; + cfg->ld_var.register_format = regfmt; + cfg->ld_var.num_components = msg->num_components; + } else { + cfg->type = MALI_MESSAGE_TYPE_DISABLED; + } } #endif @@ -176,81 +171,79 @@ static inline void pan_shader_prepare_bifrost_rsd(const struct pan_shader_info *info, struct MALI_RENDERER_STATE *rsd) { - unsigned fau_count = DIV_ROUND_UP(info->push.count, 2); - rsd->preload.uniform_count = fau_count; + unsigned fau_count = DIV_ROUND_UP(info->push.count, 2); + rsd->preload.uniform_count = fau_count; #if PAN_ARCH >= 7 - rsd->properties.shader_register_allocation = - pan_register_allocation(info->work_reg_count); + rsd->properties.shader_register_allocation = + pan_register_allocation(info->work_reg_count); #endif - pan_make_preload(info->stage, info->preload, &rsd->preload); + pan_make_preload(info->stage, info->preload, &rsd->preload); - if (info->stage == MESA_SHADER_FRAGMENT) { - rsd->properties.shader_modifies_coverage = - info->fs.writes_coverage || info->fs.can_discard; + if (info->stage == MESA_SHADER_FRAGMENT) { + rsd->properties.shader_modifies_coverage = + info->fs.writes_coverage || info->fs.can_discard; - rsd->properties.allow_forward_pixel_to_be_killed = - !info->writes_global; + rsd->properties.allow_forward_pixel_to_be_killed = !info->writes_global; #if PAN_ARCH >= 7 - rsd->properties.shader_wait_dependency_6 = info->bifrost.wait_6; - rsd->properties.shader_wait_dependency_7 = info->bifrost.wait_7; + rsd->properties.shader_wait_dependency_6 = info->bifrost.wait_6; + rsd->properties.shader_wait_dependency_7 = info->bifrost.wait_7; - pan_pack_message_preload(&rsd->message_preload_1, &info->bifrost.messages[0]); - pan_pack_message_preload(&rsd->message_preload_2, &info->bifrost.messages[1]); + pan_pack_message_preload(&rsd->message_preload_1, + &info->bifrost.messages[0]); + pan_pack_message_preload(&rsd->message_preload_2, + 
&info->bifrost.messages[1]); #endif - } else if (info->stage == MESA_SHADER_VERTEX && info->vs.secondary_enable) { - rsd->secondary_preload.uniform_count = fau_count; + } else if (info->stage == MESA_SHADER_VERTEX && info->vs.secondary_enable) { + rsd->secondary_preload.uniform_count = fau_count; - pan_make_preload(info->stage, info->vs.secondary_preload, - &rsd->secondary_preload); + pan_make_preload(info->stage, info->vs.secondary_preload, + &rsd->secondary_preload); - rsd->secondary_shader = rsd->shader.shader + - info->vs.secondary_offset; + rsd->secondary_shader = rsd->shader.shader + info->vs.secondary_offset; #if PAN_ARCH >= 7 - rsd->properties.secondary_shader_register_allocation = - pan_register_allocation(info->vs.secondary_work_reg_count); + rsd->properties.secondary_shader_register_allocation = + pan_register_allocation(info->vs.secondary_work_reg_count); #endif - } + } } #endif static inline void pan_shader_prepare_rsd(const struct pan_shader_info *shader_info, - mali_ptr shader_ptr, - struct MALI_RENDERER_STATE *rsd) + mali_ptr shader_ptr, struct MALI_RENDERER_STATE *rsd) { #if PAN_ARCH <= 5 - shader_ptr |= shader_info->midgard.first_tag; + shader_ptr |= shader_info->midgard.first_tag; #endif - rsd->shader.shader = shader_ptr; - rsd->shader.attribute_count = shader_info->attribute_count; - rsd->shader.varying_count = shader_info->varyings.input_count + - shader_info->varyings.output_count; - rsd->shader.texture_count = shader_info->texture_count; - rsd->shader.sampler_count = shader_info->sampler_count; - rsd->properties.shader_contains_barrier = shader_info->contains_barrier; - rsd->properties.uniform_buffer_count = shader_info->ubo_count; + rsd->shader.shader = shader_ptr; + rsd->shader.attribute_count = shader_info->attribute_count; + rsd->shader.varying_count = + shader_info->varyings.input_count + shader_info->varyings.output_count; + rsd->shader.texture_count = shader_info->texture_count; + rsd->shader.sampler_count = shader_info->sampler_count; + rsd->properties.shader_contains_barrier = shader_info->contains_barrier; + rsd->properties.uniform_buffer_count = shader_info->ubo_count; - if (shader_info->stage == MESA_SHADER_FRAGMENT) { - rsd->properties.stencil_from_shader = - shader_info->fs.writes_stencil; - rsd->properties.depth_source = pan_depth_source(shader_info); + if (shader_info->stage == MESA_SHADER_FRAGMENT) { + rsd->properties.stencil_from_shader = shader_info->fs.writes_stencil; + rsd->properties.depth_source = pan_depth_source(shader_info); - /* This also needs to be set if the API forces per-sample - * shading, but that'll just got ORed in */ - rsd->multisample_misc.evaluate_per_sample = - shader_info->fs.sample_shading; - } + /* This also needs to be set if the API forces per-sample + * shading, but that'll just got ORed in */ + rsd->multisample_misc.evaluate_per_sample = + shader_info->fs.sample_shading; + } #if PAN_ARCH >= 6 - pan_shader_prepare_bifrost_rsd(shader_info, rsd); + pan_shader_prepare_bifrost_rsd(shader_info, rsd); #else - pan_shader_prepare_midgard_rsd(shader_info, rsd); + pan_shader_prepare_midgard_rsd(shader_info, rsd); #endif } #endif /* PAN_ARCH */ diff --git a/src/panfrost/lib/pan_texture.c b/src/panfrost/lib/pan_texture.c index 36e8039e118..19c52c98760 100644 --- a/src/panfrost/lib/pan_texture.c +++ b/src/panfrost/lib/pan_texture.c @@ -25,9 +25,9 @@ * */ +#include "pan_texture.h" #include "util/macros.h" #include "util/u_math.h" -#include "pan_texture.h" #if PAN_ARCH >= 5 /* @@ -38,27 +38,39 @@ static inline enum 
mali_astc_2d_dimension panfrost_astc_dim_2d(unsigned dim) { - switch (dim) { - case 4: return MALI_ASTC_2D_DIMENSION_4; - case 5: return MALI_ASTC_2D_DIMENSION_5; - case 6: return MALI_ASTC_2D_DIMENSION_6; - case 8: return MALI_ASTC_2D_DIMENSION_8; - case 10: return MALI_ASTC_2D_DIMENSION_10; - case 12: return MALI_ASTC_2D_DIMENSION_12; - default: unreachable("Invalid ASTC dimension"); - } + switch (dim) { + case 4: + return MALI_ASTC_2D_DIMENSION_4; + case 5: + return MALI_ASTC_2D_DIMENSION_5; + case 6: + return MALI_ASTC_2D_DIMENSION_6; + case 8: + return MALI_ASTC_2D_DIMENSION_8; + case 10: + return MALI_ASTC_2D_DIMENSION_10; + case 12: + return MALI_ASTC_2D_DIMENSION_12; + default: + unreachable("Invalid ASTC dimension"); + } } static inline enum mali_astc_3d_dimension panfrost_astc_dim_3d(unsigned dim) { - switch (dim) { - case 3: return MALI_ASTC_3D_DIMENSION_3; - case 4: return MALI_ASTC_3D_DIMENSION_4; - case 5: return MALI_ASTC_3D_DIMENSION_5; - case 6: return MALI_ASTC_3D_DIMENSION_6; - default: unreachable("Invalid ASTC dimension"); - } + switch (dim) { + case 3: + return MALI_ASTC_3D_DIMENSION_3; + case 4: + return MALI_ASTC_3D_DIMENSION_4; + case 5: + return MALI_ASTC_3D_DIMENSION_5; + case 6: + return MALI_ASTC_3D_DIMENSION_6; + default: + unreachable("Invalid ASTC dimension"); + } } #endif @@ -69,51 +81,50 @@ panfrost_astc_dim_3d(unsigned dim) static unsigned panfrost_compression_tag(const struct util_format_description *desc, - enum mali_texture_dimension dim, - uint64_t modifier) + enum mali_texture_dimension dim, uint64_t modifier) { #if PAN_ARCH >= 5 && PAN_ARCH <= 8 - if (drm_is_afbc(modifier)) { - unsigned flags = (modifier & AFBC_FORMAT_MOD_YTR) ? - MALI_AFBC_SURFACE_FLAG_YTR : 0; + if (drm_is_afbc(modifier)) { + unsigned flags = + (modifier & AFBC_FORMAT_MOD_YTR) ? MALI_AFBC_SURFACE_FLAG_YTR : 0; #if PAN_ARCH >= 6 - /* Prefetch enable */ - flags |= MALI_AFBC_SURFACE_FLAG_PREFETCH; + /* Prefetch enable */ + flags |= MALI_AFBC_SURFACE_FLAG_PREFETCH; - if (panfrost_afbc_is_wide(modifier)) - flags |= MALI_AFBC_SURFACE_FLAG_WIDE_BLOCK; + if (panfrost_afbc_is_wide(modifier)) + flags |= MALI_AFBC_SURFACE_FLAG_WIDE_BLOCK; #endif #if PAN_ARCH >= 7 - /* Tiled headers */ - if (modifier & AFBC_FORMAT_MOD_TILED) - flags |= MALI_AFBC_SURFACE_FLAG_TILED_HEADER; + /* Tiled headers */ + if (modifier & AFBC_FORMAT_MOD_TILED) + flags |= MALI_AFBC_SURFACE_FLAG_TILED_HEADER; - /* Used to make sure AFBC headers don't point outside the AFBC - * body. HW is using the AFBC surface stride to do this check, - * which doesn't work for 3D textures because the surface - * stride does not cover the body. Only supported on v7+. - */ - if (dim != MALI_TEXTURE_DIMENSION_3D) - flags |= MALI_AFBC_SURFACE_FLAG_CHECK_PAYLOAD_RANGE; + /* Used to make sure AFBC headers don't point outside the AFBC + * body. HW is using the AFBC surface stride to do this check, + * which doesn't work for 3D textures because the surface + * stride does not cover the body. Only supported on v7+. 
+ */ + if (dim != MALI_TEXTURE_DIMENSION_3D) + flags |= MALI_AFBC_SURFACE_FLAG_CHECK_PAYLOAD_RANGE; #endif - return flags; - } else if (desc->layout == UTIL_FORMAT_LAYOUT_ASTC) { - if (desc->block.depth > 1) { - return (panfrost_astc_dim_3d(desc->block.depth) << 4) | - (panfrost_astc_dim_3d(desc->block.height) << 2) | - panfrost_astc_dim_3d(desc->block.width); - } else { - return (panfrost_astc_dim_2d(desc->block.height) << 3) | - panfrost_astc_dim_2d(desc->block.width); - } - } + return flags; + } else if (desc->layout == UTIL_FORMAT_LAYOUT_ASTC) { + if (desc->block.depth > 1) { + return (panfrost_astc_dim_3d(desc->block.depth) << 4) | + (panfrost_astc_dim_3d(desc->block.height) << 2) | + panfrost_astc_dim_3d(desc->block.width); + } else { + return (panfrost_astc_dim_2d(desc->block.height) << 3) | + panfrost_astc_dim_2d(desc->block.width); + } + } #endif - /* Tags are not otherwise used */ - return 0; + /* Tags are not otherwise used */ + return 0; } /* Cubemaps have 6 faces as "layers" in between each actual layer. We @@ -121,38 +132,37 @@ panfrost_compression_tag(const struct util_format_description *desc, * can they happen, perhaps from cubemap arrays? */ static void -panfrost_adjust_cube_dimensions( - unsigned *first_face, unsigned *last_face, - unsigned *first_layer, unsigned *last_layer) +panfrost_adjust_cube_dimensions(unsigned *first_face, unsigned *last_face, + unsigned *first_layer, unsigned *last_layer) { - *first_face = *first_layer % 6; - *last_face = *last_layer % 6; - *first_layer /= 6; - *last_layer /= 6; + *first_face = *first_layer % 6; + *last_face = *last_layer % 6; + *first_layer /= 6; + *last_layer /= 6; - assert((*first_layer == *last_layer) || (*first_face == 0 && *last_face == 5)); + assert((*first_layer == *last_layer) || + (*first_face == 0 && *last_face == 5)); } /* Following the texture descriptor is a number of descriptors. How many? */ static unsigned -panfrost_texture_num_elements( - unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer, - unsigned nr_samples, bool is_cube) +panfrost_texture_num_elements(unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer, + unsigned nr_samples, bool is_cube) { - unsigned first_face = 0, last_face = 0; + unsigned first_face = 0, last_face = 0; - if (is_cube) { - panfrost_adjust_cube_dimensions(&first_face, &last_face, - &first_layer, &last_layer); - } + if (is_cube) { + panfrost_adjust_cube_dimensions(&first_face, &last_face, &first_layer, + &last_layer); + } - unsigned levels = 1 + last_level - first_level; - unsigned layers = 1 + last_layer - first_layer; - unsigned faces = 1 + last_face - first_face; + unsigned levels = 1 + last_level - first_level; + unsigned layers = 1 + last_layer - first_layer; + unsigned faces = 1 + last_face - first_face; - return levels * layers * faces * MAX2(nr_samples, 1); + return levels * layers * faces * MAX2(nr_samples, 1); } /* Conservative estimate of the size of the texture payload a priori. @@ -165,26 +175,25 @@ unsigned GENX(panfrost_estimate_texture_payload_size)(const struct pan_image_view *iview) { #if PAN_ARCH >= 9 - size_t element_size = pan_size(PLANE); + size_t element_size = pan_size(PLANE); #else - /* Assume worst case. Overestimates on Midgard, but that's ok. */ - size_t element_size = pan_size(SURFACE_WITH_STRIDE); + /* Assume worst case. Overestimates on Midgard, but that's ok. 
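As a sanity check on the element count above, the following is a standalone sketch (not driver code; the helper name is hypothetical) showing the level x layer x face x sample product for a typical case: a cube-map view spanning layers 0-5 with five mip levels and 4x MSAA collapses to one cube layer, so it needs 5 x 1 x 6 x 4 = 120 surface descriptors, which the worst-case payload estimate then multiplies by the descriptor size.

/* Hypothetical standalone sketch mirroring the level/layer/face/sample
 * product computed above; not part of the driver. */
#include <assert.h>

static unsigned
sketch_num_elements(unsigned first_level, unsigned last_level,
                    unsigned first_layer, unsigned last_layer,
                    unsigned nr_samples, unsigned faces)
{
   unsigned levels = 1 + last_level - first_level;
   unsigned layers = 1 + last_layer - first_layer;
   unsigned samples = nr_samples ? nr_samples : 1;

   return levels * layers * faces * samples;
}

int
main(void)
{
   /* Cube map: layers 0-5 collapse to one cube of 6 faces */
   assert(sketch_num_elements(0, 4, 0, 0, 4, 6) == 120);
   return 0;
}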
*/ + size_t element_size = pan_size(SURFACE_WITH_STRIDE); #endif - unsigned elements = - panfrost_texture_num_elements(iview->first_level, iview->last_level, - iview->first_layer, iview->last_layer, - iview->image->layout.nr_samples, - iview->dim == MALI_TEXTURE_DIMENSION_CUBE); + unsigned elements = panfrost_texture_num_elements( + iview->first_level, iview->last_level, iview->first_layer, + iview->last_layer, iview->image->layout.nr_samples, + iview->dim == MALI_TEXTURE_DIMENSION_CUBE); - return element_size * elements; + return element_size * elements; } struct panfrost_surface_iter { - unsigned layer, last_layer; - unsigned level, first_level, last_level; - unsigned face, first_face, last_face; - unsigned sample, first_sample, last_sample; + unsigned layer, last_layer; + unsigned level, first_level, last_level; + unsigned face, first_face, last_face; + unsigned sample, first_sample, last_sample; }; static void @@ -194,83 +203,81 @@ panfrost_surface_iter_begin(struct panfrost_surface_iter *iter, unsigned first_face, unsigned last_face, unsigned nr_samples) { - iter->layer = first_layer; - iter->last_layer = last_layer; - iter->level = iter->first_level = first_level; - iter->last_level = last_level; - iter->face = iter->first_face = first_face; - iter->last_face = last_face; - iter->sample = iter->first_sample = 0; - iter->last_sample = nr_samples - 1; + iter->layer = first_layer; + iter->last_layer = last_layer; + iter->level = iter->first_level = first_level; + iter->last_level = last_level; + iter->face = iter->first_face = first_face; + iter->last_face = last_face; + iter->sample = iter->first_sample = 0; + iter->last_sample = nr_samples - 1; } static bool panfrost_surface_iter_end(const struct panfrost_surface_iter *iter) { - return iter->layer > iter->last_layer; + return iter->layer > iter->last_layer; } static void panfrost_surface_iter_next(struct panfrost_surface_iter *iter) { -#define INC_TEST(field) \ - do { \ - if (iter->field++ < iter->last_ ## field) \ - return; \ - iter->field = iter->first_ ## field; \ - } while (0) +#define INC_TEST(field) \ + do { \ + if (iter->field++ < iter->last_##field) \ + return; \ + iter->field = iter->first_##field; \ + } while (0) - /* Ordering is different on v7: inner loop is iterating on levels */ - if (PAN_ARCH >= 7) - INC_TEST(level); + /* Ordering is different on v7: inner loop is iterating on levels */ + if (PAN_ARCH >= 7) + INC_TEST(level); - INC_TEST(sample); - INC_TEST(face); + INC_TEST(sample); + INC_TEST(face); - if (PAN_ARCH < 7) - INC_TEST(level); + if (PAN_ARCH < 7) + INC_TEST(level); - iter->layer++; + iter->layer++; #undef INC_TEST } static void -panfrost_get_surface_strides(const struct pan_image_layout *layout, - unsigned l, +panfrost_get_surface_strides(const struct pan_image_layout *layout, unsigned l, int32_t *row_stride, int32_t *surf_stride) { - const struct pan_image_slice_layout *slice = &layout->slices[l]; + const struct pan_image_slice_layout *slice = &layout->slices[l]; - if (drm_is_afbc(layout->modifier)) { - /* Pre v7 don't have a row stride field. This field is - * repurposed as a Y offset which we don't use */ - *row_stride = PAN_ARCH < 7 ? 0 : slice->row_stride; - *surf_stride = slice->afbc.surface_stride; - } else { - *row_stride = slice->row_stride; - *surf_stride = slice->surface_stride; - } + if (drm_is_afbc(layout->modifier)) { + /* Pre v7 don't have a row stride field. This field is + * repurposed as a Y offset which we don't use */ + *row_stride = PAN_ARCH < 7 ? 
0 : slice->row_stride; + *surf_stride = slice->afbc.surface_stride; + } else { + *row_stride = slice->row_stride; + *surf_stride = slice->surface_stride; + } } static mali_ptr panfrost_get_surface_pointer(const struct pan_image_layout *layout, - enum mali_texture_dimension dim, - mali_ptr base, + enum mali_texture_dimension dim, mali_ptr base, unsigned l, unsigned w, unsigned f, unsigned s) { - unsigned face_mult = dim == MALI_TEXTURE_DIMENSION_CUBE ? 6 : 1; - unsigned offset; + unsigned face_mult = dim == MALI_TEXTURE_DIMENSION_CUBE ? 6 : 1; + unsigned offset; - if (layout->dim == MALI_TEXTURE_DIMENSION_3D) { - assert(!f && !s); - offset = layout->slices[l].offset + - (w * panfrost_get_layer_stride(layout, l)); - } else { - offset = panfrost_texture_offset(layout, l, (w * face_mult) + f, s); - } + if (layout->dim == MALI_TEXTURE_DIMENSION_3D) { + assert(!f && !s); + offset = + layout->slices[l].offset + (w * panfrost_get_layer_stride(layout, l)); + } else { + offset = panfrost_texture_offset(layout, l, (w * face_mult) + f, s); + } - return base + offset; + return base + offset; } #if PAN_ARCH >= 9 @@ -323,185 +330,191 @@ static enum mali_clump_format special_clump_formats[PIPE_FORMAT_COUNT] = { static enum mali_clump_format panfrost_clump_format(enum pipe_format format) { - /* First, try a special clump format. Note that the 0 encoding is for a - * raw clump format, which will never be in the special table. - */ - if (special_clump_formats[format]) - return special_clump_formats[format]; + /* First, try a special clump format. Note that the 0 encoding is for a + * raw clump format, which will never be in the special table. + */ + if (special_clump_formats[format]) + return special_clump_formats[format]; - /* Else, it's a raw format. Raw formats must not be compressed. */ - assert(!util_format_is_compressed(format)); + /* Else, it's a raw format. Raw formats must not be compressed. */ + assert(!util_format_is_compressed(format)); - /* Select the appropriate raw format. */ - switch (util_format_get_blocksize(format)) { - case 1: return MALI_CLUMP_FORMAT_RAW8; - case 2: return MALI_CLUMP_FORMAT_RAW16; - case 3: return MALI_CLUMP_FORMAT_RAW24; - case 4: return MALI_CLUMP_FORMAT_RAW32; - case 6: return MALI_CLUMP_FORMAT_RAW48; - case 8: return MALI_CLUMP_FORMAT_RAW64; - case 12: return MALI_CLUMP_FORMAT_RAW96; - case 16: return MALI_CLUMP_FORMAT_RAW128; - default: unreachable("Invalid bpp"); - } + /* Select the appropriate raw format. 
*/ + switch (util_format_get_blocksize(format)) { + case 1: + return MALI_CLUMP_FORMAT_RAW8; + case 2: + return MALI_CLUMP_FORMAT_RAW16; + case 3: + return MALI_CLUMP_FORMAT_RAW24; + case 4: + return MALI_CLUMP_FORMAT_RAW32; + case 6: + return MALI_CLUMP_FORMAT_RAW48; + case 8: + return MALI_CLUMP_FORMAT_RAW64; + case 12: + return MALI_CLUMP_FORMAT_RAW96; + case 16: + return MALI_CLUMP_FORMAT_RAW128; + default: + unreachable("Invalid bpp"); + } } static enum mali_afbc_superblock_size translate_superblock_size(uint64_t modifier) { - assert(drm_is_afbc(modifier)); + assert(drm_is_afbc(modifier)); - switch (modifier & AFBC_FORMAT_MOD_BLOCK_SIZE_MASK) { - case AFBC_FORMAT_MOD_BLOCK_SIZE_16x16: - return MALI_AFBC_SUPERBLOCK_SIZE_16X16; - case AFBC_FORMAT_MOD_BLOCK_SIZE_32x8: - return MALI_AFBC_SUPERBLOCK_SIZE_32X8; - case AFBC_FORMAT_MOD_BLOCK_SIZE_64x4: - return MALI_AFBC_SUPERBLOCK_SIZE_64X4; - default: - unreachable("Invalid superblock size"); - } + switch (modifier & AFBC_FORMAT_MOD_BLOCK_SIZE_MASK) { + case AFBC_FORMAT_MOD_BLOCK_SIZE_16x16: + return MALI_AFBC_SUPERBLOCK_SIZE_16X16; + case AFBC_FORMAT_MOD_BLOCK_SIZE_32x8: + return MALI_AFBC_SUPERBLOCK_SIZE_32X8; + case AFBC_FORMAT_MOD_BLOCK_SIZE_64x4: + return MALI_AFBC_SUPERBLOCK_SIZE_64X4; + default: + unreachable("Invalid superblock size"); + } } static void panfrost_emit_plane(const struct pan_image_layout *layout, - enum pipe_format format, - mali_ptr pointer, - unsigned level, + enum pipe_format format, mali_ptr pointer, unsigned level, void *payload) { - const struct util_format_description *desc = - util_format_description(layout->format); + const struct util_format_description *desc = + util_format_description(layout->format); - int32_t row_stride, surface_stride; + int32_t row_stride, surface_stride; - panfrost_get_surface_strides(layout, level, &row_stride, &surface_stride); - assert(row_stride >= 0 && surface_stride >= 0 && "negative stride"); + panfrost_get_surface_strides(layout, level, &row_stride, &surface_stride); + assert(row_stride >= 0 && surface_stride >= 0 && "negative stride"); - bool afbc = drm_is_afbc(layout->modifier); + bool afbc = drm_is_afbc(layout->modifier); - pan_pack(payload, PLANE, cfg) { - cfg.pointer = pointer; - cfg.row_stride = row_stride; - cfg.size = layout->data_size - layout->slices[level].offset; + pan_pack(payload, PLANE, cfg) { + cfg.pointer = pointer; + cfg.row_stride = row_stride; + cfg.size = layout->data_size - layout->slices[level].offset; - cfg.slice_stride = layout->nr_samples ? - layout->slices[level].surface_stride : - panfrost_get_layer_stride(layout, level); + cfg.slice_stride = layout->nr_samples + ? 
layout->slices[level].surface_stride + : panfrost_get_layer_stride(layout, level); - if (desc->layout == UTIL_FORMAT_LAYOUT_ASTC) { - assert(!afbc); + if (desc->layout == UTIL_FORMAT_LAYOUT_ASTC) { + assert(!afbc); - if (desc->block.depth > 1) { - cfg.plane_type = MALI_PLANE_TYPE_ASTC_3D; - cfg.astc._3d.block_width = panfrost_astc_dim_3d(desc->block.width); - cfg.astc._3d.block_height = panfrost_astc_dim_3d(desc->block.height); - cfg.astc._3d.block_depth = panfrost_astc_dim_3d(desc->block.depth); - } else { - cfg.plane_type = MALI_PLANE_TYPE_ASTC_2D; - cfg.astc._2d.block_width = panfrost_astc_dim_2d(desc->block.width); - cfg.astc._2d.block_height = panfrost_astc_dim_2d(desc->block.height); - } + if (desc->block.depth > 1) { + cfg.plane_type = MALI_PLANE_TYPE_ASTC_3D; + cfg.astc._3d.block_width = panfrost_astc_dim_3d(desc->block.width); + cfg.astc._3d.block_height = + panfrost_astc_dim_3d(desc->block.height); + cfg.astc._3d.block_depth = panfrost_astc_dim_3d(desc->block.depth); + } else { + cfg.plane_type = MALI_PLANE_TYPE_ASTC_2D; + cfg.astc._2d.block_width = panfrost_astc_dim_2d(desc->block.width); + cfg.astc._2d.block_height = + panfrost_astc_dim_2d(desc->block.height); + } - bool srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB); + bool srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB); - /* Mesa does not advertise _HDR formats yet */ - cfg.astc.decode_hdr = false; + /* Mesa does not advertise _HDR formats yet */ + cfg.astc.decode_hdr = false; - /* sRGB formats decode to RGBA8 sRGB, which is narrow. - * - * Non-sRGB formats decode to RGBA16F which is wide. - * With a future extension, we could decode non-sRGB - * formats narrowly too, but this isn't wired up in Mesa - * yet. - */ - cfg.astc.decode_wide = !srgb; - } else if (afbc) { - cfg.plane_type = MALI_PLANE_TYPE_AFBC; - cfg.afbc.superblock_size = translate_superblock_size(layout->modifier); - cfg.afbc.ytr = (layout->modifier & AFBC_FORMAT_MOD_YTR); - cfg.afbc.tiled_header = (layout->modifier & AFBC_FORMAT_MOD_TILED); - cfg.afbc.prefetch = true; - cfg.afbc.compression_mode = pan_afbc_compression_mode(format); - cfg.afbc.header_stride = layout->slices[level].afbc.header_size; - } else { - cfg.plane_type = MALI_PLANE_TYPE_GENERIC; - cfg.clump_format = panfrost_clump_format(format); - } + /* sRGB formats decode to RGBA8 sRGB, which is narrow. + * + * Non-sRGB formats decode to RGBA16F which is wide. + * With a future extension, we could decode non-sRGB + * formats narrowly too, but this isn't wired up in Mesa + * yet. 
+ */ + cfg.astc.decode_wide = !srgb; + } else if (afbc) { + cfg.plane_type = MALI_PLANE_TYPE_AFBC; + cfg.afbc.superblock_size = translate_superblock_size(layout->modifier); + cfg.afbc.ytr = (layout->modifier & AFBC_FORMAT_MOD_YTR); + cfg.afbc.tiled_header = (layout->modifier & AFBC_FORMAT_MOD_TILED); + cfg.afbc.prefetch = true; + cfg.afbc.compression_mode = pan_afbc_compression_mode(format); + cfg.afbc.header_stride = layout->slices[level].afbc.header_size; + } else { + cfg.plane_type = MALI_PLANE_TYPE_GENERIC; + cfg.clump_format = panfrost_clump_format(format); + } - if (!afbc && layout->modifier == DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED) - cfg.clump_ordering = MALI_CLUMP_ORDERING_TILED_U_INTERLEAVED; - else if (!afbc) - cfg.clump_ordering = MALI_CLUMP_ORDERING_LINEAR; - } + if (!afbc && + layout->modifier == DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED) + cfg.clump_ordering = MALI_CLUMP_ORDERING_TILED_U_INTERLEAVED; + else if (!afbc) + cfg.clump_ordering = MALI_CLUMP_ORDERING_LINEAR; + } } #endif static void panfrost_emit_texture_payload(const struct pan_image_view *iview, - enum pipe_format format, - void *payload) + enum pipe_format format, void *payload) { - const struct pan_image_layout *layout = &iview->image->layout; - ASSERTED const struct util_format_description *desc = - util_format_description(format); + const struct pan_image_layout *layout = &iview->image->layout; + ASSERTED const struct util_format_description *desc = + util_format_description(format); - mali_ptr base = iview->image->data.bo->ptr.gpu + iview->image->data.offset; + mali_ptr base = iview->image->data.bo->ptr.gpu + iview->image->data.offset; - if (iview->buf.size) { - assert (iview->dim == MALI_TEXTURE_DIMENSION_1D); - base += iview->buf.offset; - } + if (iview->buf.size) { + assert(iview->dim == MALI_TEXTURE_DIMENSION_1D); + base += iview->buf.offset; + } - /* panfrost_compression_tag() wants the dimension of the resource, not the - * one of the image view (those might differ). - */ - base |= panfrost_compression_tag(desc, layout->dim, layout->modifier); + /* panfrost_compression_tag() wants the dimension of the resource, not the + * one of the image view (those might differ). + */ + base |= panfrost_compression_tag(desc, layout->dim, layout->modifier); - /* v4 does not support compression */ - assert(PAN_ARCH >= 5 || !drm_is_afbc(layout->modifier)); - assert(PAN_ARCH >= 5 || desc->layout != UTIL_FORMAT_LAYOUT_ASTC); + /* v4 does not support compression */ + assert(PAN_ARCH >= 5 || !drm_is_afbc(layout->modifier)); + assert(PAN_ARCH >= 5 || desc->layout != UTIL_FORMAT_LAYOUT_ASTC); - /* Inject the addresses in, interleaving array indices, mip levels, - * cube faces, and strides in that order. On Bifrost and older, each - * sample had its own surface descriptor; on Valhall, they are fused - * into a single plane descriptor. - */ + /* Inject the addresses in, interleaving array indices, mip levels, + * cube faces, and strides in that order. On Bifrost and older, each + * sample had its own surface descriptor; on Valhall, they are fused + * into a single plane descriptor. + */ - unsigned first_layer = iview->first_layer, last_layer = iview->last_layer; - unsigned nr_samples = PAN_ARCH <= 7 ? layout->nr_samples : 1; - unsigned first_face = 0, last_face = 0; + unsigned first_layer = iview->first_layer, last_layer = iview->last_layer; + unsigned nr_samples = PAN_ARCH <= 7 ? 
layout->nr_samples : 1; + unsigned first_face = 0, last_face = 0; - if (iview->dim == MALI_TEXTURE_DIMENSION_CUBE) { - panfrost_adjust_cube_dimensions(&first_face, &last_face, - &first_layer, &last_layer); - } + if (iview->dim == MALI_TEXTURE_DIMENSION_CUBE) { + panfrost_adjust_cube_dimensions(&first_face, &last_face, &first_layer, + &last_layer); + } - struct panfrost_surface_iter iter; + struct panfrost_surface_iter iter; - for (panfrost_surface_iter_begin(&iter, first_layer, last_layer, - iview->first_level, iview->last_level, - first_face, last_face, nr_samples); - !panfrost_surface_iter_end(&iter); - panfrost_surface_iter_next(&iter)) { - mali_ptr pointer = - panfrost_get_surface_pointer(layout, iview->dim, base, - iter.level, iter.layer, - iter.face, iter.sample); + for (panfrost_surface_iter_begin(&iter, first_layer, last_layer, + iview->first_level, iview->last_level, + first_face, last_face, nr_samples); + !panfrost_surface_iter_end(&iter); panfrost_surface_iter_next(&iter)) { + mali_ptr pointer = + panfrost_get_surface_pointer(layout, iview->dim, base, iter.level, + iter.layer, iter.face, iter.sample); #if PAN_ARCH >= 9 - panfrost_emit_plane(layout, format, pointer, iter.level, payload); - payload += pan_size(PLANE); + panfrost_emit_plane(layout, format, pointer, iter.level, payload); + payload += pan_size(PLANE); #else - pan_pack(payload, SURFACE_WITH_STRIDE, cfg) { - cfg.pointer = pointer; - panfrost_get_surface_strides(layout, iter.level, - &cfg.row_stride, - &cfg.surface_stride); - } - payload += pan_size(SURFACE_WITH_STRIDE); + pan_pack(payload, SURFACE_WITH_STRIDE, cfg) { + cfg.pointer = pointer; + panfrost_get_surface_strides(layout, iter.level, &cfg.row_stride, + &cfg.surface_stride); + } + payload += pan_size(SURFACE_WITH_STRIDE); #endif - } + } } #if PAN_ARCH <= 7 @@ -510,14 +523,14 @@ panfrost_emit_texture_payload(const struct pan_image_view *iview, static enum mali_texture_layout panfrost_modifier_to_layout(uint64_t modifier) { - if (drm_is_afbc(modifier)) - return MALI_TEXTURE_LAYOUT_AFBC; - else if (modifier == DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED) - return MALI_TEXTURE_LAYOUT_TILED; - else if (modifier == DRM_FORMAT_MOD_LINEAR) - return MALI_TEXTURE_LAYOUT_LINEAR; - else - unreachable("Invalid modifer"); + if (drm_is_afbc(modifier)) + return MALI_TEXTURE_LAYOUT_AFBC; + else if (modifier == DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED) + return MALI_TEXTURE_LAYOUT_TILED; + else if (modifier == DRM_FORMAT_MOD_LINEAR) + return MALI_TEXTURE_LAYOUT_LINEAR; + else + unreachable("Invalid modifer"); } #endif @@ -532,103 +545,99 @@ panfrost_modifier_to_layout(uint64_t modifier) */ void GENX(panfrost_new_texture)(const struct panfrost_device *dev, - const struct pan_image_view *iview, - void *out, const struct panfrost_ptr *payload) + const struct pan_image_view *iview, void *out, + const struct panfrost_ptr *payload) { - const struct pan_image_layout *layout = &iview->image->layout; - enum pipe_format format = iview->format; - uint32_t mali_format = dev->formats[format].hw; - unsigned char swizzle[4]; + const struct pan_image_layout *layout = &iview->image->layout; + enum pipe_format format = iview->format; + uint32_t mali_format = dev->formats[format].hw; + unsigned char swizzle[4]; - if (PAN_ARCH >= 7 && util_format_is_depth_or_stencil(format)) { - /* v7+ doesn't have an _RRRR component order, combine the - * user swizzle with a .XXXX swizzle to emulate that. 
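To illustrate the composition rule described here, below is a minimal standalone sketch; the compose() helper is a local stand-in following dst[i] = outer[inner[i]] for component selects and is not the Mesa utility itself. Whatever component the view swizzle selects collapses to X, while the 0/1 selectors pass through, which emulates the missing _RRRR component order.

/* Minimal standalone sketch of composing a .XXXX swizzle with a view
 * swizzle; local helper, not the Mesa util_format_compose_swizzles(). */
#include <stdio.h>

enum swz { SWZ_X, SWZ_Y, SWZ_Z, SWZ_W, SWZ_0, SWZ_1 };

static void
compose(const unsigned char outer[4], const unsigned char inner[4],
        unsigned char out[4])
{
   for (int i = 0; i < 4; i++)
      out[i] = inner[i] <= SWZ_W ? outer[inner[i]] : inner[i];
}

int
main(void)
{
   const char *names = "XYZW01";
   const unsigned char replicate_x[4] = {SWZ_X, SWZ_X, SWZ_X, SWZ_X};
   /* Hypothetical view swizzle: select G, then constants */
   const unsigned char view[4] = {SWZ_Y, SWZ_0, SWZ_0, SWZ_1};
   unsigned char out[4];

   compose(replicate_x, view, out);

   for (int i = 0; i < 4; i++)
      putchar(names[out[i]]); /* prints X001: component selects collapse to X */
   putchar('\n');
   return 0;
}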
- */ - static const unsigned char replicate_x[4] = { - PIPE_SWIZZLE_X, PIPE_SWIZZLE_X, - PIPE_SWIZZLE_X, PIPE_SWIZZLE_X, - }; + if (PAN_ARCH >= 7 && util_format_is_depth_or_stencil(format)) { + /* v7+ doesn't have an _RRRR component order, combine the + * user swizzle with a .XXXX swizzle to emulate that. + */ + static const unsigned char replicate_x[4] = { + PIPE_SWIZZLE_X, + PIPE_SWIZZLE_X, + PIPE_SWIZZLE_X, + PIPE_SWIZZLE_X, + }; - util_format_compose_swizzles(replicate_x, - iview->swizzle, - swizzle); - } else if (PAN_ARCH == 7) { + util_format_compose_swizzles(replicate_x, iview->swizzle, swizzle); + } else if (PAN_ARCH == 7) { #if PAN_ARCH == 7 - /* v7 (only) restricts component orders when AFBC is in use. - * Rather than restrict AFBC, we use an allowed component order - * with an invertible swizzle composed. - */ - enum mali_rgb_component_order orig = - mali_format & BITFIELD_MASK(12); - struct pan_decomposed_swizzle decomposed = - GENX(pan_decompose_swizzle)(orig); + /* v7 (only) restricts component orders when AFBC is in use. + * Rather than restrict AFBC, we use an allowed component order + * with an invertible swizzle composed. + */ + enum mali_rgb_component_order orig = mali_format & BITFIELD_MASK(12); + struct pan_decomposed_swizzle decomposed = + GENX(pan_decompose_swizzle)(orig); - /* Apply the new component order */ - mali_format = (mali_format & ~orig) | decomposed.pre; + /* Apply the new component order */ + mali_format = (mali_format & ~orig) | decomposed.pre; - /* Compose the new swizzle */ - util_format_compose_swizzles(decomposed.post, iview->swizzle, - swizzle); + /* Compose the new swizzle */ + util_format_compose_swizzles(decomposed.post, iview->swizzle, swizzle); #endif - } else { - STATIC_ASSERT(sizeof(swizzle) == sizeof(iview->swizzle)); - memcpy(swizzle, iview->swizzle, sizeof(swizzle)); - } + } else { + STATIC_ASSERT(sizeof(swizzle) == sizeof(iview->swizzle)); + memcpy(swizzle, iview->swizzle, sizeof(swizzle)); + } - panfrost_emit_texture_payload(iview, format, payload->cpu); + panfrost_emit_texture_payload(iview, format, payload->cpu); - unsigned array_size = iview->last_layer - iview->first_layer + 1; + unsigned array_size = iview->last_layer - iview->first_layer + 1; - if (iview->dim == MALI_TEXTURE_DIMENSION_CUBE) { - assert(iview->first_layer % 6 == 0); - assert(iview->last_layer % 6 == 5); - array_size /= 6; - } + if (iview->dim == MALI_TEXTURE_DIMENSION_CUBE) { + assert(iview->first_layer % 6 == 0); + assert(iview->last_layer % 6 == 5); + array_size /= 6; + } - unsigned width; + unsigned width; - if (iview->buf.size) { - assert(iview->dim == MALI_TEXTURE_DIMENSION_1D); - assert(!iview->first_level && !iview->last_level); - assert(!iview->first_layer && !iview->last_layer); - assert(layout->nr_samples == 1); - assert(layout->height == 1 && layout->depth == 1); - assert(iview->buf.offset + iview->buf.size <= layout->width); - width = iview->buf.size; - } else { - width = u_minify(layout->width, iview->first_level); - } + if (iview->buf.size) { + assert(iview->dim == MALI_TEXTURE_DIMENSION_1D); + assert(!iview->first_level && !iview->last_level); + assert(!iview->first_layer && !iview->last_layer); + assert(layout->nr_samples == 1); + assert(layout->height == 1 && layout->depth == 1); + assert(iview->buf.offset + iview->buf.size <= layout->width); + width = iview->buf.size; + } else { + width = u_minify(layout->width, iview->first_level); + } - pan_pack(out, TEXTURE, cfg) { - cfg.dimension = iview->dim; - cfg.format = mali_format; - cfg.width = width; - 
cfg.height = u_minify(layout->height, iview->first_level); - if (iview->dim == MALI_TEXTURE_DIMENSION_3D) - cfg.depth = u_minify(layout->depth, iview->first_level); - else - cfg.sample_count = layout->nr_samples; - cfg.swizzle = panfrost_translate_swizzle_4(swizzle); + pan_pack(out, TEXTURE, cfg) { + cfg.dimension = iview->dim; + cfg.format = mali_format; + cfg.width = width; + cfg.height = u_minify(layout->height, iview->first_level); + if (iview->dim == MALI_TEXTURE_DIMENSION_3D) + cfg.depth = u_minify(layout->depth, iview->first_level); + else + cfg.sample_count = layout->nr_samples; + cfg.swizzle = panfrost_translate_swizzle_4(swizzle); #if PAN_ARCH >= 9 - cfg.texel_interleave = - (layout->modifier != DRM_FORMAT_MOD_LINEAR) || - util_format_is_compressed(format); + cfg.texel_interleave = (layout->modifier != DRM_FORMAT_MOD_LINEAR) || + util_format_is_compressed(format); #else - cfg.texel_ordering = - panfrost_modifier_to_layout(layout->modifier); + cfg.texel_ordering = panfrost_modifier_to_layout(layout->modifier); #endif - cfg.levels = iview->last_level - iview->first_level + 1; - cfg.array_size = array_size; + cfg.levels = iview->last_level - iview->first_level + 1; + cfg.array_size = array_size; #if PAN_ARCH >= 6 - cfg.surfaces = payload->gpu; + cfg.surfaces = payload->gpu; - /* We specify API-level LOD clamps in the sampler descriptor - * and use these clamps simply for bounds checking */ - cfg.minimum_lod = FIXED_16(0, false); - cfg.maximum_lod = FIXED_16(cfg.levels - 1, false); + /* We specify API-level LOD clamps in the sampler descriptor + * and use these clamps simply for bounds checking */ + cfg.minimum_lod = FIXED_16(0, false); + cfg.maximum_lod = FIXED_16(cfg.levels - 1, false); #else - cfg.manual_stride = true; + cfg.manual_stride = true; #endif - } + } } diff --git a/src/panfrost/lib/pan_texture.h b/src/panfrost/lib/pan_texture.h index e6768ef80cf..a5b391e7afd 100644 --- a/src/panfrost/lib/pan_texture.h +++ b/src/panfrost/lib/pan_texture.h @@ -31,14 +31,14 @@ #include "genxml/gen_macros.h" #include -#include "drm-uapi/drm_fourcc.h" -#include "util/format/u_format.h" #include "compiler/shader_enums.h" +#include "drm-uapi/drm_fourcc.h" #include "genxml/gen_macros.h" +#include "util/format/u_format.h" #include "pan_bo.h" #include "pan_device.h" -#include "pan_util.h" #include "pan_format.h" +#include "pan_util.h" #ifdef __cplusplus extern "C" { @@ -48,104 +48,101 @@ extern "C" { extern uint64_t pan_best_modifiers[PAN_MODIFIER_COUNT]; struct pan_image_slice_layout { - unsigned offset; + unsigned offset; - /* For AFBC images, the number of bytes between two rows of AFBC - * headers. - * - * For non-AFBC images, the number of bytes between two rows of texels. - * For linear images, this will equal the logical stride. For - * images that are compressed or interleaved, this will be greater than - * the logical stride. - */ - unsigned row_stride; + /* For AFBC images, the number of bytes between two rows of AFBC + * headers. + * + * For non-AFBC images, the number of bytes between two rows of texels. + * For linear images, this will equal the logical stride. For + * images that are compressed or interleaved, this will be greater than + * the logical stride. 
+ */ + unsigned row_stride; - unsigned surface_stride; + unsigned surface_stride; - struct { - /* Size of the AFBC header preceding each slice */ - unsigned header_size; + struct { + /* Size of the AFBC header preceding each slice */ + unsigned header_size; - /* Size of the AFBC body */ - unsigned body_size; + /* Size of the AFBC body */ + unsigned body_size; - /* Stride between AFBC headers of two consecutive surfaces. - * For 3D textures, this must be set to header size since - * AFBC headers are allocated together, for 2D arrays this - * should be set to size0, since AFBC headers are placed at - * the beginning of each layer - */ - unsigned surface_stride; - } afbc; + /* Stride between AFBC headers of two consecutive surfaces. + * For 3D textures, this must be set to header size since + * AFBC headers are allocated together, for 2D arrays this + * should be set to size0, since AFBC headers are placed at + * the beginning of each layer + */ + unsigned surface_stride; + } afbc; - /* If checksumming is enabled following the slice, what - * is its offset/stride? */ - struct { - unsigned offset; - unsigned stride; - unsigned size; - } crc; + /* If checksumming is enabled following the slice, what + * is its offset/stride? */ + struct { + unsigned offset; + unsigned stride; + unsigned size; + } crc; - unsigned size; + unsigned size; }; struct pan_image_layout { - uint64_t modifier; - enum pipe_format format; - unsigned width, height, depth; - unsigned nr_samples; - enum mali_texture_dimension dim; - unsigned nr_slices; - unsigned array_size; - bool crc; + uint64_t modifier; + enum pipe_format format; + unsigned width, height, depth; + unsigned nr_samples; + enum mali_texture_dimension dim; + unsigned nr_slices; + unsigned array_size; + bool crc; - /* The remaining fields may be derived from the above by calling - * pan_image_layout_init - */ + /* The remaining fields may be derived from the above by calling + * pan_image_layout_init + */ - struct pan_image_slice_layout slices[MAX_MIP_LEVELS]; + struct pan_image_slice_layout slices[MAX_MIP_LEVELS]; - unsigned data_size; - unsigned array_stride; + unsigned data_size; + unsigned array_stride; }; struct pan_image_mem { - struct panfrost_bo *bo; - unsigned offset; + struct panfrost_bo *bo; + unsigned offset; }; struct pan_image { - struct pan_image_mem data; - struct pan_image_layout layout; + struct pan_image_mem data; + struct pan_image_layout layout; }; struct pan_image_view { - /* Format, dimension and sample count of the view might differ from - * those of the image (2D view of a 3D image surface for instance). - */ - enum pipe_format format; - enum mali_texture_dimension dim; - unsigned first_level, last_level; - unsigned first_layer, last_layer; - unsigned char swizzle[4]; - const struct pan_image *image; + /* Format, dimension and sample count of the view might differ from + * those of the image (2D view of a 3D image surface for instance). + */ + enum pipe_format format; + enum mali_texture_dimension dim; + unsigned first_level, last_level; + unsigned first_layer, last_layer; + unsigned char swizzle[4]; + const struct pan_image *image; - /* If EXT_multisampled_render_to_texture is used, this may be - * greater than image->layout.nr_samples. */ - unsigned nr_samples; + /* If EXT_multisampled_render_to_texture is used, this may be + * greater than image->layout.nr_samples. 
*/ + unsigned nr_samples; - /* Only valid if dim == 1D, needed to implement buffer views */ - struct { - unsigned offset; - unsigned size; - } buf; + /* Only valid if dim == 1D, needed to implement buffer views */ + struct { + unsigned offset; + unsigned size; + } buf; }; -unsigned -panfrost_compute_checksum_size( - struct pan_image_slice_layout *slice, - unsigned width, - unsigned height); +unsigned panfrost_compute_checksum_size(struct pan_image_slice_layout *slice, + unsigned width, unsigned height); /* AFBC format mode. The ordering is intended to match the Valhall hardware enum * ("AFBC Compression Mode"), but this enum is required in software on older @@ -153,46 +150,42 @@ panfrost_compute_checksum_size( * unify these code paths. */ enum pan_afbc_mode { - PAN_AFBC_MODE_R8, - PAN_AFBC_MODE_R8G8, - PAN_AFBC_MODE_R5G6B5, - PAN_AFBC_MODE_R4G4B4A4, - PAN_AFBC_MODE_R5G5B5A1, - PAN_AFBC_MODE_R8G8B8, - PAN_AFBC_MODE_R8G8B8A8, - PAN_AFBC_MODE_R10G10B10A2, - PAN_AFBC_MODE_R11G11B10, - PAN_AFBC_MODE_S8, + PAN_AFBC_MODE_R8, + PAN_AFBC_MODE_R8G8, + PAN_AFBC_MODE_R5G6B5, + PAN_AFBC_MODE_R4G4B4A4, + PAN_AFBC_MODE_R5G5B5A1, + PAN_AFBC_MODE_R8G8B8, + PAN_AFBC_MODE_R8G8B8A8, + PAN_AFBC_MODE_R10G10B10A2, + PAN_AFBC_MODE_R11G11B10, + PAN_AFBC_MODE_S8, - /* Sentintel signalling a format that cannot be compressed */ - PAN_AFBC_MODE_INVALID + /* Sentintel signalling a format that cannot be compressed */ + PAN_AFBC_MODE_INVALID }; -bool -panfrost_format_supports_afbc(const struct panfrost_device *dev, - enum pipe_format format); +bool panfrost_format_supports_afbc(const struct panfrost_device *dev, + enum pipe_format format); -enum pan_afbc_mode -panfrost_afbc_format(unsigned arch, enum pipe_format format); +enum pan_afbc_mode panfrost_afbc_format(unsigned arch, enum pipe_format format); #define AFBC_HEADER_BYTES_PER_TILE 16 -bool -panfrost_afbc_can_ytr(enum pipe_format format); +bool panfrost_afbc_can_ytr(enum pipe_format format); -bool -panfrost_afbc_can_tile(const struct panfrost_device *dev); +bool panfrost_afbc_can_tile(const struct panfrost_device *dev); /* * Represents the block size of a single plane. For AFBC, this represents the * superblock size. For u-interleaving, this represents the tile size. 
*/ struct pan_block_size { - /** Width of block */ - unsigned width; + /** Width of block */ + unsigned width; - /** Height of blocks */ - unsigned height; + /** Height of blocks */ + unsigned height; }; struct pan_block_size panfrost_afbc_superblock_size(uint64_t modifier); @@ -207,71 +200,63 @@ uint32_t pan_afbc_row_stride(uint64_t modifier, uint32_t width); uint32_t pan_afbc_stride_blocks(uint64_t modifier, uint32_t row_stride_bytes); -struct pan_block_size -panfrost_block_size(uint64_t modifier, enum pipe_format format); +struct pan_block_size panfrost_block_size(uint64_t modifier, + enum pipe_format format); #ifdef PAN_ARCH -unsigned -GENX(panfrost_estimate_texture_payload_size)(const struct pan_image_view *iview); +unsigned GENX(panfrost_estimate_texture_payload_size)( + const struct pan_image_view *iview); -void -GENX(panfrost_new_texture)(const struct panfrost_device *dev, - const struct pan_image_view *iview, - void *out, - const struct panfrost_ptr *payload); +void GENX(panfrost_new_texture)(const struct panfrost_device *dev, + const struct pan_image_view *iview, void *out, + const struct panfrost_ptr *payload); #endif -unsigned -panfrost_get_layer_stride(const struct pan_image_layout *layout, - unsigned level); +unsigned panfrost_get_layer_stride(const struct pan_image_layout *layout, + unsigned level); -unsigned -panfrost_texture_offset(const struct pan_image_layout *layout, - unsigned level, unsigned array_idx, - unsigned surface_idx); +unsigned panfrost_texture_offset(const struct pan_image_layout *layout, + unsigned level, unsigned array_idx, + unsigned surface_idx); struct pan_pool; struct pan_scoreboard; /* DRM modifier helper */ -#define drm_is_afbc(mod) \ - ((mod >> 52) == (DRM_FORMAT_MOD_ARM_TYPE_AFBC | \ - (DRM_FORMAT_MOD_VENDOR_ARM << 4))) +#define drm_is_afbc(mod) \ + ((mod >> 52) == \ + (DRM_FORMAT_MOD_ARM_TYPE_AFBC | (DRM_FORMAT_MOD_VENDOR_ARM << 4))) struct pan_image_explicit_layout { - unsigned offset; - unsigned row_stride; + unsigned offset; + unsigned row_stride; }; bool pan_image_layout_init(struct pan_image_layout *layout, const struct pan_image_explicit_layout *explicit_layout); -unsigned -panfrost_get_legacy_stride(const struct pan_image_layout *layout, - unsigned level); +unsigned panfrost_get_legacy_stride(const struct pan_image_layout *layout, + unsigned level); -unsigned -panfrost_from_legacy_stride(unsigned legacy_stride, - enum pipe_format format, - uint64_t modifier); +unsigned panfrost_from_legacy_stride(unsigned legacy_stride, + enum pipe_format format, + uint64_t modifier); struct pan_surface { - union { - mali_ptr data; - struct { - mali_ptr header; - mali_ptr body; - } afbc; - }; + union { + mali_ptr data; + struct { + mali_ptr header; + mali_ptr body; + } afbc; + }; }; -void -pan_iview_get_surface(const struct pan_image_view *iview, - unsigned level, unsigned layer, unsigned sample, - struct pan_surface *surf); - +void pan_iview_get_surface(const struct pan_image_view *iview, unsigned level, + unsigned layer, unsigned sample, + struct pan_surface *surf); #if PAN_ARCH >= 9 enum mali_afbc_compression_mode diff --git a/src/panfrost/lib/pan_tiler.c b/src/panfrost/lib/pan_tiler.c index e8bce5e2a60..2e3126af251 100644 --- a/src/panfrost/lib/pan_tiler.c +++ b/src/panfrost/lib/pan_tiler.c @@ -24,8 +24,8 @@ * Alyssa Rosenzweig */ -#include "util/u_math.h" #include "util/macros.h" +#include "util/u_math.h" #include "pan_device.h" #include "pan_encoder.h" @@ -179,7 +179,7 @@ * tile <= fb / (64 - 1) <= next_power_of_two(fb / (64 - 1)) * * Hence we clamp 
up to align_pot(fb / (64 - 1)). - + * Extending to use a selection heuristic left for future work. * * Once the tile size (w, h) is chosen, we compute the hierarchy "mask": @@ -227,15 +227,16 @@ * a a fixed-tile size (not any of a number of power-of-twos) */ static unsigned -pan_tile_count(unsigned width, unsigned height, unsigned tile_width, unsigned tile_height) +pan_tile_count(unsigned width, unsigned height, unsigned tile_width, + unsigned tile_height) { - unsigned aligned_width = ALIGN_POT(width, tile_width); - unsigned aligned_height = ALIGN_POT(height, tile_height); + unsigned aligned_width = ALIGN_POT(width, tile_width); + unsigned aligned_height = ALIGN_POT(height, tile_height); - unsigned tile_count_x = aligned_width / tile_width; - unsigned tile_count_y = aligned_height / tile_height; + unsigned tile_count_x = aligned_width / tile_width; + unsigned tile_count_y = aligned_height / tile_height; - return tile_count_x * tile_count_y; + return tile_count_x * tile_count_y; } /* For `masked_count` of the smallest tile sizes masked out, computes how the @@ -246,32 +247,29 @@ pan_tile_count(unsigned width, unsigned height, unsigned tile_width, unsigned ti * levels to find a byte count for all levels. */ static unsigned -panfrost_hierarchy_size( - unsigned width, - unsigned height, - unsigned mask, - unsigned bytes_per_tile) +panfrost_hierarchy_size(unsigned width, unsigned height, unsigned mask, + unsigned bytes_per_tile) { - unsigned size = PROLOGUE_SIZE; + unsigned size = PROLOGUE_SIZE; - /* Iterate hierarchy levels */ + /* Iterate hierarchy levels */ - for (unsigned b = 0; b < (MAX_TILE_SHIFT - MIN_TILE_SHIFT); ++b) { - /* Check if this level is enabled */ - if (!(mask & (1 << b))) - continue; + for (unsigned b = 0; b < (MAX_TILE_SHIFT - MIN_TILE_SHIFT); ++b) { + /* Check if this level is enabled */ + if (!(mask & (1 << b))) + continue; - /* Shift from a level to a tile size */ - unsigned tile_size = (1 << b) * MIN_TILE_SIZE; + /* Shift from a level to a tile size */ + unsigned tile_size = (1 << b) * MIN_TILE_SIZE; - unsigned tile_count = pan_tile_count(width, height, tile_size, tile_size); - unsigned level_count = bytes_per_tile * tile_count; + unsigned tile_count = pan_tile_count(width, height, tile_size, tile_size); + unsigned level_count = bytes_per_tile * tile_count; - size += level_count; - } + size += level_count; + } - /* This size will be used as an offset, so ensure it's aligned */ - return ALIGN_POT(size, 0x200); + /* This size will be used as an offset, so ensure it's aligned */ + return ALIGN_POT(size, 0x200); } /* Implement the formula: @@ -284,29 +282,32 @@ panfrost_hierarchy_size( */ static unsigned -panfrost_flat_size(unsigned width, unsigned height, unsigned dim, unsigned bytes_per_tile) +panfrost_flat_size(unsigned width, unsigned height, unsigned dim, + unsigned bytes_per_tile) { - /* First, extract the tile dimensions */ + /* First, extract the tile dimensions */ - unsigned tw = (1 << (dim & 0b111)) * 8; - unsigned th = (1 << ((dim & (0b111 << 6)) >> 6)) * 8; + unsigned tw = (1 << (dim & 0b111)) * 8; + unsigned th = (1 << ((dim & (0b111 << 6)) >> 6)) * 8; - /* tile_count is ceil(W/w) * ceil(H/h) */ - unsigned raw = pan_tile_count(width, height, tw, th) * bytes_per_tile; + /* tile_count is ceil(W/w) * ceil(H/h) */ + unsigned raw = pan_tile_count(width, height, tw, th) * bytes_per_tile; - /* Round down and add offset */ - return 0x200 + ((raw / 0x200) * 0x200); + /* Round down and add offset */ + return 0x200 + ((raw / 0x200) * 0x200); } /* Given a hierarchy mask 
and a framebuffer size, compute the header size */ unsigned -panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask, bool hierarchy) +panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask, + bool hierarchy) { - if (hierarchy) - return panfrost_hierarchy_size(width, height, mask, HEADER_BYTES_PER_TILE); - else - return panfrost_flat_size(width, height, mask, HEADER_BYTES_PER_TILE); + if (hierarchy) + return panfrost_hierarchy_size(width, height, mask, + HEADER_BYTES_PER_TILE); + else + return panfrost_flat_size(width, height, mask, HEADER_BYTES_PER_TILE); } /* The combined header/body is sized similarly (but it is significantly @@ -315,38 +316,39 @@ panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask, bool */ unsigned -panfrost_tiler_full_size(unsigned width, unsigned height, unsigned mask, bool hierarchy) +panfrost_tiler_full_size(unsigned width, unsigned height, unsigned mask, + bool hierarchy) { - if (hierarchy) - return panfrost_hierarchy_size(width, height, mask, FULL_BYTES_PER_TILE); - else - return panfrost_flat_size(width, height, mask, FULL_BYTES_PER_TILE); + if (hierarchy) + return panfrost_hierarchy_size(width, height, mask, FULL_BYTES_PER_TILE); + else + return panfrost_flat_size(width, height, mask, FULL_BYTES_PER_TILE); } /* On GPUs without hierarchical tiling, we choose a tile size directly and * stuff it into the field otherwise known as hierarchy mask (not a mask). */ static unsigned -panfrost_choose_tile_size( - unsigned width, unsigned height, unsigned vertex_count) +panfrost_choose_tile_size(unsigned width, unsigned height, + unsigned vertex_count) { - /* Figure out the ideal tile size. Eventually a heuristic should be - * used for this */ + /* Figure out the ideal tile size. Eventually a heuristic should be + * used for this */ - unsigned best_w = 16; - unsigned best_h = 16; + unsigned best_w = 16; + unsigned best_h = 16; - /* Clamp so there are less than 64 tiles in each direction */ + /* Clamp so there are less than 64 tiles in each direction */ - best_w = MAX2(best_w, util_next_power_of_two(width / 63)); - best_h = MAX2(best_h, util_next_power_of_two(height / 63)); + best_w = MAX2(best_w, util_next_power_of_two(width / 63)); + best_h = MAX2(best_h, util_next_power_of_two(height / 63)); - /* We have our ideal tile size, so encode */ + /* We have our ideal tile size, so encode */ - unsigned exp_w = util_logbase2(best_w / 16); - unsigned exp_h = util_logbase2(best_h / 16); + unsigned exp_w = util_logbase2(best_w / 16); + unsigned exp_h = util_logbase2(best_h / 16); - return exp_w | (exp_h << 6); + return exp_w | (exp_h << 6); } /* In the future, a heuristic to choose a tiler hierarchy mask would go here. @@ -356,19 +358,18 @@ panfrost_choose_tile_size( * don't really need all the smaller levels enabled */ unsigned -panfrost_choose_hierarchy_mask( - unsigned width, unsigned height, - unsigned vertex_count, bool hierarchy) +panfrost_choose_hierarchy_mask(unsigned width, unsigned height, + unsigned vertex_count, bool hierarchy) { - /* If there is no geometry, we don't bother enabling anything */ + /* If there is no geometry, we don't bother enabling anything */ - if (!vertex_count) - return 0x00; + if (!vertex_count) + return 0x00; - if (!hierarchy) - return panfrost_choose_tile_size(width, height, vertex_count); + if (!hierarchy) + return panfrost_choose_tile_size(width, height, vertex_count); - /* Otherwise, default everything on. TODO: Proper tests */ + /* Otherwise, default everything on. 
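A quick worked instance of the tile-size encoding above, as a standalone sketch whose pow2/log2 helpers are hypothetical stand-ins for the util_* routines used in the driver: a 1920x1080 framebuffer gives 1920/63 = 30 and 1080/63 = 17, both rounded up to 32 and clamped against the 16-texel minimum, so the field encodes log2(32/16) = 1 in each axis, i.e. 0x41.

/* Standalone sketch of the non-hierarchical tile-size field; next_pow2()
 * and log2u() are local stand-ins, not the util_* helpers. */
#include <stdio.h>

static unsigned
next_pow2(unsigned x)
{
   unsigned p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

static unsigned
log2u(unsigned x)
{
   unsigned l = 0;
   while (x >>= 1)
      l++;
   return l;
}

static unsigned
encode_tile_size(unsigned width, unsigned height)
{
   /* Clamp so there are fewer than 64 tiles in each direction */
   unsigned best_w = next_pow2(width / 63);
   unsigned best_h = next_pow2(height / 63);

   if (best_w < 16)
      best_w = 16;
   if (best_h < 16)
      best_h = 16;

   /* Encode the exponents relative to the 16-texel minimum */
   return log2u(best_w / 16) | (log2u(best_h / 16) << 6);
}

int
main(void)
{
   printf("0x%02x\n", encode_tile_size(1920, 1080)); /* 0x41 -> 32x32 tiles */
   return 0;
}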
TODO: Proper tests */ - return 0xFF; + return 0xFF; } diff --git a/src/panfrost/lib/pan_util.c b/src/panfrost/lib/pan_util.c index 4f56a828e68..4f43d56f6be 100644 --- a/src/panfrost/lib/pan_util.c +++ b/src/panfrost/lib/pan_util.c @@ -23,7 +23,7 @@ #include #include "pan_texture.h" - + /* Translate a PIPE swizzle quad to a 12-bit Mali swizzle code. PIPE * swizzles line up with Mali swizzles for the XYZW01, but PIPE swizzles have * an additional "NONE" field that we have to mask out to zero. Additionally, @@ -32,38 +32,39 @@ unsigned panfrost_translate_swizzle_4(const unsigned char swizzle[4]) { - unsigned out = 0; + unsigned out = 0; - for (unsigned i = 0; i < 4; ++i) { - unsigned translated = (swizzle[i] > PIPE_SWIZZLE_1) ? PIPE_SWIZZLE_0 : swizzle[i]; - out |= (translated << (3*i)); - } + for (unsigned i = 0; i < 4; ++i) { + unsigned translated = + (swizzle[i] > PIPE_SWIZZLE_1) ? PIPE_SWIZZLE_0 : swizzle[i]; + out |= (translated << (3 * i)); + } - return out; + return out; } void panfrost_invert_swizzle(const unsigned char *in, unsigned char *out) { - /* First, default to all zeroes to prevent uninitialized junk */ + /* First, default to all zeroes to prevent uninitialized junk */ - for (unsigned c = 0; c < 4; ++c) - out[c] = PIPE_SWIZZLE_0; + for (unsigned c = 0; c < 4; ++c) + out[c] = PIPE_SWIZZLE_0; - /* Now "do" what the swizzle says */ + /* Now "do" what the swizzle says */ - for (unsigned c = 0; c < 4; ++c) { - unsigned char i = in[c]; + for (unsigned c = 0; c < 4; ++c) { + unsigned char i = in[c]; - /* Who cares? */ - assert(PIPE_SWIZZLE_X == 0); - if (i > PIPE_SWIZZLE_W) - continue; + /* Who cares? */ + assert(PIPE_SWIZZLE_X == 0); + if (i > PIPE_SWIZZLE_W) + continue; - /* Invert */ - unsigned idx = i - PIPE_SWIZZLE_X; - out[idx] = PIPE_SWIZZLE_X + c; - } + /* Invert */ + unsigned idx = i - PIPE_SWIZZLE_X; + out[idx] = PIPE_SWIZZLE_X + c; + } } /* Formats requiring blend shaders are stored raw in the tilebuffer and will @@ -72,12 +73,12 @@ panfrost_invert_swizzle(const unsigned char *in, unsigned char *out) unsigned panfrost_format_to_bifrost_blend(const struct panfrost_device *dev, - enum pipe_format format, - bool dithered) + enum pipe_format format, bool dithered) { - mali_pixel_format pixfmt = (dev->arch >= 7) ? - panfrost_blendable_formats_v7[format].bifrost[dithered] : - panfrost_blendable_formats_v6[format].bifrost[dithered]; + mali_pixel_format pixfmt = + (dev->arch >= 7) + ? 
panfrost_blendable_formats_v7[format].bifrost[dithered] + : panfrost_blendable_formats_v6[format].bifrost[dithered]; - return pixfmt ?: dev->formats[format].hw; + return pixfmt ?: dev->formats[format].hw; } diff --git a/src/panfrost/lib/pan_util.h b/src/panfrost/lib/pan_util.h index c2f883737c3..87eccff7fbc 100644 --- a/src/panfrost/lib/pan_util.h +++ b/src/panfrost/lib/pan_util.h @@ -28,58 +28,54 @@ #ifndef PAN_UTIL_H #define PAN_UTIL_H -#include #include +#include #include "util/format/u_format.h" -#define PAN_DBG_PERF 0x0001 -#define PAN_DBG_TRACE 0x0002 -#define PAN_DBG_DEQP 0x0004 -#define PAN_DBG_DIRTY 0x0008 -#define PAN_DBG_SYNC 0x0010 +#define PAN_DBG_PERF 0x0001 +#define PAN_DBG_TRACE 0x0002 +#define PAN_DBG_DEQP 0x0004 +#define PAN_DBG_DIRTY 0x0008 +#define PAN_DBG_SYNC 0x0010 /* 0x20 unused */ -#define PAN_DBG_NOFP16 0x0040 -#define PAN_DBG_NO_CRC 0x0080 -#define PAN_DBG_GL3 0x0100 -#define PAN_DBG_NO_AFBC 0x0200 -#define PAN_DBG_MSAA16 0x0400 -#define PAN_DBG_INDIRECT 0x0800 -#define PAN_DBG_LINEAR 0x1000 -#define PAN_DBG_NO_CACHE 0x2000 -#define PAN_DBG_DUMP 0x4000 +#define PAN_DBG_NOFP16 0x0040 +#define PAN_DBG_NO_CRC 0x0080 +#define PAN_DBG_GL3 0x0100 +#define PAN_DBG_NO_AFBC 0x0200 +#define PAN_DBG_MSAA16 0x0400 +#define PAN_DBG_INDIRECT 0x0800 +#define PAN_DBG_LINEAR 0x1000 +#define PAN_DBG_NO_CACHE 0x2000 +#define PAN_DBG_DUMP 0x4000 #ifndef NDEBUG -#define PAN_DBG_OVERFLOW 0x8000 +#define PAN_DBG_OVERFLOW 0x8000 #endif struct panfrost_device; -unsigned -panfrost_translate_swizzle_4(const unsigned char swizzle[4]); +unsigned panfrost_translate_swizzle_4(const unsigned char swizzle[4]); -void -panfrost_invert_swizzle(const unsigned char *in, unsigned char *out); +void panfrost_invert_swizzle(const unsigned char *in, unsigned char *out); -unsigned -panfrost_format_to_bifrost_blend(const struct panfrost_device *dev, - enum pipe_format format, - bool dithered); +unsigned panfrost_format_to_bifrost_blend(const struct panfrost_device *dev, + enum pipe_format format, + bool dithered); -void -pan_pack_color(uint32_t *packed, const union pipe_color_union *color, - enum pipe_format format, bool dithered); +void pan_pack_color(uint32_t *packed, const union pipe_color_union *color, + enum pipe_format format, bool dithered); /* Get the last blend shader, for an erratum workaround on v5 */ static inline uint64_t panfrost_last_nonnull(uint64_t *ptrs, unsigned count) { - for (signed i = ((signed) count - 1); i >= 0; --i) { - if (ptrs[i]) - return ptrs[i]; - } + for (signed i = ((signed)count - 1); i >= 0; --i) { + if (ptrs[i]) + return ptrs[i]; + } - return 0; + return 0; } #endif /* PAN_UTIL_H */ diff --git a/src/panfrost/lib/tests/test-blend.c b/src/panfrost/lib/tests/test-blend.c index d04efd68fcb..2ba301e7b24 100644 --- a/src/panfrost/lib/tests/test-blend.c +++ b/src/panfrost/lib/tests/test-blend.c @@ -293,17 +293,19 @@ static const struct test blend_tests[] = { }; /* clang-format on */ -#define ASSERT_EQ(x, y) do { \ - if (x == y) { \ - nr_pass++; \ - } else { \ - nr_fail++; \ - fprintf(stderr, "%s: Assertion failed %s (%x) != %s (%x)\n", \ - T.label, #x, x, #y, y); \ - } \ -} while(0) +#define ASSERT_EQ(x, y) \ + do { \ + if (x == y) { \ + nr_pass++; \ + } else { \ + nr_fail++; \ + fprintf(stderr, "%s: Assertion failed %s (%x) != %s (%x)\n", T.label, \ + #x, x, #y, y); \ + } \ + } while (0) -int main(int argc, const char **argv) +int +main(int argc, const char **argv) { unsigned nr_pass = 0, nr_fail = 0; diff --git a/src/panfrost/lib/tests/test-clear.c 
b/src/panfrost/lib/tests/test-clear.c index 81d807d53e4..91199199149 100644 --- a/src/panfrost/lib/tests/test-clear.c +++ b/src/panfrost/lib/tests/test-clear.c @@ -33,10 +33,22 @@ struct test { uint32_t packed[4]; }; -#define RRRR(r) { r, r, r, r } -#define RGRG(r, g) { r, g, r, g } -#define F(r, g, b, a) { .f = { r, g, b, a } } -#define UI(r, g, b, a) { .ui = { r, g, b, a } } +#define RRRR(r) \ + { \ + r, r, r, r \ + } +#define RGRG(r, g) \ + { \ + r, g, r, g \ + } +#define F(r, g, b, a) \ + { \ + .f = { r, g, b, a } \ + } +#define UI(r, g, b, a) \ + { \ + .ui = { r, g, b, a } \ + } #define D (true) #define _ (false) @@ -140,17 +152,23 @@ static const struct test clear_tests[] = { }; /* clang-format on */ -#define ASSERT_EQ(x, y) do { \ - if ((x[0] == y[0]) && (x[1] == y[1]) && (x[2] == y[2]) && (x[3] == y[3])) { \ - nr_pass++; \ - } else { \ - nr_fail++; \ - fprintf(stderr, "%s%s: Assertion failed %s (%08X %08X %08X %08X) != %s (%08X %08X %08X %08X)\n", \ - util_format_short_name(T.format), T.dithered ? " dithered" : "", #x, x[0], x[1], x[2], x[3], #y, y[0], y[1], y[2], y[3]); \ - } \ -} while(0) +#define ASSERT_EQ(x, y) \ + do { \ + if ((x[0] == y[0]) && (x[1] == y[1]) && (x[2] == y[2]) && \ + (x[3] == y[3])) { \ + nr_pass++; \ + } else { \ + nr_fail++; \ + fprintf( \ + stderr, \ + "%s%s: Assertion failed %s (%08X %08X %08X %08X) != %s (%08X %08X %08X %08X)\n", \ + util_format_short_name(T.format), T.dithered ? " dithered" : "", \ + #x, x[0], x[1], x[2], x[3], #y, y[0], y[1], y[2], y[3]); \ + } \ + } while (0) -int main(int argc, const char **argv) +int +main(int argc, const char **argv) { unsigned nr_pass = 0, nr_fail = 0; diff --git a/src/panfrost/lib/tests/test-earlyzs.cpp b/src/panfrost/lib/tests/test-earlyzs.cpp index 8fff5d469ce..872487b808c 100644 --- a/src/panfrost/lib/tests/test-earlyzs.cpp +++ b/src/panfrost/lib/tests/test-earlyzs.cpp @@ -21,8 +21,8 @@ * SOFTWARE. */ -#include "pan_earlyzs.h" #include "util/pan_ir.h" +#include "pan_earlyzs.h" #include @@ -34,18 +34,19 @@ * under test, only the external API. So we test only the composition. 
*/ -#define ZS_WRITEMASK BITFIELD_BIT(0) -#define ALPHA2COV BITFIELD_BIT(1) +#define ZS_WRITEMASK BITFIELD_BIT(0) +#define ALPHA2COV BITFIELD_BIT(1) #define ZS_ALWAYS_PASSES BITFIELD_BIT(2) -#define DISCARD BITFIELD_BIT(3) -#define WRITES_Z BITFIELD_BIT(4) -#define WRITES_S BITFIELD_BIT(5) -#define WRITES_COV BITFIELD_BIT(6) -#define SIDEFX BITFIELD_BIT(7) -#define API_EARLY BITFIELD_BIT(8) +#define DISCARD BITFIELD_BIT(3) +#define WRITES_Z BITFIELD_BIT(4) +#define WRITES_S BITFIELD_BIT(5) +#define WRITES_COV BITFIELD_BIT(6) +#define SIDEFX BITFIELD_BIT(7) +#define API_EARLY BITFIELD_BIT(8) static void -test(enum pan_earlyzs expected_update, enum pan_earlyzs expected_kill, uint32_t flags) +test(enum pan_earlyzs expected_update, enum pan_earlyzs expected_kill, + uint32_t flags) { struct pan_shader_info info = {}; info.fs.can_discard = !!(flags & DISCARD); @@ -56,18 +57,15 @@ test(enum pan_earlyzs expected_update, enum pan_earlyzs expected_kill, uint32_t info.writes_global = !!(flags & SIDEFX); struct pan_earlyzs_state result = - pan_earlyzs_get(pan_earlyzs_analyze(&info), - !!(flags & ZS_WRITEMASK), - !!(flags & ALPHA2COV), - !!(flags & ZS_ALWAYS_PASSES)); + pan_earlyzs_get(pan_earlyzs_analyze(&info), !!(flags & ZS_WRITEMASK), + !!(flags & ALPHA2COV), !!(flags & ZS_ALWAYS_PASSES)); ASSERT_EQ(result.update, expected_update); ASSERT_EQ(result.kill, expected_kill); } - -#define CASE(expected_update, expected_kill, flags) \ - test(PAN_EARLYZS_ ## expected_update, PAN_EARLYZS_ ## expected_kill, flags) +#define CASE(expected_update, expected_kill, flags) \ + test(PAN_EARLYZS_##expected_update, PAN_EARLYZS_##expected_kill, flags) TEST(EarlyZS, APIForceEarly) { @@ -91,7 +89,8 @@ TEST(EarlyZS, ModifiesCoverageWritesZSNoSideFX) CASE(FORCE_LATE, FORCE_EARLY, ZS_WRITEMASK | WRITES_COV); CASE(FORCE_LATE, FORCE_EARLY, ZS_WRITEMASK | DISCARD); CASE(FORCE_LATE, FORCE_EARLY, ZS_WRITEMASK | ALPHA2COV); - CASE(FORCE_LATE, FORCE_EARLY, ZS_WRITEMASK | WRITES_COV | DISCARD | ALPHA2COV); + CASE(FORCE_LATE, FORCE_EARLY, + ZS_WRITEMASK | WRITES_COV | DISCARD | ALPHA2COV); } TEST(EarlyZS, ModifiesCoverageWritesZSNoSideFXAlt) @@ -99,7 +98,8 @@ TEST(EarlyZS, ModifiesCoverageWritesZSNoSideFXAlt) CASE(FORCE_LATE, WEAK_EARLY, ZS_ALWAYS_PASSES | ZS_WRITEMASK | WRITES_COV); CASE(FORCE_LATE, WEAK_EARLY, ZS_ALWAYS_PASSES | ZS_WRITEMASK | DISCARD); CASE(FORCE_LATE, WEAK_EARLY, ZS_ALWAYS_PASSES | ZS_WRITEMASK | ALPHA2COV); - CASE(FORCE_LATE, WEAK_EARLY, ZS_ALWAYS_PASSES | ZS_WRITEMASK | WRITES_COV | DISCARD | ALPHA2COV); + CASE(FORCE_LATE, WEAK_EARLY, + ZS_ALWAYS_PASSES | ZS_WRITEMASK | WRITES_COV | DISCARD | ALPHA2COV); } TEST(EarlyZS, ModifiesCoverageWritesZSSideFX) @@ -107,7 +107,8 @@ TEST(EarlyZS, ModifiesCoverageWritesZSSideFX) CASE(FORCE_LATE, FORCE_LATE, ZS_WRITEMASK | SIDEFX | WRITES_COV); CASE(FORCE_LATE, FORCE_LATE, ZS_WRITEMASK | SIDEFX | DISCARD); CASE(FORCE_LATE, FORCE_LATE, ZS_WRITEMASK | SIDEFX | ALPHA2COV); - CASE(FORCE_LATE, FORCE_LATE, ZS_WRITEMASK | SIDEFX | WRITES_COV | DISCARD | ALPHA2COV); + CASE(FORCE_LATE, FORCE_LATE, + ZS_WRITEMASK | SIDEFX | WRITES_COV | DISCARD | ALPHA2COV); } TEST(EarlyZS, SideFXNoShaderZS) @@ -136,6 +137,7 @@ TEST(EarlyZS, NoSideFXNoShaderZS) TEST(EarlyZS, NoSideFXNoShaderZSAlt) { CASE(WEAK_EARLY, WEAK_EARLY, ZS_ALWAYS_PASSES); - CASE(WEAK_EARLY, WEAK_EARLY, ZS_ALWAYS_PASSES | ALPHA2COV | DISCARD | WRITES_COV); + CASE(WEAK_EARLY, WEAK_EARLY, + ZS_ALWAYS_PASSES | ALPHA2COV | DISCARD | WRITES_COV); CASE(WEAK_EARLY, WEAK_EARLY, ZS_ALWAYS_PASSES | ZS_WRITEMASK); } diff --git 
a/src/panfrost/lib/tests/test-layout.cpp b/src/panfrost/lib/tests/test-layout.cpp index 1ba7938efec..f47337a64f4 100644 --- a/src/panfrost/lib/tests/test-layout.cpp +++ b/src/panfrost/lib/tests/test-layout.cpp @@ -27,15 +27,13 @@ TEST(BlockSize, Linear) { - enum pipe_format format[] = { - PIPE_FORMAT_R32G32B32_FLOAT, - PIPE_FORMAT_R8G8B8_UNORM, - PIPE_FORMAT_ETC2_RGB8, - PIPE_FORMAT_ASTC_5x5 - }; + enum pipe_format format[] = {PIPE_FORMAT_R32G32B32_FLOAT, + PIPE_FORMAT_R8G8B8_UNORM, PIPE_FORMAT_ETC2_RGB8, + PIPE_FORMAT_ASTC_5x5}; for (unsigned i = 0; i < ARRAY_SIZE(format); ++i) { - struct pan_block_size blk = panfrost_block_size(DRM_FORMAT_MOD_LINEAR, format[i]); + struct pan_block_size blk = + panfrost_block_size(DRM_FORMAT_MOD_LINEAR, format[i]); EXPECT_EQ(blk.width, 1); EXPECT_EQ(blk.height, 1); @@ -50,7 +48,8 @@ TEST(BlockSize, UInterleavedRegular) }; for (unsigned i = 0; i < ARRAY_SIZE(format); ++i) { - struct pan_block_size blk = panfrost_block_size(DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED, format[i]); + struct pan_block_size blk = panfrost_block_size( + DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED, format[i]); EXPECT_EQ(blk.width, 16); EXPECT_EQ(blk.height, 16); @@ -59,13 +58,11 @@ TEST(BlockSize, UInterleavedRegular) TEST(BlockSize, UInterleavedBlockCompressed) { - enum pipe_format format[] = { - PIPE_FORMAT_ETC2_RGB8, - PIPE_FORMAT_ASTC_5x5 - }; + enum pipe_format format[] = {PIPE_FORMAT_ETC2_RGB8, PIPE_FORMAT_ASTC_5x5}; for (unsigned i = 0; i < ARRAY_SIZE(format); ++i) { - struct pan_block_size blk = panfrost_block_size(DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED, format[i]); + struct pan_block_size blk = panfrost_block_size( + DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED, format[i]); EXPECT_EQ(blk.width, 4); EXPECT_EQ(blk.height, 4); @@ -74,17 +71,13 @@ TEST(BlockSize, UInterleavedBlockCompressed) TEST(BlockSize, AFBCFormatInvariant16x16) { - enum pipe_format format[] = { - PIPE_FORMAT_R32G32B32_FLOAT, - PIPE_FORMAT_R8G8B8_UNORM, - PIPE_FORMAT_ETC2_RGB8, - PIPE_FORMAT_ASTC_5x5 - }; + enum pipe_format format[] = {PIPE_FORMAT_R32G32B32_FLOAT, + PIPE_FORMAT_R8G8B8_UNORM, PIPE_FORMAT_ETC2_RGB8, + PIPE_FORMAT_ASTC_5x5}; - uint64_t modifier = DRM_FORMAT_MOD_ARM_AFBC( - AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | - AFBC_FORMAT_MOD_SPARSE | - AFBC_FORMAT_MOD_YTR); + uint64_t modifier = + DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | + AFBC_FORMAT_MOD_SPARSE | AFBC_FORMAT_MOD_YTR); for (unsigned i = 0; i < ARRAY_SIZE(format); ++i) { struct pan_block_size blk = panfrost_block_size(modifier, format[i]); @@ -96,17 +89,13 @@ TEST(BlockSize, AFBCFormatInvariant16x16) TEST(BlockSize, AFBCFormatInvariant32x8) { - enum pipe_format format[] = { - PIPE_FORMAT_R32G32B32_FLOAT, - PIPE_FORMAT_R8G8B8_UNORM, - PIPE_FORMAT_ETC2_RGB8, - PIPE_FORMAT_ASTC_5x5 - }; + enum pipe_format format[] = {PIPE_FORMAT_R32G32B32_FLOAT, + PIPE_FORMAT_R8G8B8_UNORM, PIPE_FORMAT_ETC2_RGB8, + PIPE_FORMAT_ASTC_5x5}; - uint64_t modifier = DRM_FORMAT_MOD_ARM_AFBC( - AFBC_FORMAT_MOD_BLOCK_SIZE_32x8 | - AFBC_FORMAT_MOD_SPARSE | - AFBC_FORMAT_MOD_YTR); + uint64_t modifier = + DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_32x8 | + AFBC_FORMAT_MOD_SPARSE | AFBC_FORMAT_MOD_YTR); for (unsigned i = 0; i < ARRAY_SIZE(format); ++i) { struct pan_block_size blk = panfrost_block_size(modifier, format[i]); @@ -118,10 +107,9 @@ TEST(BlockSize, AFBCFormatInvariant32x8) TEST(BlockSize, AFBCSuperblock16x16) { - uint64_t modifier = DRM_FORMAT_MOD_ARM_AFBC( - AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | - AFBC_FORMAT_MOD_SPARSE | - 
AFBC_FORMAT_MOD_YTR); + uint64_t modifier = + DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | + AFBC_FORMAT_MOD_SPARSE | AFBC_FORMAT_MOD_YTR); EXPECT_EQ(panfrost_afbc_superblock_size(modifier).width, 16); EXPECT_EQ(panfrost_afbc_superblock_width(modifier), 16); @@ -134,9 +122,8 @@ TEST(BlockSize, AFBCSuperblock16x16) TEST(BlockSize, AFBCSuperblock32x8) { - uint64_t modifier = DRM_FORMAT_MOD_ARM_AFBC( - AFBC_FORMAT_MOD_BLOCK_SIZE_32x8 | - AFBC_FORMAT_MOD_SPARSE); + uint64_t modifier = DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_32x8 | + AFBC_FORMAT_MOD_SPARSE); EXPECT_EQ(panfrost_afbc_superblock_size(modifier).width, 32); EXPECT_EQ(panfrost_afbc_superblock_width(modifier), 32); @@ -149,9 +136,8 @@ TEST(BlockSize, AFBCSuperblock32x8) TEST(BlockSize, AFBCSuperblock64x4) { - uint64_t modifier = DRM_FORMAT_MOD_ARM_AFBC( - AFBC_FORMAT_MOD_BLOCK_SIZE_64x4 | - AFBC_FORMAT_MOD_SPARSE); + uint64_t modifier = DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_64x4 | + AFBC_FORMAT_MOD_SPARSE); EXPECT_EQ(panfrost_afbc_superblock_size(modifier).width, 64); EXPECT_EQ(panfrost_afbc_superblock_width(modifier), 64); @@ -165,9 +151,11 @@ TEST(BlockSize, AFBCSuperblock64x4) /* Calculate Bifrost line stride, since we have reference formulas for Bifrost * stride calculations. */ -static uint32_t pan_afbc_line_stride(uint64_t modifier, uint32_t width) +static uint32_t +pan_afbc_line_stride(uint64_t modifier, uint32_t width) { - return pan_afbc_stride_blocks(modifier, pan_afbc_row_stride(modifier, width)); + return pan_afbc_stride_blocks(modifier, + pan_afbc_row_stride(modifier, width)); } /* Which form of the stride we specify is hardware specific (row stride for @@ -189,16 +177,16 @@ TEST(AFBCStride, Linear) uint64_t modifier = modifiers[m]; uint32_t sw = panfrost_afbc_superblock_width(modifier); - uint32_t cases[] = { 1, 4, 17, 39 }; + uint32_t cases[] = {1, 4, 17, 39}; for (unsigned i = 0; i < ARRAY_SIZE(cases); ++i) { uint32_t width = sw * cases[i]; EXPECT_EQ(pan_afbc_row_stride(modifier, width), - 16 * DIV_ROUND_UP(width, sw)); + 16 * DIV_ROUND_UP(width, sw)); EXPECT_EQ(pan_afbc_line_stride(modifier, width), - DIV_ROUND_UP(width, sw)); + DIV_ROUND_UP(width, sw)); } } } @@ -207,63 +195,73 @@ TEST(AFBCStride, Tiled) { uint64_t modifiers[] = { DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | - AFBC_FORMAT_MOD_TILED | - AFBC_FORMAT_MOD_SPARSE), + AFBC_FORMAT_MOD_TILED | AFBC_FORMAT_MOD_SPARSE), DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_32x8 | - AFBC_FORMAT_MOD_TILED | - AFBC_FORMAT_MOD_SPARSE), + AFBC_FORMAT_MOD_TILED | AFBC_FORMAT_MOD_SPARSE), DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_64x4 | - AFBC_FORMAT_MOD_TILED | - AFBC_FORMAT_MOD_SPARSE), + AFBC_FORMAT_MOD_TILED | AFBC_FORMAT_MOD_SPARSE), }; for (unsigned m = 0; m < ARRAY_SIZE(modifiers); ++m) { uint64_t modifier = modifiers[m]; uint32_t sw = panfrost_afbc_superblock_width(modifier); - uint32_t cases[] = { 1, 4, 17, 39 }; + uint32_t cases[] = {1, 4, 17, 39}; for (unsigned i = 0; i < ARRAY_SIZE(cases); ++i) { uint32_t width = sw * 8 * cases[i]; EXPECT_EQ(pan_afbc_row_stride(modifier, width), - 16 * DIV_ROUND_UP(width, (sw * 8)) * 8 * 8); + 16 * DIV_ROUND_UP(width, (sw * 8)) * 8 * 8); EXPECT_EQ(pan_afbc_line_stride(modifier, width), - DIV_ROUND_UP(width, sw * 8) * 8); + DIV_ROUND_UP(width, sw * 8) * 8); } } } TEST(LegacyStride, FromLegacyLinear) { - EXPECT_EQ(panfrost_from_legacy_stride(1920 * 4, PIPE_FORMAT_R8G8B8A8_UINT, DRM_FORMAT_MOD_LINEAR), 1920 * 4); - EXPECT_EQ(panfrost_from_legacy_stride(53, 
PIPE_FORMAT_R8_SNORM, DRM_FORMAT_MOD_LINEAR), 53); - EXPECT_EQ(panfrost_from_legacy_stride(60, PIPE_FORMAT_ETC2_RGB8, DRM_FORMAT_MOD_LINEAR), 60); + EXPECT_EQ(panfrost_from_legacy_stride(1920 * 4, PIPE_FORMAT_R8G8B8A8_UINT, + DRM_FORMAT_MOD_LINEAR), + 1920 * 4); + EXPECT_EQ(panfrost_from_legacy_stride(53, PIPE_FORMAT_R8_SNORM, + DRM_FORMAT_MOD_LINEAR), + 53); + EXPECT_EQ(panfrost_from_legacy_stride(60, PIPE_FORMAT_ETC2_RGB8, + DRM_FORMAT_MOD_LINEAR), + 60); } TEST(LegacyStride, FromLegacyInterleaved) { - EXPECT_EQ(panfrost_from_legacy_stride(1920 * 4, PIPE_FORMAT_R8G8B8A8_UINT, - DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED), - 1920 * 4 * 16); + EXPECT_EQ( + panfrost_from_legacy_stride(1920 * 4, PIPE_FORMAT_R8G8B8A8_UINT, + DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED), + 1920 * 4 * 16); - EXPECT_EQ(panfrost_from_legacy_stride(53, PIPE_FORMAT_R8_SNORM, - DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED), 53 * 16); + EXPECT_EQ( + panfrost_from_legacy_stride(53, PIPE_FORMAT_R8_SNORM, + DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED), + 53 * 16); - EXPECT_EQ(panfrost_from_legacy_stride(60, PIPE_FORMAT_ETC2_RGB8, - DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED), 60 * 4); + EXPECT_EQ( + panfrost_from_legacy_stride(60, PIPE_FORMAT_ETC2_RGB8, + DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED), + 60 * 4); } TEST(LegacyStride, FromLegacyAFBC) { - uint64_t modifier = DRM_FORMAT_MOD_ARM_AFBC( - AFBC_FORMAT_MOD_BLOCK_SIZE_32x8 | - AFBC_FORMAT_MOD_SPARSE | - AFBC_FORMAT_MOD_YTR); + uint64_t modifier = + DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_32x8 | + AFBC_FORMAT_MOD_SPARSE | AFBC_FORMAT_MOD_YTR); - EXPECT_EQ(panfrost_from_legacy_stride(1920 * 4, PIPE_FORMAT_R8G8B8A8_UINT, modifier), 60 * 16); - EXPECT_EQ(panfrost_from_legacy_stride(64, PIPE_FORMAT_R8_SNORM, modifier), 2 * 16); + EXPECT_EQ(panfrost_from_legacy_stride(1920 * 4, PIPE_FORMAT_R8G8B8A8_UINT, + modifier), + 60 * 16); + EXPECT_EQ(panfrost_from_legacy_stride(64, PIPE_FORMAT_R8_SNORM, modifier), + 2 * 16); } /* dEQP-GLES3.functional.texture.format.compressed.etc1_2d_pot */ @@ -277,12 +275,10 @@ TEST(Layout, ImplicitLayoutInterleavedETC2) .depth = 1, .nr_samples = 1, .dim = MALI_TEXTURE_DIMENSION_2D, - .nr_slices = 8 - }; + .nr_slices = 8}; - unsigned offsets[9] = { - 0, 8192, 10240, 10752, 10880, 11008, 11136, 11264, 11392 - }; + unsigned offsets[9] = {0, 8192, 10240, 10752, 10880, + 11008, 11136, 11264, 11392}; ASSERT_TRUE(pan_image_layout_init(&l, NULL)); @@ -307,8 +303,7 @@ TEST(Layout, ImplicitLayoutInterleavedASTC5x5) .depth = 1, .nr_samples = 1, .dim = MALI_TEXTURE_DIMENSION_2D, - .nr_slices = 1 - }; + .nr_slices = 1}; ASSERT_TRUE(pan_image_layout_init(&l, NULL)); @@ -326,16 +321,14 @@ TEST(Layout, ImplicitLayoutInterleavedASTC5x5) TEST(Layout, ImplicitLayoutLinearASTC5x5) { - struct pan_image_layout l = { - .modifier = DRM_FORMAT_MOD_LINEAR, - .format = PIPE_FORMAT_ASTC_5x5, - .width = 50, - .height = 50, - .depth = 1, - .nr_samples = 1, - .dim = MALI_TEXTURE_DIMENSION_2D, - .nr_slices = 1 - }; + struct pan_image_layout l = {.modifier = DRM_FORMAT_MOD_LINEAR, + .format = PIPE_FORMAT_ASTC_5x5, + .width = 50, + .height = 50, + .depth = 1, + .nr_samples = 1, + .dim = MALI_TEXTURE_DIMENSION_2D, + .nr_slices = 1}; ASSERT_TRUE(pan_image_layout_init(&l, NULL)); @@ -353,25 +346,23 @@ TEST(Layout, ImplicitLayoutLinearASTC5x5) /* dEQP-GLES3.functional.texture.format.unsized.rgba_unsigned_byte_3d_pot */ TEST(AFBCLayout, Linear3D) { - uint64_t modifier = DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | - AFBC_FORMAT_MOD_SPARSE); + 
uint64_t modifier = DRM_FORMAT_MOD_ARM_AFBC( + AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | AFBC_FORMAT_MOD_SPARSE); - struct pan_image_layout l = { - .modifier = modifier, - .format = PIPE_FORMAT_R8G8B8A8_UNORM, - .width = 8, - .height = 32, - .depth = 16, - .nr_samples = 1, - .dim = MALI_TEXTURE_DIMENSION_3D, - .nr_slices = 1 - }; + struct pan_image_layout l = {.modifier = modifier, + .format = PIPE_FORMAT_R8G8B8A8_UNORM, + .width = 8, + .height = 32, + .depth = 16, + .nr_samples = 1, + .dim = MALI_TEXTURE_DIMENSION_3D, + .nr_slices = 1}; ASSERT_TRUE(pan_image_layout_init(&l, NULL)); /* AFBC Surface stride is bytes between consecutive surface headers, which is - * the header size since this is a 3D texture. At superblock size 16x16, the 8x32 - * layer has 1x2 superblocks, so the header size is 2 * 16 = 32 bytes, + * the header size since this is a 3D texture. At superblock size 16x16, the + * 8x32 layer has 1x2 superblocks, so the header size is 2 * 16 = 32 bytes, * rounded up to cache line 64. * * There is only 1 superblock per row, so the row stride is the bytes per 1 @@ -393,20 +384,18 @@ TEST(AFBCLayout, Linear3D) TEST(AFBCLayout, Tiled16x16) { - uint64_t modifier = DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | - AFBC_FORMAT_MOD_TILED | - AFBC_FORMAT_MOD_SPARSE); + uint64_t modifier = + DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | + AFBC_FORMAT_MOD_TILED | AFBC_FORMAT_MOD_SPARSE); - struct pan_image_layout l = { - .modifier = modifier, - .format = PIPE_FORMAT_R8G8B8A8_UNORM, - .width = 917, - .height = 417, - .depth = 1, - .nr_samples = 1, - .dim = MALI_TEXTURE_DIMENSION_2D, - .nr_slices = 1 - }; + struct pan_image_layout l = {.modifier = modifier, + .format = PIPE_FORMAT_R8G8B8A8_UNORM, + .width = 917, + .height = 417, + .depth = 1, + .nr_samples = 1, + .dim = MALI_TEXTURE_DIMENSION_2D, + .nr_slices = 1}; ASSERT_TRUE(pan_image_layout_init(&l, NULL)); @@ -432,19 +421,17 @@ TEST(AFBCLayout, Tiled16x16) TEST(AFBCLayout, Linear16x16Minimal) { - uint64_t modifier = DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | - AFBC_FORMAT_MOD_SPARSE); + uint64_t modifier = DRM_FORMAT_MOD_ARM_AFBC( + AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | AFBC_FORMAT_MOD_SPARSE); - struct pan_image_layout l = { - .modifier = modifier, - .format = PIPE_FORMAT_R8_UNORM, - .width = 1, - .height = 1, - .depth = 1, - .nr_samples = 1, - .dim = MALI_TEXTURE_DIMENSION_2D, - .nr_slices = 1 - }; + struct pan_image_layout l = {.modifier = modifier, + .format = PIPE_FORMAT_R8_UNORM, + .width = 1, + .height = 1, + .depth = 1, + .nr_samples = 1, + .dim = MALI_TEXTURE_DIMENSION_2D, + .nr_slices = 1}; ASSERT_TRUE(pan_image_layout_init(&l, NULL)); @@ -459,20 +446,18 @@ TEST(AFBCLayout, Linear16x16Minimal) TEST(AFBCLayout, Tiled16x16Minimal) { - uint64_t modifier = DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | - AFBC_FORMAT_MOD_TILED | - AFBC_FORMAT_MOD_SPARSE); + uint64_t modifier = + DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | + AFBC_FORMAT_MOD_TILED | AFBC_FORMAT_MOD_SPARSE); - struct pan_image_layout l = { - .modifier = modifier, - .format = PIPE_FORMAT_R8_UNORM, - .width = 1, - .height = 1, - .depth = 1, - .nr_samples = 1, - .dim = MALI_TEXTURE_DIMENSION_2D, - .nr_slices = 1 - }; + struct pan_image_layout l = {.modifier = modifier, + .format = PIPE_FORMAT_R8_UNORM, + .width = 1, + .height = 1, + .depth = 1, + .nr_samples = 1, + .dim = MALI_TEXTURE_DIMENSION_2D, + .nr_slices = 1}; ASSERT_TRUE(pan_image_layout_init(&l, NULL)); diff --git a/src/panfrost/lib/wrap.h 
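/* [Editor's sketch, not part of the patch] A minimal standalone illustration
 * of the reference arithmetic the AFBCStride/AFBCLayout tests above encode:
 * each AFBC superblock carries a 16-byte header, so for a linear layout the
 * header row stride is 16 bytes times the number of superblocks per row.
 * The helper name and AFBC_HEADER_SIZE constant are illustrative only and are
 * not part of the panfrost API; DIV_ROUND_UP is redefined locally so the
 * sketch compiles on its own. */
#include <assert.h>
#include <stdint.h>

#define AFBC_HEADER_SIZE 16 /* bytes per superblock header */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static uint32_t
afbc_linear_row_stride(uint32_t width, uint32_t superblock_width)
{
   /* One header per superblock in the row, rounding up for partial blocks */
   return AFBC_HEADER_SIZE * DIV_ROUND_UP(width, superblock_width);
}

int
main(void)
{
   /* Matches the 16x16 expectation in AFBCStride.Linear: width 39 * 16 = 624
    * covers 39 superblocks, i.e. a 624-byte row of headers. */
   assert(afbc_linear_row_stride(39 * 16, 16) == 16 * 39);

   /* The 8x32 layer in AFBCLayout.Linear3D has a single 16-wide superblock
    * per row, so the row stride is a single 16-byte header. */
   assert(afbc_linear_row_stride(8, 16) == 16);

   return 0;
}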
b/src/panfrost/lib/wrap.h index 56bb0f48aed..d4cafa75429 100644 --- a/src/panfrost/lib/wrap.h +++ b/src/panfrost/lib/wrap.h @@ -27,11 +27,11 @@ #ifndef __PAN_DECODE_PUBLIC_H__ #define __PAN_DECODE_PUBLIC_H__ +#include +#include +#include #include #include -#include -#include -#include /* Public entrypoints for the tracing infrastructure. This API should be kept * more or less stable. Don't feel bad if you have to change it; just feel @@ -48,14 +48,13 @@ void pandecode_next_frame(void); void pandecode_close(void); -void -pandecode_inject_mmap(uint64_t gpu_va, void *cpu, unsigned sz, const char *name); +void pandecode_inject_mmap(uint64_t gpu_va, void *cpu, unsigned sz, + const char *name); void pandecode_inject_free(uint64_t gpu_va, unsigned sz); void pandecode_jc(uint64_t jc_gpu_va, unsigned gpu_id); -void -pandecode_abort_on_fault(uint64_t jc_gpu_va, unsigned gpu_id); +void pandecode_abort_on_fault(uint64_t jc_gpu_va, unsigned gpu_id); #endif /* __MMAP_TRACE_H__ */ diff --git a/src/panfrost/midgard/compiler.h b/src/panfrost/midgard/compiler.h index 593a4e599a4..96d1a9a1ab6 100644 --- a/src/panfrost/midgard/compiler.h +++ b/src/panfrost/midgard/compiler.h @@ -25,21 +25,21 @@ #ifndef _MDG_COMPILER_H #define _MDG_COMPILER_H -#include "midgard.h" #include "helpers.h" +#include "midgard.h" #include "midgard_compile.h" #include "midgard_ops.h" #include "util/hash_table.h" -#include "util/u_dynarray.h" -#include "util/set.h" #include "util/list.h" +#include "util/set.h" +#include "util/u_dynarray.h" #include "util/u_math.h" -#include "compiler/nir_types.h" #include "compiler/nir/nir.h" -#include "panfrost/util/pan_ir.h" +#include "compiler/nir_types.h" #include "panfrost/util/lcra.h" +#include "panfrost/util/pan_ir.h" /* Forward declare */ struct midgard_block; @@ -48,28 +48,30 @@ struct midgard_block; * the hardware), hence why that must be zero. TARGET_DISCARD signals this * instruction is actually a discard op. */ -#define TARGET_GOTO 0 -#define TARGET_BREAK 1 -#define TARGET_CONTINUE 2 -#define TARGET_DISCARD 3 +#define TARGET_GOTO 0 +#define TARGET_BREAK 1 +#define TARGET_CONTINUE 2 +#define TARGET_DISCARD 3 #define TARGET_TILEBUF_WAIT 4 typedef struct midgard_branch { - /* If conditional, the condition is specified in r31.w */ - bool conditional; + /* If conditional, the condition is specified in r31.w */ + bool conditional; - /* For conditionals, if this is true, we branch on FALSE. If false, we branch on TRUE. */ - bool invert_conditional; + /* For conditionals, if this is true, we branch on FALSE. If false, we branch + * on TRUE. */ + bool invert_conditional; - /* Branch targets: the start of a block, the start of a loop (continue), the end of a loop (break). Value is one of TARGET_ */ - unsigned target_type; + /* Branch targets: the start of a block, the start of a loop (continue), the + * end of a loop (break). Value is one of TARGET_ */ + unsigned target_type; - /* The actual target */ - union { - int target_block; - int target_break; - int target_continue; - }; + /* The actual target */ + union { + int target_block; + int target_break; + int target_continue; + }; } midgard_branch; /* Generic in-memory data type repesenting a single logical instruction, rather @@ -82,233 +84,235 @@ typedef struct midgard_branch { * emitted before the register allocation pass. 
*/ -#define MIR_SRC_COUNT 4 +#define MIR_SRC_COUNT 4 #define MIR_VEC_COMPONENTS 16 typedef struct midgard_instruction { - /* Must be first for casting */ - struct list_head link; + /* Must be first for casting */ + struct list_head link; - unsigned type; /* ALU, load/store, texture */ + unsigned type; /* ALU, load/store, texture */ - /* Instruction arguments represented as block-local SSA - * indices, rather than registers. ~0 means unused. */ - unsigned src[MIR_SRC_COUNT]; - unsigned dest; + /* Instruction arguments represented as block-local SSA + * indices, rather than registers. ~0 means unused. */ + unsigned src[MIR_SRC_COUNT]; + unsigned dest; - /* vec16 swizzle, unpacked, per source */ - unsigned swizzle[MIR_SRC_COUNT][MIR_VEC_COMPONENTS]; + /* vec16 swizzle, unpacked, per source */ + unsigned swizzle[MIR_SRC_COUNT][MIR_VEC_COMPONENTS]; - /* Types! */ - nir_alu_type src_types[MIR_SRC_COUNT]; - nir_alu_type dest_type; + /* Types! */ + nir_alu_type src_types[MIR_SRC_COUNT]; + nir_alu_type dest_type; - /* Packing ops have non-32-bit dest types even though they functionally - * work at the 32-bit level, use this as a signal to disable copyprop. - * We maybe need synthetic pack ops instead. */ - bool is_pack; + /* Packing ops have non-32-bit dest types even though they functionally + * work at the 32-bit level, use this as a signal to disable copyprop. + * We maybe need synthetic pack ops instead. */ + bool is_pack; - /* Modifiers, depending on type */ - union { - struct { - bool src_abs[MIR_SRC_COUNT]; - bool src_neg[MIR_SRC_COUNT]; - }; + /* Modifiers, depending on type */ + union { + struct { + bool src_abs[MIR_SRC_COUNT]; + bool src_neg[MIR_SRC_COUNT]; + }; - struct { - bool src_shift[MIR_SRC_COUNT]; - }; - }; + struct { + bool src_shift[MIR_SRC_COUNT]; + }; + }; - /* Out of the union for csel (could maybe be fixed..) */ - bool src_invert[MIR_SRC_COUNT]; + /* Out of the union for csel (could maybe be fixed..) */ + bool src_invert[MIR_SRC_COUNT]; - /* If the op supports it */ - enum midgard_roundmode roundmode; + /* If the op supports it */ + enum midgard_roundmode roundmode; - /* For textures: should helpers execute this instruction (instead of - * just helping with derivatives)? Should helpers terminate after? */ - bool helper_terminate; - bool helper_execute; + /* For textures: should helpers execute this instruction (instead of + * just helping with derivatives)? Should helpers terminate after? */ + bool helper_terminate; + bool helper_execute; - /* I.e. (1 << alu_bit) */ - int unit; + /* I.e. (1 << alu_bit) */ + int unit; - bool has_constants; - midgard_constants constants; - uint16_t inline_constant; - bool has_inline_constant; + bool has_constants; + midgard_constants constants; + uint16_t inline_constant; + bool has_inline_constant; - bool compact_branch; - uint8_t writeout; - bool last_writeout; + bool compact_branch; + uint8_t writeout; + bool last_writeout; - /* Masks in a saneish format. One bit per channel, not packed fancy. - * Use this instead of the op specific ones, and switch over at emit - * time */ + /* Masks in a saneish format. One bit per channel, not packed fancy. + * Use this instead of the op specific ones, and switch over at emit + * time */ - uint16_t mask; + uint16_t mask; - /* Hint for the register allocator not to spill the destination written - * from this instruction (because it is a spill/unspill node itself). 
- * Bitmask of spilled classes */ + /* Hint for the register allocator not to spill the destination written + * from this instruction (because it is a spill/unspill node itself). + * Bitmask of spilled classes */ - unsigned no_spill; + unsigned no_spill; - /* Generic hint for intra-pass use */ - bool hint; + /* Generic hint for intra-pass use */ + bool hint; - /* During scheduling, the backwards dependency graph - * (DAG). nr_dependencies is the number of unscheduled - * instructions that must still be scheduled after - * (before) this instruction. dependents are which - * instructions need to be scheduled before (after) this - * instruction. */ + /* During scheduling, the backwards dependency graph + * (DAG). nr_dependencies is the number of unscheduled + * instructions that must still be scheduled after + * (before) this instruction. dependents are which + * instructions need to be scheduled before (after) this + * instruction. */ - unsigned nr_dependencies; - BITSET_WORD *dependents; + unsigned nr_dependencies; + BITSET_WORD *dependents; - /* Use this in conjunction with `type` */ - unsigned op; + /* Use this in conjunction with `type` */ + unsigned op; - /* This refers to midgard_outmod_float or midgard_outmod_int. - * In case of a ALU op, use midgard_is_integer_out_op() to know which - * one is used. - * If it's a texture op, it's always midgard_outmod_float. */ - unsigned outmod; + /* This refers to midgard_outmod_float or midgard_outmod_int. + * In case of a ALU op, use midgard_is_integer_out_op() to know which + * one is used. + * If it's a texture op, it's always midgard_outmod_float. */ + unsigned outmod; - union { - midgard_load_store_word load_store; - midgard_texture_word texture; + union { + midgard_load_store_word load_store; + midgard_texture_word texture; - midgard_branch branch; - }; + midgard_branch branch; + }; - unsigned bundle_id; + unsigned bundle_id; } midgard_instruction; typedef struct midgard_block { - pan_block base; + pan_block base; - bool scheduled; + bool scheduled; - /* List of midgard_bundles emitted (after the scheduler has run) */ - struct util_dynarray bundles; + /* List of midgard_bundles emitted (after the scheduler has run) */ + struct util_dynarray bundles; - /* Number of quadwords _actually_ emitted, as determined after scheduling */ - unsigned quadword_count; + /* Number of quadwords _actually_ emitted, as determined after scheduling */ + unsigned quadword_count; - /* Indicates this is a fixed-function fragment epilogue block */ - bool epilogue; + /* Indicates this is a fixed-function fragment epilogue block */ + bool epilogue; - /* Are helper invocations required by this block? */ - bool helpers_in; + /* Are helper invocations required by this block? */ + bool helpers_in; } midgard_block; typedef struct midgard_bundle { - /* Tag for the overall bundle */ - int tag; + /* Tag for the overall bundle */ + int tag; - /* Instructions contained by the bundle. instruction_count <= 6 (vmul, - * sadd, vadd, smul, vlut, branch) */ - int instruction_count; - midgard_instruction *instructions[6]; + /* Instructions contained by the bundle. 
instruction_count <= 6 (vmul, + * sadd, vadd, smul, vlut, branch) */ + int instruction_count; + midgard_instruction *instructions[6]; - /* Bundle-wide ALU configuration */ - int padding; - int control; - bool has_embedded_constants; - midgard_constants constants; - bool last_writeout; + /* Bundle-wide ALU configuration */ + int padding; + int control; + bool has_embedded_constants; + midgard_constants constants; + bool last_writeout; } midgard_bundle; enum midgard_rt_id { - MIDGARD_COLOR_RT0 = 0, - MIDGARD_COLOR_RT1, - MIDGARD_COLOR_RT2, - MIDGARD_COLOR_RT3, - MIDGARD_COLOR_RT4, - MIDGARD_COLOR_RT5, - MIDGARD_COLOR_RT6, - MIDGARD_COLOR_RT7, - MIDGARD_ZS_RT, - MIDGARD_NUM_RTS, + MIDGARD_COLOR_RT0 = 0, + MIDGARD_COLOR_RT1, + MIDGARD_COLOR_RT2, + MIDGARD_COLOR_RT3, + MIDGARD_COLOR_RT4, + MIDGARD_COLOR_RT5, + MIDGARD_COLOR_RT6, + MIDGARD_COLOR_RT7, + MIDGARD_ZS_RT, + MIDGARD_NUM_RTS, }; #define MIDGARD_MAX_SAMPLE_ITER 16 typedef struct compiler_context { - const struct panfrost_compile_inputs *inputs; - nir_shader *nir; - struct pan_shader_info *info; - gl_shader_stage stage; + const struct panfrost_compile_inputs *inputs; + nir_shader *nir; + struct pan_shader_info *info; + gl_shader_stage stage; - /* Number of samples for a keyed blend shader. Depends on is_blend */ - unsigned blend_sample_iterations; + /* Number of samples for a keyed blend shader. Depends on is_blend */ + unsigned blend_sample_iterations; - /* Index to precolour to r0 for an input blend colour */ - unsigned blend_input; + /* Index to precolour to r0 for an input blend colour */ + unsigned blend_input; - /* Index to precolour to r2 for a dual-source blend colour */ - unsigned blend_src1; + /* Index to precolour to r2 for a dual-source blend colour */ + unsigned blend_src1; - /* Count of spills and fills for shaderdb */ - unsigned spills; - unsigned fills; + /* Count of spills and fills for shaderdb */ + unsigned spills; + unsigned fills; - /* Current NIR function */ - nir_function *func; + /* Current NIR function */ + nir_function *func; - /* Allocated compiler temporary counter */ - unsigned temp_alloc; + /* Allocated compiler temporary counter */ + unsigned temp_alloc; - /* Unordered list of midgard_blocks */ - int block_count; - struct list_head blocks; + /* Unordered list of midgard_blocks */ + int block_count; + struct list_head blocks; - /* TODO merge with block_count? */ - unsigned block_source_count; + /* TODO merge with block_count? 
*/ + unsigned block_source_count; - /* List of midgard_instructions emitted for the current block */ - midgard_block *current_block; + /* List of midgard_instructions emitted for the current block */ + midgard_block *current_block; - /* If there is a preset after block, use this, otherwise emit_block will create one if NULL */ - midgard_block *after_block; + /* If there is a preset after block, use this, otherwise emit_block will + * create one if NULL */ + midgard_block *after_block; - /* The current "depth" of the loop, for disambiguating breaks/continues - * when using nested loops */ - int current_loop_depth; + /* The current "depth" of the loop, for disambiguating breaks/continues + * when using nested loops */ + int current_loop_depth; - /* Total number of loops for shader-db */ - unsigned loop_count; + /* Total number of loops for shader-db */ + unsigned loop_count; - /* Constants which have been loaded, for later inlining */ - struct hash_table_u64 *ssa_constants; + /* Constants which have been loaded, for later inlining */ + struct hash_table_u64 *ssa_constants; - int temp_count; - int max_hash; + int temp_count; + int max_hash; - /* Set of NIR indices that were already emitted as outmods */ - BITSET_WORD *already_emitted; + /* Set of NIR indices that were already emitted as outmods */ + BITSET_WORD *already_emitted; - /* Count of instructions emitted from NIR overall, across all blocks */ - int instruction_count; + /* Count of instructions emitted from NIR overall, across all blocks */ + int instruction_count; - unsigned quadword_count; + unsigned quadword_count; - /* Bitmask of valid metadata */ - unsigned metadata; + /* Bitmask of valid metadata */ + unsigned metadata; - /* Model-specific quirk set */ - uint32_t quirks; + /* Model-specific quirk set */ + uint32_t quirks; - /* Writeout instructions for each render target */ - midgard_instruction *writeout_branch[MIDGARD_NUM_RTS][MIDGARD_MAX_SAMPLE_ITER]; + /* Writeout instructions for each render target */ + midgard_instruction + *writeout_branch[MIDGARD_NUM_RTS][MIDGARD_MAX_SAMPLE_ITER]; - struct hash_table_u64 *sysval_to_id; + struct hash_table_u64 *sysval_to_id; - /* Mask of UBOs that need to be uploaded */ - uint32_t ubo_mask; + /* Mask of UBOs that need to be uploaded */ + uint32_t ubo_mask; } compiler_context; /* Per-block live_in/live_out */ @@ -321,17 +325,18 @@ typedef struct compiler_context { static inline midgard_instruction * mir_upload_ins(struct compiler_context *ctx, struct midgard_instruction ins) { - midgard_instruction *heap = ralloc(ctx, struct midgard_instruction); - memcpy(heap, &ins, sizeof(ins)); - return heap; + midgard_instruction *heap = ralloc(ctx, struct midgard_instruction); + memcpy(heap, &ins, sizeof(ins)); + return heap; } static inline midgard_instruction * -emit_mir_instruction(struct compiler_context *ctx, struct midgard_instruction ins) +emit_mir_instruction(struct compiler_context *ctx, + struct midgard_instruction ins) { - midgard_instruction *u = mir_upload_ins(ctx, ins); - list_addtail(&u->link, &ctx->current_block->base.instructions); - return u; + midgard_instruction *u = mir_upload_ins(ctx, ins); + list_addtail(&u->link, &ctx->current_block->base.instructions); + return u; } static inline struct midgard_instruction * @@ -339,165 +344,174 @@ mir_insert_instruction_before(struct compiler_context *ctx, struct midgard_instruction *tag, struct midgard_instruction ins) { - struct midgard_instruction *u = mir_upload_ins(ctx, ins); - list_addtail(&u->link, &tag->link); - return u; + struct 
midgard_instruction *u = mir_upload_ins(ctx, ins); + list_addtail(&u->link, &tag->link); + return u; } static inline void mir_remove_instruction(struct midgard_instruction *ins) { - list_del(&ins->link); + list_del(&ins->link); } -static inline midgard_instruction* +static inline midgard_instruction * mir_prev_op(struct midgard_instruction *ins) { - return list_last_entry(&(ins->link), midgard_instruction, link); + return list_last_entry(&(ins->link), midgard_instruction, link); } -static inline midgard_instruction* +static inline midgard_instruction * mir_next_op(struct midgard_instruction *ins) { - return list_first_entry(&(ins->link), midgard_instruction, link); + return list_first_entry(&(ins->link), midgard_instruction, link); } -#define mir_foreach_block(ctx, v) \ - list_for_each_entry(pan_block, v, &ctx->blocks, link) +#define mir_foreach_block(ctx, v) \ + list_for_each_entry(pan_block, v, &ctx->blocks, link) -#define mir_foreach_block_from(ctx, from, v) \ - list_for_each_entry_from(pan_block, v, &from->base, &ctx->blocks, link) +#define mir_foreach_block_from(ctx, from, v) \ + list_for_each_entry_from(pan_block, v, &from->base, &ctx->blocks, link) -#define mir_foreach_instr_in_block(block, v) \ - list_for_each_entry(struct midgard_instruction, v, &block->base.instructions, link) -#define mir_foreach_instr_in_block_rev(block, v) \ - list_for_each_entry_rev(struct midgard_instruction, v, &block->base.instructions, link) +#define mir_foreach_instr_in_block(block, v) \ + list_for_each_entry(struct midgard_instruction, v, \ + &block->base.instructions, link) +#define mir_foreach_instr_in_block_rev(block, v) \ + list_for_each_entry_rev(struct midgard_instruction, v, \ + &block->base.instructions, link) -#define mir_foreach_instr_in_block_safe(block, v) \ - list_for_each_entry_safe(struct midgard_instruction, v, &block->base.instructions, link) +#define mir_foreach_instr_in_block_safe(block, v) \ + list_for_each_entry_safe(struct midgard_instruction, v, \ + &block->base.instructions, link) -#define mir_foreach_instr_in_block_safe_rev(block, v) \ - list_for_each_entry_safe_rev(struct midgard_instruction, v, &block->base.instructions, link) +#define mir_foreach_instr_in_block_safe_rev(block, v) \ + list_for_each_entry_safe_rev(struct midgard_instruction, v, \ + &block->base.instructions, link) -#define mir_foreach_instr_in_block_from(block, v, from) \ - list_for_each_entry_from(struct midgard_instruction, v, from, &block->base.instructions, link) +#define mir_foreach_instr_in_block_from(block, v, from) \ + list_for_each_entry_from(struct midgard_instruction, v, from, \ + &block->base.instructions, link) -#define mir_foreach_instr_in_block_from_rev(block, v, from) \ - list_for_each_entry_from_rev(struct midgard_instruction, v, from, &block->base.instructions, link) +#define mir_foreach_instr_in_block_from_rev(block, v, from) \ + list_for_each_entry_from_rev(struct midgard_instruction, v, from, \ + &block->base.instructions, link) -#define mir_foreach_bundle_in_block(block, v) \ - util_dynarray_foreach(&block->bundles, midgard_bundle, v) +#define mir_foreach_bundle_in_block(block, v) \ + util_dynarray_foreach(&block->bundles, midgard_bundle, v) -#define mir_foreach_bundle_in_block_rev(block, v) \ - util_dynarray_foreach_reverse(&block->bundles, midgard_bundle, v) +#define mir_foreach_bundle_in_block_rev(block, v) \ + util_dynarray_foreach_reverse(&block->bundles, midgard_bundle, v) -#define mir_foreach_instr_in_block_scheduled_rev(block, v) \ - midgard_instruction* v; \ - signed i = 0; \ - 
mir_foreach_bundle_in_block_rev(block, _bundle) \ - for (i = (_bundle->instruction_count - 1), v = _bundle->instructions[i]; \ - i >= 0; \ - --i, v = (i >= 0) ? _bundle->instructions[i] : NULL) \ +#define mir_foreach_instr_in_block_scheduled_rev(block, v) \ + midgard_instruction *v; \ + signed i = 0; \ + mir_foreach_bundle_in_block_rev(block, _bundle) \ + for (i = (_bundle->instruction_count - 1), v = _bundle->instructions[i]; \ + i >= 0; --i, v = (i >= 0) ? _bundle->instructions[i] : NULL) -#define mir_foreach_instr_global(ctx, v) \ - mir_foreach_block(ctx, v_block) \ - mir_foreach_instr_in_block(((midgard_block *) v_block), v) +#define mir_foreach_instr_global(ctx, v) \ + mir_foreach_block(ctx, v_block) \ + mir_foreach_instr_in_block(((midgard_block *)v_block), v) -#define mir_foreach_instr_global_safe(ctx, v) \ - mir_foreach_block(ctx, v_block) \ - mir_foreach_instr_in_block_safe(((midgard_block *) v_block), v) +#define mir_foreach_instr_global_safe(ctx, v) \ + mir_foreach_block(ctx, v_block) \ + mir_foreach_instr_in_block_safe(((midgard_block *)v_block), v) /* Based on set_foreach, expanded with automatic type casts */ -#define mir_foreach_predecessor(blk, v) \ - struct set_entry *_entry_##v; \ - struct midgard_block *v; \ - for (_entry_##v = _mesa_set_next_entry(blk->base.predecessors, NULL), \ - v = (struct midgard_block *) (_entry_##v ? _entry_##v->key : NULL); \ - _entry_##v != NULL; \ - _entry_##v = _mesa_set_next_entry(blk->base.predecessors, _entry_##v), \ - v = (struct midgard_block *) (_entry_##v ? _entry_##v->key : NULL)) +#define mir_foreach_predecessor(blk, v) \ + struct set_entry *_entry_##v; \ + struct midgard_block *v; \ + for (_entry_##v = _mesa_set_next_entry(blk->base.predecessors, NULL), \ + v = (struct midgard_block *)(_entry_##v ? _entry_##v->key : NULL); \ + _entry_##v != NULL; \ + _entry_##v = _mesa_set_next_entry(blk->base.predecessors, _entry_##v), \ + v = (struct midgard_block *)(_entry_##v ? 
_entry_##v->key : NULL)) -#define mir_foreach_src(ins, v) \ - for (unsigned v = 0; v < ARRAY_SIZE(ins->src); ++v) +#define mir_foreach_src(ins, v) \ + for (unsigned v = 0; v < ARRAY_SIZE(ins->src); ++v) static inline midgard_instruction * mir_last_in_block(struct midgard_block *block) { - return list_last_entry(&block->base.instructions, struct midgard_instruction, link); + return list_last_entry(&block->base.instructions, struct midgard_instruction, + link); } static inline midgard_block * mir_get_block(compiler_context *ctx, int idx) { - struct list_head *lst = &ctx->blocks; + struct list_head *lst = &ctx->blocks; - while ((idx--) + 1) - lst = lst->next; + while ((idx--) + 1) + lst = lst->next; - return (struct midgard_block *) lst; + return (struct midgard_block *)lst; } static inline bool mir_is_alu_bundle(midgard_bundle *bundle) { - return IS_ALU(bundle->tag); + return IS_ALU(bundle->tag); } static inline unsigned make_compiler_temp(compiler_context *ctx) { - return (ctx->func->impl->ssa_alloc + ctx->temp_alloc++) << 1; + return (ctx->func->impl->ssa_alloc + ctx->temp_alloc++) << 1; } static inline unsigned make_compiler_temp_reg(compiler_context *ctx) { - return ((ctx->func->impl->reg_alloc + ctx->temp_alloc++) << 1) | PAN_IS_REG; + return ((ctx->func->impl->reg_alloc + ctx->temp_alloc++) << 1) | PAN_IS_REG; } static inline unsigned nir_ssa_index(nir_ssa_def *ssa) { - return (ssa->index << 1) | 0; + return (ssa->index << 1) | 0; } static inline unsigned nir_src_index(compiler_context *ctx, nir_src *src) { - if (src->is_ssa) - return nir_ssa_index(src->ssa); - else { - assert(!src->reg.indirect); - return (src->reg.reg->index << 1) | PAN_IS_REG; - } + if (src->is_ssa) + return nir_ssa_index(src->ssa); + else { + assert(!src->reg.indirect); + return (src->reg.reg->index << 1) | PAN_IS_REG; + } } static inline unsigned nir_dest_index(nir_dest *dst) { - if (dst->is_ssa) - return (dst->ssa.index << 1) | 0; - else { - assert(!dst->reg.indirect); - return (dst->reg.reg->index << 1) | PAN_IS_REG; - } + if (dst->is_ssa) + return (dst->ssa.index << 1) | 0; + else { + assert(!dst->reg.indirect); + return (dst->reg.reg->index << 1) | PAN_IS_REG; + } } - - /* MIR manipulation */ void mir_rewrite_index(compiler_context *ctx, unsigned old, unsigned new); void mir_rewrite_index_src(compiler_context *ctx, unsigned old, unsigned new); void mir_rewrite_index_dst(compiler_context *ctx, unsigned old, unsigned new); -void mir_rewrite_index_dst_single(midgard_instruction *ins, unsigned old, unsigned new); -void mir_rewrite_index_src_single(midgard_instruction *ins, unsigned old, unsigned new); -void mir_rewrite_index_src_swizzle(compiler_context *ctx, unsigned old, unsigned new, unsigned *swizzle); +void mir_rewrite_index_dst_single(midgard_instruction *ins, unsigned old, + unsigned new); +void mir_rewrite_index_src_single(midgard_instruction *ins, unsigned old, + unsigned new); +void mir_rewrite_index_src_swizzle(compiler_context *ctx, unsigned old, + unsigned new, unsigned *swizzle); bool mir_single_use(compiler_context *ctx, unsigned value); unsigned mir_use_count(compiler_context *ctx, unsigned value); -uint16_t mir_bytemask_of_read_components(midgard_instruction *ins, unsigned node); -uint16_t mir_bytemask_of_read_components_index(midgard_instruction *ins, unsigned i); +uint16_t mir_bytemask_of_read_components(midgard_instruction *ins, + unsigned node); +uint16_t mir_bytemask_of_read_components_index(midgard_instruction *ins, + unsigned i); uint16_t mir_from_bytemask(uint16_t bytemask, unsigned bits); 
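/* [Editor's sketch, not part of the patch] Illustration of the mask/bytemask
 * relationship the surrounding helpers (mir_bytemask, mir_from_bytemask,
 * mir_bytemask_of_read_components) traffic in: ins->mask keeps one bit per
 * channel ("not packed fancy"), while a bytemask keeps one bit per byte of a
 * 128-bit vector, so an N-bit channel occupies N/8 consecutive bytemask bits.
 * The function below is illustrative only, not the driver's implementation. */
#include <assert.h>
#include <stdint.h>

static uint16_t
example_bytemask_from_channels(uint16_t channel_mask, unsigned bits)
{
   unsigned bytes_per_channel = bits / 8;
   uint16_t bytemask = 0;

   for (unsigned c = 0; c < 16 / bytes_per_channel; ++c) {
      if (channel_mask & (1u << c)) {
         /* Mark every byte the channel occupies */
         uint16_t chan_bytes = (1u << bytes_per_channel) - 1;
         bytemask |= chan_bytes << (c * bytes_per_channel);
      }
   }

   return bytemask;
}

int
main(void)
{
   /* An .xz write of a 32-bit vec4: channels 0 and 2 -> bytes 0-3 and 8-11 */
   assert(example_bytemask_from_channels(0x5, 32) == 0x0F0F);

   /* An .x write at 16-bit: channel 0 -> bytes 0-1 */
   assert(example_bytemask_from_channels(0x1, 16) == 0x0003);

   return 0;
}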
uint16_t mir_bytemask(midgard_instruction *ins); uint16_t mir_round_bytemask_up(uint16_t mask, unsigned bits); @@ -513,19 +527,25 @@ void mir_print_instruction(midgard_instruction *ins); void mir_print_bundle(midgard_bundle *ctx); void mir_print_block(midgard_block *block); void mir_print_shader(compiler_context *ctx); -bool mir_nontrivial_mod(midgard_instruction *ins, unsigned i, bool check_swizzle); +bool mir_nontrivial_mod(midgard_instruction *ins, unsigned i, + bool check_swizzle); bool mir_nontrivial_outmod(midgard_instruction *ins); -midgard_instruction *mir_insert_instruction_before_scheduled(compiler_context *ctx, midgard_block *block, midgard_instruction *tag, midgard_instruction ins); -midgard_instruction *mir_insert_instruction_after_scheduled(compiler_context *ctx, midgard_block *block, midgard_instruction *tag, midgard_instruction ins); +midgard_instruction *mir_insert_instruction_before_scheduled( + compiler_context *ctx, midgard_block *block, midgard_instruction *tag, + midgard_instruction ins); +midgard_instruction *mir_insert_instruction_after_scheduled( + compiler_context *ctx, midgard_block *block, midgard_instruction *tag, + midgard_instruction ins); void mir_flip(midgard_instruction *ins); void mir_compute_temp_count(compiler_context *ctx); -#define LDST_GLOBAL (REGISTER_LDST_ZERO << 2) -#define LDST_SHARED ((REGISTER_LDST_LOCAL_STORAGE_PTR << 2) | COMPONENT_Z) +#define LDST_GLOBAL (REGISTER_LDST_ZERO << 2) +#define LDST_SHARED ((REGISTER_LDST_LOCAL_STORAGE_PTR << 2) | COMPONENT_Z) #define LDST_SCRATCH ((REGISTER_LDST_PC_SP << 2) | COMPONENT_Z) -void mir_set_offset(compiler_context *ctx, midgard_instruction *ins, nir_src *offset, unsigned seg); +void mir_set_offset(compiler_context *ctx, midgard_instruction *ins, + nir_src *offset, unsigned seg); void mir_set_ubo_offset(midgard_instruction *ins, nir_src *src, unsigned bias); /* 'Intrinsic' move for aliasing */ @@ -533,93 +553,91 @@ void mir_set_ubo_offset(midgard_instruction *ins, nir_src *src, unsigned bias); static inline midgard_instruction v_mov(unsigned src, unsigned dest) { - midgard_instruction ins = { - .type = TAG_ALU_4, - .mask = 0xF, - .src = { ~0, src, ~0, ~0 }, - .src_types = { 0, nir_type_uint32 }, - .swizzle = SWIZZLE_IDENTITY, - .dest = dest, - .dest_type = nir_type_uint32, - .op = midgard_alu_op_imov, - .outmod = midgard_outmod_keeplo, - }; + midgard_instruction ins = { + .type = TAG_ALU_4, + .mask = 0xF, + .src = {~0, src, ~0, ~0}, + .src_types = {0, nir_type_uint32}, + .swizzle = SWIZZLE_IDENTITY, + .dest = dest, + .dest_type = nir_type_uint32, + .op = midgard_alu_op_imov, + .outmod = midgard_outmod_keeplo, + }; - return ins; + return ins; } /* Broad types of register classes so we can handle special * registers */ -#define REG_CLASS_WORK 0 -#define REG_CLASS_LDST 1 -#define REG_CLASS_TEXR 3 -#define REG_CLASS_TEXW 4 +#define REG_CLASS_WORK 0 +#define REG_CLASS_LDST 1 +#define REG_CLASS_TEXR 3 +#define REG_CLASS_TEXW 4 /* Like a move, but to thread local storage! 
*/ static inline midgard_instruction -v_load_store_scratch( - unsigned srcdest, - unsigned index, - bool is_store, - unsigned mask) +v_load_store_scratch(unsigned srcdest, unsigned index, bool is_store, + unsigned mask) { - /* We index by 32-bit vec4s */ - unsigned byte = (index * 4 * 4); + /* We index by 32-bit vec4s */ + unsigned byte = (index * 4 * 4); - midgard_instruction ins = { - .type = TAG_LOAD_STORE_4, - .mask = mask, - .dest_type = nir_type_uint32, - .dest = ~0, - .src = { ~0, ~0, ~0, ~0 }, - .swizzle = SWIZZLE_IDENTITY_4, - .op = is_store ? midgard_op_st_128 : midgard_op_ld_128, - .load_store = { - /* For register spilling - to thread local storage */ - .arg_reg = REGISTER_LDST_LOCAL_STORAGE_PTR, - .arg_comp = COMPONENT_X, - .bitsize_toggle = true, - .index_format = midgard_index_address_u32, - .index_reg = REGISTER_LDST_ZERO, - }, + midgard_instruction ins = { + .type = TAG_LOAD_STORE_4, + .mask = mask, + .dest_type = nir_type_uint32, + .dest = ~0, + .src = {~0, ~0, ~0, ~0}, + .swizzle = SWIZZLE_IDENTITY_4, + .op = is_store ? midgard_op_st_128 : midgard_op_ld_128, + .load_store = + { + /* For register spilling - to thread local storage */ + .arg_reg = REGISTER_LDST_LOCAL_STORAGE_PTR, + .arg_comp = COMPONENT_X, + .bitsize_toggle = true, + .index_format = midgard_index_address_u32, + .index_reg = REGISTER_LDST_ZERO, + }, - /* If we spill an unspill, RA goes into an infinite loop */ - .no_spill = (1 << REG_CLASS_WORK), - }; + /* If we spill an unspill, RA goes into an infinite loop */ + .no_spill = (1 << REG_CLASS_WORK), + }; - ins.constants.u32[0] = byte; + ins.constants.u32[0] = byte; - if (is_store) { - ins.src[0] = srcdest; - ins.src_types[0] = nir_type_uint32; + if (is_store) { + ins.src[0] = srcdest; + ins.src_types[0] = nir_type_uint32; - /* Ensure we are tightly swizzled so liveness analysis is - * correct */ + /* Ensure we are tightly swizzled so liveness analysis is + * correct */ - for (unsigned i = 0; i < 4; ++i) { - if (!(mask & (1 << i))) - ins.swizzle[0][i] = COMPONENT_X; - } - } else - ins.dest = srcdest; + for (unsigned i = 0; i < 4; ++i) { + if (!(mask & (1 << i))) + ins.swizzle[0][i] = COMPONENT_X; + } + } else + ins.dest = srcdest; - return ins; + return ins; } static inline bool mir_has_arg(midgard_instruction *ins, unsigned arg) { - if (!ins) - return false; + if (!ins) + return false; - mir_foreach_src(ins, i) { - if (ins->src[i] == arg) - return true; - } + mir_foreach_src(ins, i) { + if (ins->src[i] == arg) + return true; + } - return false; + return false; } /* Scheduling */ @@ -629,19 +647,19 @@ void midgard_schedule_program(compiler_context *ctx); void mir_ra(compiler_context *ctx); void mir_squeeze_index(compiler_context *ctx); void mir_lower_special_reads(compiler_context *ctx); -void mir_liveness_ins_update(uint16_t *live, midgard_instruction *ins, unsigned max); +void mir_liveness_ins_update(uint16_t *live, midgard_instruction *ins, + unsigned max); void mir_compute_liveness(compiler_context *ctx); void mir_invalidate_liveness(compiler_context *ctx); -bool mir_is_live_after(compiler_context *ctx, midgard_block *block, midgard_instruction *start, int src); +bool mir_is_live_after(compiler_context *ctx, midgard_block *block, + midgard_instruction *start, int src); void mir_create_pipeline_registers(compiler_context *ctx); void midgard_promote_uniforms(compiler_context *ctx); -void -midgard_emit_derivatives(compiler_context *ctx, nir_alu_instr *instr); +void midgard_emit_derivatives(compiler_context *ctx, nir_alu_instr *instr); -void 
-midgard_lower_derivatives(compiler_context *ctx, midgard_block *block); +void midgard_lower_derivatives(compiler_context *ctx, midgard_block *block); bool mir_op_computes_derivatives(gl_shader_stage stage, unsigned op); @@ -650,25 +668,26 @@ void mir_analyze_helper_requirements(compiler_context *ctx); /* Final emission */ -void emit_binary_bundle( - compiler_context *ctx, - midgard_block *block, - midgard_bundle *bundle, - struct util_dynarray *emission, - int next_tag); +void emit_binary_bundle(compiler_context *ctx, midgard_block *block, + midgard_bundle *bundle, struct util_dynarray *emission, + int next_tag); bool nir_fuse_io_16(nir_shader *shader); bool midgard_nir_lod_errata(nir_shader *shader); -unsigned midgard_get_first_tag_from_block(compiler_context *ctx, unsigned block_idx); +unsigned midgard_get_first_tag_from_block(compiler_context *ctx, + unsigned block_idx); /* Optimizations */ bool midgard_opt_copy_prop(compiler_context *ctx, midgard_block *block); -bool midgard_opt_combine_projection(compiler_context *ctx, midgard_block *block); -bool midgard_opt_varying_projection(compiler_context *ctx, midgard_block *block); +bool midgard_opt_combine_projection(compiler_context *ctx, + midgard_block *block); +bool midgard_opt_varying_projection(compiler_context *ctx, + midgard_block *block); bool midgard_opt_dead_code_eliminate(compiler_context *ctx); -bool midgard_opt_dead_move_eliminate(compiler_context *ctx, midgard_block *block); +bool midgard_opt_dead_move_eliminate(compiler_context *ctx, + midgard_block *block); #endif diff --git a/src/panfrost/midgard/disassemble.c b/src/panfrost/midgard/disassemble.c index 490834d0c5c..4a2cab60d92 100644 --- a/src/panfrost/midgard/disassemble.c +++ b/src/panfrost/midgard/disassemble.c @@ -25,319 +25,317 @@ * THE SOFTWARE. */ -#include -#include -#include -#include -#include -#include -#include -#include "midgard.h" -#include "midgard_ops.h" -#include "midgard_quirks.h" #include "disassemble.h" -#include "helpers.h" +#include +#include +#include +#include +#include +#include +#include #include "util/bitscan.h" #include "util/half_float.h" #include "util/u_math.h" +#include "helpers.h" +#include "midgard.h" +#include "midgard_ops.h" +#include "midgard_quirks.h" -#define DEFINE_CASE(define, str) case define: { fprintf(fp, str); break; } +#define DEFINE_CASE(define, str) \ + case define: { \ + fprintf(fp, str); \ + break; \ + } /* These are not mapped to hardware values, they just represent the possible * implicit arg modifiers that some midgard opcodes have, which can be decoded * from the opcodes via midgard_{alu,ldst,tex}_special_arg_mod() */ typedef enum { - midgard_arg_mod_none = 0, - midgard_arg_mod_inv, - midgard_arg_mod_x2, + midgard_arg_mod_none = 0, + midgard_arg_mod_inv, + midgard_arg_mod_x2, } midgard_special_arg_mod; typedef struct { - unsigned *midg_tags; + unsigned *midg_tags; - /* For static analysis to ensure all registers are written at least once before - * use along the source code path (TODO: does this break done for complex CF?) - */ + /* For static analysis to ensure all registers are written at least once + * before use along the source code path (TODO: does this break done for + * complex CF?) 
+ */ - uint16_t midg_ever_written; + uint16_t midg_ever_written; } disassemble_context; /* Transform an expanded writemask (duplicated 8-bit format) into its condensed * form (one bit per component) */ static inline unsigned -condense_writemask(unsigned expanded_mask, - unsigned bits_per_component) +condense_writemask(unsigned expanded_mask, unsigned bits_per_component) { - if (bits_per_component == 8) { - /* Duplicate every bit to go from 8 to 16-channel wrmask */ - unsigned omask = 0; + if (bits_per_component == 8) { + /* Duplicate every bit to go from 8 to 16-channel wrmask */ + unsigned omask = 0; - for (unsigned i = 0; i < 8; ++i) { - if (expanded_mask & (1 << i)) - omask |= (3 << (2 * i)); - } + for (unsigned i = 0; i < 8; ++i) { + if (expanded_mask & (1 << i)) + omask |= (3 << (2 * i)); + } - return omask; - } + return omask; + } - unsigned slots_per_component = bits_per_component / 16; - unsigned max_comp = (16 * 8) / bits_per_component; - unsigned condensed_mask = 0; + unsigned slots_per_component = bits_per_component / 16; + unsigned max_comp = (16 * 8) / bits_per_component; + unsigned condensed_mask = 0; - for (unsigned i = 0; i < max_comp; i++) { - if (expanded_mask & (1 << (i * slots_per_component))) - condensed_mask |= (1 << i); - } + for (unsigned i = 0; i < max_comp; i++) { + if (expanded_mask & (1 << (i * slots_per_component))) + condensed_mask |= (1 << i); + } - return condensed_mask; + return condensed_mask; } static bool print_alu_opcode(FILE *fp, midgard_alu_op op) { - if (alu_opcode_props[op].name) - fprintf(fp, "%s", alu_opcode_props[op].name); - else - fprintf(fp, "alu_op_%02X", op); + if (alu_opcode_props[op].name) + fprintf(fp, "%s", alu_opcode_props[op].name); + else + fprintf(fp, "alu_op_%02X", op); - /* For constant analysis */ - return midgard_is_integer_op(op); + /* For constant analysis */ + return midgard_is_integer_op(op); } static void print_ld_st_opcode(FILE *fp, midgard_load_store_op op) { - if (load_store_opcode_props[op].name) - fprintf(fp, "%s", load_store_opcode_props[op].name); - else - fprintf(fp, "ldst_op_%02X", op); + if (load_store_opcode_props[op].name) + fprintf(fp, "%s", load_store_opcode_props[op].name); + else + fprintf(fp, "ldst_op_%02X", op); } static void -validate_sampler_type(enum mali_texture_op op, enum mali_sampler_type sampler_type) +validate_sampler_type(enum mali_texture_op op, + enum mali_sampler_type sampler_type) { - if (op == midgard_tex_op_mov || op == midgard_tex_op_barrier) - assert(sampler_type == 0); - else - assert(sampler_type > 0); + if (op == midgard_tex_op_mov || op == midgard_tex_op_barrier) + assert(sampler_type == 0); + else + assert(sampler_type > 0); } static void validate_expand_mode(midgard_src_expand_mode expand_mode, midgard_reg_mode reg_mode) { - switch (expand_mode) { - case midgard_src_passthrough: - break; + switch (expand_mode) { + case midgard_src_passthrough: + break; - case midgard_src_rep_low: - assert(reg_mode == midgard_reg_mode_8 || - reg_mode == midgard_reg_mode_16); - break; + case midgard_src_rep_low: + assert(reg_mode == midgard_reg_mode_8 || reg_mode == midgard_reg_mode_16); + break; - case midgard_src_rep_high: - assert(reg_mode == midgard_reg_mode_8 || - reg_mode == midgard_reg_mode_16); - break; + case midgard_src_rep_high: + assert(reg_mode == midgard_reg_mode_8 || reg_mode == midgard_reg_mode_16); + break; - case midgard_src_swap: - assert(reg_mode == midgard_reg_mode_8 || - reg_mode == midgard_reg_mode_16); - break; + case midgard_src_swap: + assert(reg_mode == midgard_reg_mode_8 || 
reg_mode == midgard_reg_mode_16); + break; - case midgard_src_expand_low: - assert(reg_mode != midgard_reg_mode_8); - break; + case midgard_src_expand_low: + assert(reg_mode != midgard_reg_mode_8); + break; - case midgard_src_expand_high: - assert(reg_mode != midgard_reg_mode_8); - break; + case midgard_src_expand_high: + assert(reg_mode != midgard_reg_mode_8); + break; - case midgard_src_expand_low_swap: - assert(reg_mode == midgard_reg_mode_16); - break; + case midgard_src_expand_low_swap: + assert(reg_mode == midgard_reg_mode_16); + break; - case midgard_src_expand_high_swap: - assert(reg_mode == midgard_reg_mode_16); - break; + case midgard_src_expand_high_swap: + assert(reg_mode == midgard_reg_mode_16); + break; - default: - unreachable("Invalid expand mode"); - break; - } + default: + unreachable("Invalid expand mode"); + break; + } } static void print_alu_reg(disassemble_context *ctx, FILE *fp, unsigned reg, bool is_write) { - unsigned uniform_reg = 23 - reg; - bool is_uniform = false; + unsigned uniform_reg = 23 - reg; + bool is_uniform = false; - /* For r8-r15, it could be a work or uniform. We distinguish based on - * the fact work registers are ALWAYS written before use, but uniform - * registers are NEVER written before use. */ + /* For r8-r15, it could be a work or uniform. We distinguish based on + * the fact work registers are ALWAYS written before use, but uniform + * registers are NEVER written before use. */ - if ((reg >= 8 && reg < 16) && !(ctx->midg_ever_written & (1 << reg))) - is_uniform = true; + if ((reg >= 8 && reg < 16) && !(ctx->midg_ever_written & (1 << reg))) + is_uniform = true; - /* r16-r23 are always uniform */ + /* r16-r23 are always uniform */ - if (reg >= 16 && reg <= 23) - is_uniform = true; + if (reg >= 16 && reg <= 23) + is_uniform = true; - if (reg == REGISTER_UNUSED || reg == REGISTER_UNUSED + 1) - fprintf(fp, "TMP%u", reg - REGISTER_UNUSED); - else if (reg == REGISTER_TEXTURE_BASE || reg == REGISTER_TEXTURE_BASE + 1) - fprintf(fp, "%s%u", is_write ? "AT" : "TA", reg - REGISTER_TEXTURE_BASE); - else if (reg == REGISTER_LDST_BASE || reg == REGISTER_LDST_BASE + 1) - fprintf(fp, "AL%u", reg - REGISTER_LDST_BASE); - else if (is_uniform) - fprintf(fp, "U%u", uniform_reg); - else if (reg == 31 && !is_write) - fprintf(fp, "PC_SP"); - else - fprintf(fp, "R%u", reg); + if (reg == REGISTER_UNUSED || reg == REGISTER_UNUSED + 1) + fprintf(fp, "TMP%u", reg - REGISTER_UNUSED); + else if (reg == REGISTER_TEXTURE_BASE || reg == REGISTER_TEXTURE_BASE + 1) + fprintf(fp, "%s%u", is_write ? 
"AT" : "TA", reg - REGISTER_TEXTURE_BASE); + else if (reg == REGISTER_LDST_BASE || reg == REGISTER_LDST_BASE + 1) + fprintf(fp, "AL%u", reg - REGISTER_LDST_BASE); + else if (is_uniform) + fprintf(fp, "U%u", uniform_reg); + else if (reg == 31 && !is_write) + fprintf(fp, "PC_SP"); + else + fprintf(fp, "R%u", reg); } static void print_ldst_write_reg(FILE *fp, unsigned reg) { - switch (reg) { - case 26: - case 27: - fprintf(fp, "AL%u", reg - REGISTER_LDST_BASE); - break; - case 28: - case 29: - fprintf(fp, "AT%u", reg - REGISTER_TEXTURE_BASE); - break; - case 31: - fprintf(fp, "PC_SP"); - break; - default: - fprintf(fp, "R%d", reg); - break; - } + switch (reg) { + case 26: + case 27: + fprintf(fp, "AL%u", reg - REGISTER_LDST_BASE); + break; + case 28: + case 29: + fprintf(fp, "AT%u", reg - REGISTER_TEXTURE_BASE); + break; + case 31: + fprintf(fp, "PC_SP"); + break; + default: + fprintf(fp, "R%d", reg); + break; + } } static void print_ldst_read_reg(FILE *fp, unsigned reg) { - switch (reg) { - case 0: - case 1: - fprintf(fp, "AL%u", reg); - break; - case 2: - fprintf(fp, "PC_SP"); - break; - case 3: - fprintf(fp, "LOCAL_STORAGE_PTR"); - break; - case 4: - fprintf(fp, "LOCAL_THREAD_ID"); - break; - case 5: - fprintf(fp, "GROUP_ID"); - break; - case 6: - fprintf(fp, "GLOBAL_THREAD_ID"); - break; - case 7: - fprintf(fp, "0"); - break; - default: - unreachable("Invalid load/store register read"); - } + switch (reg) { + case 0: + case 1: + fprintf(fp, "AL%u", reg); + break; + case 2: + fprintf(fp, "PC_SP"); + break; + case 3: + fprintf(fp, "LOCAL_STORAGE_PTR"); + break; + case 4: + fprintf(fp, "LOCAL_THREAD_ID"); + break; + case 5: + fprintf(fp, "GROUP_ID"); + break; + case 6: + fprintf(fp, "GLOBAL_THREAD_ID"); + break; + case 7: + fprintf(fp, "0"); + break; + default: + unreachable("Invalid load/store register read"); + } } static void print_tex_reg(FILE *fp, unsigned reg, bool is_write) { - char *str = is_write ? "TA" : "AT"; - int select = reg & 1; + char *str = is_write ? 
"TA" : "AT"; + int select = reg & 1; - switch (reg) { - case 0: - case 1: - fprintf(fp, "R%d", select); - break; - case 26: - case 27: - fprintf(fp, "AL%d", select); - break; - case 28: - case 29: - fprintf(fp, "%s%d", str, select); - break; - default: - unreachable("Invalid texture register"); - } + switch (reg) { + case 0: + case 1: + fprintf(fp, "R%d", select); + break; + case 26: + case 27: + fprintf(fp, "AL%d", select); + break; + case 28: + case 29: + fprintf(fp, "%s%d", str, select); + break; + default: + unreachable("Invalid texture register"); + } } - static char *srcmod_names_int[4] = { - ".sext", - ".zext", - ".replicate", - ".lshift", + ".sext", + ".zext", + ".replicate", + ".lshift", }; static char *argmod_names[3] = { - "", - ".inv", - ".x2", + "", + ".inv", + ".x2", }; -static char *index_format_names[4] = { - "", - ".u64", - ".u32", - ".s32" -}; +static char *index_format_names[4] = {"", ".u64", ".u32", ".s32"}; static void print_alu_outmod(FILE *fp, unsigned outmod, bool is_int, bool half) { - if (is_int && !half) { - assert(outmod == midgard_outmod_keeplo); - return; - } + if (is_int && !half) { + assert(outmod == midgard_outmod_keeplo); + return; + } - if (!is_int && half) - fprintf(fp, ".shrink"); + if (!is_int && half) + fprintf(fp, ".shrink"); - mir_print_outmod(fp, outmod, is_int); + mir_print_outmod(fp, outmod, is_int); } /* arg == 0 (dest), arg == 1 (src1), arg == 2 (src2) */ static midgard_special_arg_mod -midgard_alu_special_arg_mod(midgard_alu_op op, unsigned arg) { - midgard_special_arg_mod mod = midgard_arg_mod_none; +midgard_alu_special_arg_mod(midgard_alu_op op, unsigned arg) +{ + midgard_special_arg_mod mod = midgard_arg_mod_none; - switch (op) { - case midgard_alu_op_ishladd: - case midgard_alu_op_ishlsub: - if (arg == 1) mod = midgard_arg_mod_x2; - break; + switch (op) { + case midgard_alu_op_ishladd: + case midgard_alu_op_ishlsub: + if (arg == 1) + mod = midgard_arg_mod_x2; + break; - default: - break; - } + default: + break; + } - return mod; + return mod; } static void print_quad_word(FILE *fp, uint32_t *words, unsigned tabs) { - unsigned i; + unsigned i; - for (i = 0; i < 4; i++) - fprintf(fp, "0x%08X%s ", words[i], i == 3 ? "" : ","); + for (i = 0; i < 4; i++) + fprintf(fp, "0x%08X%s ", words[i], i == 3 ? 
"" : ","); - fprintf(fp, "\n"); + fprintf(fp, "\n"); } static const char components[16] = "xyzwefghijklmnop"; @@ -345,348 +343,346 @@ static const char components[16] = "xyzwefghijklmnop"; static int bits_for_mode(midgard_reg_mode mode) { - switch (mode) { - case midgard_reg_mode_8: - return 8; - case midgard_reg_mode_16: - return 16; - case midgard_reg_mode_32: - return 32; - case midgard_reg_mode_64: - return 64; - default: - unreachable("Invalid reg mode"); - return 0; - } + switch (mode) { + case midgard_reg_mode_8: + return 8; + case midgard_reg_mode_16: + return 16; + case midgard_reg_mode_32: + return 32; + case midgard_reg_mode_64: + return 64; + default: + unreachable("Invalid reg mode"); + return 0; + } } static int bits_for_mode_halved(midgard_reg_mode mode, bool half) { - unsigned bits = bits_for_mode(mode); + unsigned bits = bits_for_mode(mode); - if (half) - bits >>= 1; + if (half) + bits >>= 1; - return bits; + return bits; } static void -print_vec_selectors_64(FILE *fp, unsigned swizzle, - midgard_reg_mode reg_mode, +print_vec_selectors_64(FILE *fp, unsigned swizzle, midgard_reg_mode reg_mode, midgard_src_expand_mode expand_mode, unsigned selector_offset, uint8_t mask) { - bool expands = INPUT_EXPANDS(expand_mode); + bool expands = INPUT_EXPANDS(expand_mode); - unsigned comp_skip = expands ? 1 : 2; - unsigned mask_bit = 0; - for (unsigned i = selector_offset; i < 4; i += comp_skip, mask_bit += 4) { - if (!(mask & (1 << mask_bit))) continue; + unsigned comp_skip = expands ? 1 : 2; + unsigned mask_bit = 0; + for (unsigned i = selector_offset; i < 4; i += comp_skip, mask_bit += 4) { + if (!(mask & (1 << mask_bit))) + continue; - unsigned a = (swizzle >> (i * 2)) & 3; + unsigned a = (swizzle >> (i * 2)) & 3; - if (INPUT_EXPANDS(expand_mode)) { - if (expand_mode == midgard_src_expand_high) - a += 2; + if (INPUT_EXPANDS(expand_mode)) { + if (expand_mode == midgard_src_expand_high) + a += 2; - fprintf(fp, "%c", components[a / 2]); - continue; - } + fprintf(fp, "%c", components[a / 2]); + continue; + } - unsigned b = (swizzle >> ((i+1) * 2)) & 3; + unsigned b = (swizzle >> ((i + 1) * 2)) & 3; - /* Normally we're adjacent, but if there's an issue, - * don't make it ambiguous */ + /* Normally we're adjacent, but if there's an issue, + * don't make it ambiguous */ - if (b == a + 1) - fprintf(fp, "%c", a >> 1 ? 'Y' : 'X'); - else - fprintf(fp, "[%c%c]", components[a], components[b]); - } + if (b == a + 1) + fprintf(fp, "%c", a >> 1 ? 'Y' : 'X'); + else + fprintf(fp, "[%c%c]", components[a], components[b]); + } } static void -print_vec_selectors(FILE *fp, unsigned swizzle, - midgard_reg_mode reg_mode, +print_vec_selectors(FILE *fp, unsigned swizzle, midgard_reg_mode reg_mode, unsigned selector_offset, uint8_t mask, unsigned *mask_offset) { - assert(reg_mode != midgard_reg_mode_64); + assert(reg_mode != midgard_reg_mode_64); - unsigned mask_skip = MAX2(bits_for_mode(reg_mode) / 16, 1); + unsigned mask_skip = MAX2(bits_for_mode(reg_mode) / 16, 1); - bool is_vec16 = reg_mode == midgard_reg_mode_8; + bool is_vec16 = reg_mode == midgard_reg_mode_8; - for (unsigned i = 0; i < 4; i++, *mask_offset += mask_skip) { - if (!(mask & (1 << *mask_offset))) continue; + for (unsigned i = 0; i < 4; i++, *mask_offset += mask_skip) { + if (!(mask & (1 << *mask_offset))) + continue; - unsigned c = (swizzle >> (i * 2)) & 3; + unsigned c = (swizzle >> (i * 2)) & 3; - /* Vec16 has two components per swizzle selector. */ - if (is_vec16) - c *= 2; + /* Vec16 has two components per swizzle selector. 
*/ + if (is_vec16) + c *= 2; - c += selector_offset; + c += selector_offset; - fprintf(fp, "%c", components[c]); - if (is_vec16) - fprintf(fp, "%c", components[c+1]); - } + fprintf(fp, "%c", components[c]); + if (is_vec16) + fprintf(fp, "%c", components[c + 1]); + } } static void -print_vec_swizzle(FILE *fp, unsigned swizzle, - midgard_src_expand_mode expand, - midgard_reg_mode mode, - uint8_t mask) +print_vec_swizzle(FILE *fp, unsigned swizzle, midgard_src_expand_mode expand, + midgard_reg_mode mode, uint8_t mask) { - unsigned bits = bits_for_mode_halved(mode, INPUT_EXPANDS(expand)); + unsigned bits = bits_for_mode_halved(mode, INPUT_EXPANDS(expand)); - /* Swizzle selectors are divided in two halves that are always - * mirrored, the only difference is the starting component offset. - * The number represents an offset into the components[] array. */ - unsigned first_half = 0; - unsigned second_half = (128 / bits) / 2; /* only used for 8 and 16-bit */ + /* Swizzle selectors are divided in two halves that are always + * mirrored, the only difference is the starting component offset. + * The number represents an offset into the components[] array. */ + unsigned first_half = 0; + unsigned second_half = (128 / bits) / 2; /* only used for 8 and 16-bit */ - switch (expand) { - case midgard_src_passthrough: - if (swizzle == 0xE4) return; /* identity swizzle */ - break; + switch (expand) { + case midgard_src_passthrough: + if (swizzle == 0xE4) + return; /* identity swizzle */ + break; - case midgard_src_expand_low: - second_half /= 2; - break; + case midgard_src_expand_low: + second_half /= 2; + break; - case midgard_src_expand_high: - first_half = second_half; - second_half += second_half / 2; - break; + case midgard_src_expand_high: + first_half = second_half; + second_half += second_half / 2; + break; - /* The rest of the cases are only used for 8 and 16-bit */ + /* The rest of the cases are only used for 8 and 16-bit */ - case midgard_src_rep_low: - second_half = 0; - break; + case midgard_src_rep_low: + second_half = 0; + break; - case midgard_src_rep_high: - first_half = second_half; - break; + case midgard_src_rep_high: + first_half = second_half; + break; - case midgard_src_swap: - first_half = second_half; - second_half = 0; - break; + case midgard_src_swap: + first_half = second_half; + second_half = 0; + break; - case midgard_src_expand_low_swap: - first_half = second_half / 2; - second_half = 0; - break; + case midgard_src_expand_low_swap: + first_half = second_half / 2; + second_half = 0; + break; - case midgard_src_expand_high_swap: - first_half = second_half + second_half / 2; - break; + case midgard_src_expand_high_swap: + first_half = second_half + second_half / 2; + break; - default: - unreachable("Invalid expand mode"); - break; - } + default: + unreachable("Invalid expand mode"); + break; + } - fprintf(fp, "."); + fprintf(fp, "."); - /* Vec2 are weird so we use a separate function to simplify things. */ - if (mode == midgard_reg_mode_64) { - print_vec_selectors_64(fp, swizzle, mode, expand, first_half, mask); - return; - } + /* Vec2 are weird so we use a separate function to simplify things. 
*/ + if (mode == midgard_reg_mode_64) { + print_vec_selectors_64(fp, swizzle, mode, expand, first_half, mask); + return; + } - unsigned mask_offs = 0; - print_vec_selectors(fp, swizzle, mode, first_half, mask, &mask_offs); - if (mode == midgard_reg_mode_8 || mode == midgard_reg_mode_16) - print_vec_selectors(fp, swizzle, mode, second_half, mask, &mask_offs); + unsigned mask_offs = 0; + print_vec_selectors(fp, swizzle, mode, first_half, mask, &mask_offs); + if (mode == midgard_reg_mode_8 || mode == midgard_reg_mode_16) + print_vec_selectors(fp, swizzle, mode, second_half, mask, &mask_offs); } static void print_scalar_constant(FILE *fp, unsigned src_binary, - const midgard_constants *consts, - midgard_scalar_alu *alu) + const midgard_constants *consts, midgard_scalar_alu *alu) { - midgard_scalar_alu_src *src = (midgard_scalar_alu_src *)&src_binary; - assert(consts != NULL); + midgard_scalar_alu_src *src = (midgard_scalar_alu_src *)&src_binary; + assert(consts != NULL); - fprintf(fp, "#"); - mir_print_constant_component(fp, consts, src->component, - src->full ? - midgard_reg_mode_32 : midgard_reg_mode_16, - false, src->mod, alu->op); + fprintf(fp, "#"); + mir_print_constant_component( + fp, consts, src->component, + src->full ? midgard_reg_mode_32 : midgard_reg_mode_16, false, src->mod, + alu->op); } static void print_vector_constants(FILE *fp, unsigned src_binary, - const midgard_constants *consts, - midgard_vector_alu *alu) + const midgard_constants *consts, midgard_vector_alu *alu) { - midgard_vector_alu_src *src = (midgard_vector_alu_src *)&src_binary; - bool expands = INPUT_EXPANDS(src->expand_mode); - unsigned bits = bits_for_mode_halved(alu->reg_mode, expands); - unsigned max_comp = (sizeof(*consts) * 8) / bits; - unsigned comp_mask, num_comp = 0; + midgard_vector_alu_src *src = (midgard_vector_alu_src *)&src_binary; + bool expands = INPUT_EXPANDS(src->expand_mode); + unsigned bits = bits_for_mode_halved(alu->reg_mode, expands); + unsigned max_comp = (sizeof(*consts) * 8) / bits; + unsigned comp_mask, num_comp = 0; - assert(consts); - assert(max_comp <= 16); + assert(consts); + assert(max_comp <= 16); - comp_mask = effective_writemask(alu->op, condense_writemask(alu->mask, bits)); - num_comp = util_bitcount(comp_mask); + comp_mask = + effective_writemask(alu->op, condense_writemask(alu->mask, bits)); + num_comp = util_bitcount(comp_mask); - if (num_comp > 1) - fprintf(fp, "<"); - else - fprintf(fp, "#"); + if (num_comp > 1) + fprintf(fp, "<"); + else + fprintf(fp, "#"); - bool first = true; + bool first = true; - for (unsigned i = 0; i < max_comp; ++i) { - if (!(comp_mask & (1 << i))) continue; + for (unsigned i = 0; i < max_comp; ++i) { + if (!(comp_mask & (1 << i))) + continue; - unsigned c = (src->swizzle >> (i * 2)) & 3; + unsigned c = (src->swizzle >> (i * 2)) & 3; - if (bits == 16 && !expands) { - bool upper = i >= 4; + if (bits == 16 && !expands) { + bool upper = i >= 4; - switch (src->expand_mode) { - case midgard_src_passthrough: - c += upper * 4; - break; - case midgard_src_rep_low: - break; - case midgard_src_rep_high: - c += 4; - break; - case midgard_src_swap: - c += !upper * 4; - break; - default: - unreachable("invalid expand mode"); - break; - } - } else if (bits == 32 && !expands) { - /* Implicitly ok */ - } else if (bits == 64 && !expands) { - /* Implicitly ok */ - } else if (bits == 8 && !expands) { - bool upper = i >= 8; + switch (src->expand_mode) { + case midgard_src_passthrough: + c += upper * 4; + break; + case midgard_src_rep_low: + break; + case 
midgard_src_rep_high: + c += 4; + break; + case midgard_src_swap: + c += !upper * 4; + break; + default: + unreachable("invalid expand mode"); + break; + } + } else if (bits == 32 && !expands) { + /* Implicitly ok */ + } else if (bits == 64 && !expands) { + /* Implicitly ok */ + } else if (bits == 8 && !expands) { + bool upper = i >= 8; - unsigned index = (i >> 1) & 3; - unsigned base = (src->swizzle >> (index * 2)) & 3; - c = base * 2; + unsigned index = (i >> 1) & 3; + unsigned base = (src->swizzle >> (index * 2)) & 3; + c = base * 2; - switch (src->expand_mode) { - case midgard_src_passthrough: - c += upper * 8; - break; - case midgard_src_rep_low: - break; - case midgard_src_rep_high: - c += 8; - break; - case midgard_src_swap: - c += !upper * 8; - break; - default: - unreachable("invalid expand mode"); - break; - } + switch (src->expand_mode) { + case midgard_src_passthrough: + c += upper * 8; + break; + case midgard_src_rep_low: + break; + case midgard_src_rep_high: + c += 8; + break; + case midgard_src_swap: + c += !upper * 8; + break; + default: + unreachable("invalid expand mode"); + break; + } - /* We work on twos, actually */ - if (i & 1) - c++; - } + /* We work on twos, actually */ + if (i & 1) + c++; + } - if (first) - first = false; - else - fprintf(fp, ", "); + if (first) + first = false; + else + fprintf(fp, ", "); - mir_print_constant_component(fp, consts, c, alu->reg_mode, - expands, src->mod, alu->op); - } + mir_print_constant_component(fp, consts, c, alu->reg_mode, expands, + src->mod, alu->op); + } - if (num_comp > 1) - fprintf(fp, ">"); + if (num_comp > 1) + fprintf(fp, ">"); } static void print_srcmod(FILE *fp, bool is_int, bool expands, unsigned mod, bool scalar) { - /* Modifiers change meaning depending on the op's context */ + /* Modifiers change meaning depending on the op's context */ - if (is_int) { - if (expands) - fprintf(fp, "%s", srcmod_names_int[mod]); - } else { - if (mod & MIDGARD_FLOAT_MOD_ABS) - fprintf(fp, ".abs"); - if (mod & MIDGARD_FLOAT_MOD_NEG) - fprintf(fp, ".neg"); - if (expands) - fprintf(fp, ".widen"); - } + if (is_int) { + if (expands) + fprintf(fp, "%s", srcmod_names_int[mod]); + } else { + if (mod & MIDGARD_FLOAT_MOD_ABS) + fprintf(fp, ".abs"); + if (mod & MIDGARD_FLOAT_MOD_NEG) + fprintf(fp, ".neg"); + if (expands) + fprintf(fp, ".widen"); + } } static void print_vector_src(disassemble_context *ctx, FILE *fp, unsigned src_binary, midgard_reg_mode mode, unsigned reg, - midgard_shrink_mode shrink_mode, - uint8_t src_mask, bool is_int, + midgard_shrink_mode shrink_mode, uint8_t src_mask, bool is_int, midgard_special_arg_mod arg_mod) { - midgard_vector_alu_src *src = (midgard_vector_alu_src *)&src_binary; + midgard_vector_alu_src *src = (midgard_vector_alu_src *)&src_binary; - validate_expand_mode(src->expand_mode, mode); + validate_expand_mode(src->expand_mode, mode); - print_alu_reg(ctx, fp, reg, false); + print_alu_reg(ctx, fp, reg, false); - print_vec_swizzle(fp, src->swizzle, src->expand_mode, mode, src_mask); + print_vec_swizzle(fp, src->swizzle, src->expand_mode, mode, src_mask); - fprintf(fp, "%s", argmod_names[arg_mod]); + fprintf(fp, "%s", argmod_names[arg_mod]); - print_srcmod(fp, is_int, INPUT_EXPANDS(src->expand_mode), src->mod, false); + print_srcmod(fp, is_int, INPUT_EXPANDS(src->expand_mode), src->mod, false); } static uint16_t decode_vector_imm(unsigned src2_reg, unsigned imm) { - uint16_t ret; - ret = src2_reg << 11; - ret |= (imm & 0x7) << 8; - ret |= (imm >> 3) & 0xFF; - return ret; + uint16_t ret; + ret = src2_reg << 11; 
+ ret |= (imm & 0x7) << 8; + ret |= (imm >> 3) & 0xFF; + return ret; } static void print_immediate(FILE *fp, uint16_t imm, bool is_instruction_int) { - if (is_instruction_int) - fprintf(fp, "#%u", imm); - else - fprintf(fp, "#%g", _mesa_half_to_float(imm)); + if (is_instruction_int) + fprintf(fp, "#%u", imm); + else + fprintf(fp, "#%g", _mesa_half_to_float(imm)); } static void update_dest(disassemble_context *ctx, unsigned reg) { - /* We should record writes as marking this as a work register. Store - * the max register in work_count; we'll add one at the end */ + /* We should record writes as marking this as a work register. Store + * the max register in work_count; we'll add one at the end */ - if (reg < 16) - ctx->midg_ever_written |= (1 << reg); + if (reg < 16) + ctx->midg_ever_written |= (1 << reg); } static void print_dest(disassemble_context *ctx, FILE *fp, unsigned reg) { - update_dest(ctx, reg); - print_alu_reg(ctx, fp, reg, true); + update_dest(ctx, reg); + print_alu_reg(ctx, fp, reg, true); } /* For 16-bit+ masks, we read off from the 8-bit mask field. For 16-bit (vec8), @@ -697,84 +693,86 @@ print_dest(disassemble_context *ctx, FILE *fp, unsigned reg) * the mask to make it obvious what happened */ static void -print_alu_mask(FILE *fp, uint8_t mask, unsigned bits, midgard_shrink_mode shrink_mode) +print_alu_mask(FILE *fp, uint8_t mask, unsigned bits, + midgard_shrink_mode shrink_mode) { - /* Skip 'complete' masks */ + /* Skip 'complete' masks */ - if (shrink_mode == midgard_shrink_mode_none && mask == 0xFF) - return; + if (shrink_mode == midgard_shrink_mode_none && mask == 0xFF) + return; - fprintf(fp, "."); + fprintf(fp, "."); - unsigned skip = MAX2(bits / 16, 1); - bool tripped = false; + unsigned skip = MAX2(bits / 16, 1); + bool tripped = false; - /* To apply an upper destination shrink_mode, we "shift" the alphabet. - * E.g. with an upper shrink_mode on 32-bit, instead of xyzw, print efgh. - * For upper 16-bit, instead of xyzwefgh, print ijklmnop */ + /* To apply an upper destination shrink_mode, we "shift" the alphabet. + * E.g. with an upper shrink_mode on 32-bit, instead of xyzw, print efgh. + * For upper 16-bit, instead of xyzwefgh, print ijklmnop */ - const char *alphabet = components; + const char *alphabet = components; - if (shrink_mode == midgard_shrink_mode_upper) { - assert(bits != 8); - alphabet += (128 / bits); - } + if (shrink_mode == midgard_shrink_mode_upper) { + assert(bits != 8); + alphabet += (128 / bits); + } - for (unsigned i = 0; i < 8; i += skip) { - bool a = (mask & (1 << i)) != 0; + for (unsigned i = 0; i < 8; i += skip) { + bool a = (mask & (1 << i)) != 0; - for (unsigned j = 1; j < skip; ++j) { - bool dupe = (mask & (1 << (i + j))) != 0; - tripped |= (dupe != a); - } + for (unsigned j = 1; j < skip; ++j) { + bool dupe = (mask & (1 << (i + j))) != 0; + tripped |= (dupe != a); + } - if (a) { - /* TODO: handle shrinking from 16-bit */ - unsigned comp_idx = bits == 8 ? i * 2 : i; - char c = alphabet[comp_idx / skip]; + if (a) { + /* TODO: handle shrinking from 16-bit */ + unsigned comp_idx = bits == 8 ? 
i * 2 : i; + char c = alphabet[comp_idx / skip]; - fprintf(fp, "%c", c); - if (bits == 8) - fprintf(fp, "%c", alphabet[comp_idx+1]); - } - } + fprintf(fp, "%c", c); + if (bits == 8) + fprintf(fp, "%c", alphabet[comp_idx + 1]); + } + } - if (tripped) - fprintf(fp, " /* %X */", mask); + if (tripped) + fprintf(fp, " /* %X */", mask); } /* TODO: 16-bit mode */ static void -print_ldst_mask(FILE *fp, unsigned mask, unsigned swizzle) { - fprintf(fp, "."); +print_ldst_mask(FILE *fp, unsigned mask, unsigned swizzle) +{ + fprintf(fp, "."); - for (unsigned i = 0; i < 4; ++i) { - bool write = (mask & (1 << i)) != 0; - unsigned c = (swizzle >> (i * 2)) & 3; - /* We can't omit the swizzle here since many ldst ops have a - * combined swizzle/writemask, and it would be ambiguous to not - * print the masked-out components. */ - fprintf(fp, "%c", write ? components[c] : '~'); - } + for (unsigned i = 0; i < 4; ++i) { + bool write = (mask & (1 << i)) != 0; + unsigned c = (swizzle >> (i * 2)) & 3; + /* We can't omit the swizzle here since many ldst ops have a + * combined swizzle/writemask, and it would be ambiguous to not + * print the masked-out components. */ + fprintf(fp, "%c", write ? components[c] : '~'); + } } static void print_tex_mask(FILE *fp, unsigned mask, bool upper) { - if (mask == 0xF) { - if (upper) - fprintf(fp, "'"); + if (mask == 0xF) { + if (upper) + fprintf(fp, "'"); - return; - } + return; + } - fprintf(fp, "."); + fprintf(fp, "."); - for (unsigned i = 0; i < 4; ++i) { - bool a = (mask & (1 << i)) != 0; - if (a) - fprintf(fp, "%c", components[i + (upper ? 4 : 0)]); - } + for (unsigned i = 0; i < 4; ++i) { + bool a = (mask & (1 << i)) != 0; + if (a) + fprintf(fp, "%c", components[i + (upper ? 4 : 0)]); + } } static void @@ -782,115 +780,120 @@ print_vector_field(disassemble_context *ctx, FILE *fp, const char *name, uint16_t *words, uint16_t reg_word, const midgard_constants *consts, unsigned tabs, bool verbose) { - midgard_reg_info *reg_info = (midgard_reg_info *)®_word; - midgard_vector_alu *alu_field = (midgard_vector_alu *) words; - midgard_reg_mode mode = alu_field->reg_mode; - midgard_alu_op op = alu_field->op; - unsigned shrink_mode = alu_field->shrink_mode; - bool is_int = midgard_is_integer_op(op); - bool is_int_out = midgard_is_integer_out_op(op); + midgard_reg_info *reg_info = (midgard_reg_info *)®_word; + midgard_vector_alu *alu_field = (midgard_vector_alu *)words; + midgard_reg_mode mode = alu_field->reg_mode; + midgard_alu_op op = alu_field->op; + unsigned shrink_mode = alu_field->shrink_mode; + bool is_int = midgard_is_integer_op(op); + bool is_int_out = midgard_is_integer_out_op(op); - if (verbose) - fprintf(fp, "%s.", name); + if (verbose) + fprintf(fp, "%s.", name); - bool is_instruction_int = print_alu_opcode(fp, alu_field->op); + bool is_instruction_int = print_alu_opcode(fp, alu_field->op); - /* Print lane width */ - fprintf(fp, ".%c%d", is_int_out ? 'i' : 'f', bits_for_mode(mode)); + /* Print lane width */ + fprintf(fp, ".%c%d", is_int_out ? 
'i' : 'f', bits_for_mode(mode)); - fprintf(fp, " "); + fprintf(fp, " "); - /* Mask denoting status of 8-lanes */ - uint8_t mask = alu_field->mask; + /* Mask denoting status of 8-lanes */ + uint8_t mask = alu_field->mask; - /* First, print the destination */ - print_dest(ctx, fp, reg_info->out_reg); + /* First, print the destination */ + print_dest(ctx, fp, reg_info->out_reg); - if (shrink_mode != midgard_shrink_mode_none) { - bool shrinkable = (mode != midgard_reg_mode_8); - bool known = shrink_mode != 0x3; /* Unused value */ + if (shrink_mode != midgard_shrink_mode_none) { + bool shrinkable = (mode != midgard_reg_mode_8); + bool known = shrink_mode != 0x3; /* Unused value */ - if (!(shrinkable && known)) - fprintf(fp, "/* do%u */ ", shrink_mode); - } + if (!(shrinkable && known)) + fprintf(fp, "/* do%u */ ", shrink_mode); + } - /* Instructions like fdot4 do *not* replicate, ensure the - * mask is of only a single component */ + /* Instructions like fdot4 do *not* replicate, ensure the + * mask is of only a single component */ - unsigned rep = GET_CHANNEL_COUNT(alu_opcode_props[op].props); + unsigned rep = GET_CHANNEL_COUNT(alu_opcode_props[op].props); - if (rep) { - unsigned comp_mask = condense_writemask(mask, bits_for_mode(mode)); - unsigned num_comp = util_bitcount(comp_mask); - if (num_comp != 1) - fprintf(fp, "/* err too many components */"); - } - print_alu_mask(fp, mask, bits_for_mode(mode), shrink_mode); + if (rep) { + unsigned comp_mask = condense_writemask(mask, bits_for_mode(mode)); + unsigned num_comp = util_bitcount(comp_mask); + if (num_comp != 1) + fprintf(fp, "/* err too many components */"); + } + print_alu_mask(fp, mask, bits_for_mode(mode), shrink_mode); - /* Print output modifiers */ + /* Print output modifiers */ - print_alu_outmod(fp, alu_field->outmod, is_int_out, shrink_mode != midgard_shrink_mode_none); + print_alu_outmod(fp, alu_field->outmod, is_int_out, + shrink_mode != midgard_shrink_mode_none); - /* Mask out unused components based on the writemask, but don't mask out - * components that are used for interlane instructions like fdot3. */ - uint8_t src_mask = - rep ? expand_writemask(mask_of(rep), util_logbase2(128 / bits_for_mode(mode))) : mask; + /* Mask out unused components based on the writemask, but don't mask out + * components that are used for interlane instructions like fdot3. */ + uint8_t src_mask = + rep ? 
expand_writemask(mask_of(rep), + util_logbase2(128 / bits_for_mode(mode))) + : mask; - fprintf(fp, ", "); + fprintf(fp, ", "); - if (reg_info->src1_reg == REGISTER_CONSTANT) - print_vector_constants(fp, alu_field->src1, consts, alu_field); - else { - midgard_special_arg_mod argmod = midgard_alu_special_arg_mod(op, 1); - print_vector_src(ctx, fp, alu_field->src1, mode, reg_info->src1_reg, - shrink_mode, src_mask, is_int, argmod); - } + if (reg_info->src1_reg == REGISTER_CONSTANT) + print_vector_constants(fp, alu_field->src1, consts, alu_field); + else { + midgard_special_arg_mod argmod = midgard_alu_special_arg_mod(op, 1); + print_vector_src(ctx, fp, alu_field->src1, mode, reg_info->src1_reg, + shrink_mode, src_mask, is_int, argmod); + } - fprintf(fp, ", "); + fprintf(fp, ", "); - if (reg_info->src2_imm) { - uint16_t imm = decode_vector_imm(reg_info->src2_reg, alu_field->src2 >> 2); - print_immediate(fp, imm, is_instruction_int); - } else if (reg_info->src2_reg == REGISTER_CONSTANT) { - print_vector_constants(fp, alu_field->src2, consts, alu_field); - } else { - midgard_special_arg_mod argmod = midgard_alu_special_arg_mod(op, 2); - print_vector_src(ctx, fp, alu_field->src2, mode, reg_info->src2_reg, - shrink_mode, src_mask, is_int, argmod); - } + if (reg_info->src2_imm) { + uint16_t imm = + decode_vector_imm(reg_info->src2_reg, alu_field->src2 >> 2); + print_immediate(fp, imm, is_instruction_int); + } else if (reg_info->src2_reg == REGISTER_CONSTANT) { + print_vector_constants(fp, alu_field->src2, consts, alu_field); + } else { + midgard_special_arg_mod argmod = midgard_alu_special_arg_mod(op, 2); + print_vector_src(ctx, fp, alu_field->src2, mode, reg_info->src2_reg, + shrink_mode, src_mask, is_int, argmod); + } - fprintf(fp, "\n"); + fprintf(fp, "\n"); } static void -print_scalar_src(disassemble_context *ctx, FILE *fp, bool is_int, unsigned src_binary, unsigned reg) +print_scalar_src(disassemble_context *ctx, FILE *fp, bool is_int, + unsigned src_binary, unsigned reg) { - midgard_scalar_alu_src *src = (midgard_scalar_alu_src *)&src_binary; + midgard_scalar_alu_src *src = (midgard_scalar_alu_src *)&src_binary; - print_alu_reg(ctx, fp, reg, false); + print_alu_reg(ctx, fp, reg, false); - unsigned c = src->component; + unsigned c = src->component; - if (src->full) { - assert((c & 1) == 0); - c >>= 1; - } + if (src->full) { + assert((c & 1) == 0); + c >>= 1; + } - fprintf(fp, ".%c", components[c]); + fprintf(fp, ".%c", components[c]); - print_srcmod(fp, is_int, !src->full, src->mod, true); + print_srcmod(fp, is_int, !src->full, src->mod, true); } static uint16_t decode_scalar_imm(unsigned src2_reg, unsigned imm) { - uint16_t ret; - ret = src2_reg << 11; - ret |= (imm & 3) << 9; - ret |= (imm & 4) << 6; - ret |= (imm & 0x38) << 2; - ret |= imm >> 6; - return ret; + uint16_t ret; + ret = src2_reg << 11; + ret |= (imm & 3) << 9; + ret |= (imm & 4) << 6; + ret |= (imm & 0x38) << 2; + ret |= imm >> 6; + return ret; } static void @@ -898,241 +901,245 @@ print_scalar_field(disassemble_context *ctx, FILE *fp, const char *name, uint16_t *words, uint16_t reg_word, const midgard_constants *consts, unsigned tabs, bool verbose) { - midgard_reg_info *reg_info = (midgard_reg_info *)®_word; - midgard_scalar_alu *alu_field = (midgard_scalar_alu *) words; - bool is_int = midgard_is_integer_op(alu_field->op); - bool is_int_out = midgard_is_integer_out_op(alu_field->op); - bool full = alu_field->output_full; + midgard_reg_info *reg_info = (midgard_reg_info *)®_word; + midgard_scalar_alu *alu_field = 
(midgard_scalar_alu *)words; + bool is_int = midgard_is_integer_op(alu_field->op); + bool is_int_out = midgard_is_integer_out_op(alu_field->op); + bool full = alu_field->output_full; - if (alu_field->reserved) - fprintf(fp, "scalar ALU reserved bit set\n"); + if (alu_field->reserved) + fprintf(fp, "scalar ALU reserved bit set\n"); - if (verbose) - fprintf(fp, "%s.", name); + if (verbose) + fprintf(fp, "%s.", name); - bool is_instruction_int = print_alu_opcode(fp, alu_field->op); + bool is_instruction_int = print_alu_opcode(fp, alu_field->op); - /* Print lane width, in this case the lane width is always 32-bit, but - * we print it anyway to make it consistent with the other instructions. */ - fprintf(fp, ".%c32", is_int_out ? 'i' : 'f'); + /* Print lane width, in this case the lane width is always 32-bit, but + * we print it anyway to make it consistent with the other instructions. */ + fprintf(fp, ".%c32", is_int_out ? 'i' : 'f'); - fprintf(fp, " "); + fprintf(fp, " "); - print_dest(ctx, fp, reg_info->out_reg); - unsigned c = alu_field->output_component; + print_dest(ctx, fp, reg_info->out_reg); + unsigned c = alu_field->output_component; - if (full) { - assert((c & 1) == 0); - c >>= 1; - } + if (full) { + assert((c & 1) == 0); + c >>= 1; + } - fprintf(fp, ".%c", components[c]); + fprintf(fp, ".%c", components[c]); - print_alu_outmod(fp, alu_field->outmod, is_int_out, !full); + print_alu_outmod(fp, alu_field->outmod, is_int_out, !full); - fprintf(fp, ", "); + fprintf(fp, ", "); - if (reg_info->src1_reg == REGISTER_CONSTANT) - print_scalar_constant(fp, alu_field->src1, consts, alu_field); - else - print_scalar_src(ctx, fp, is_int, alu_field->src1, reg_info->src1_reg); + if (reg_info->src1_reg == REGISTER_CONSTANT) + print_scalar_constant(fp, alu_field->src1, consts, alu_field); + else + print_scalar_src(ctx, fp, is_int, alu_field->src1, reg_info->src1_reg); - fprintf(fp, ", "); + fprintf(fp, ", "); - if (reg_info->src2_imm) { - uint16_t imm = decode_scalar_imm(reg_info->src2_reg, - alu_field->src2); - print_immediate(fp, imm, is_instruction_int); - } else if (reg_info->src2_reg == REGISTER_CONSTANT) { - print_scalar_constant(fp, alu_field->src2, consts, alu_field); - } else - print_scalar_src(ctx, fp, is_int, alu_field->src2, reg_info->src2_reg); + if (reg_info->src2_imm) { + uint16_t imm = decode_scalar_imm(reg_info->src2_reg, alu_field->src2); + print_immediate(fp, imm, is_instruction_int); + } else if (reg_info->src2_reg == REGISTER_CONSTANT) { + print_scalar_constant(fp, alu_field->src2, consts, alu_field); + } else + print_scalar_src(ctx, fp, is_int, alu_field->src2, reg_info->src2_reg); - fprintf(fp, "\n"); + fprintf(fp, "\n"); } static void print_branch_op(FILE *fp, unsigned op) { - switch (op) { - case midgard_jmp_writeout_op_branch_uncond: - fprintf(fp, "uncond."); - break; + switch (op) { + case midgard_jmp_writeout_op_branch_uncond: + fprintf(fp, "uncond."); + break; - case midgard_jmp_writeout_op_branch_cond: - fprintf(fp, "cond."); - break; + case midgard_jmp_writeout_op_branch_cond: + fprintf(fp, "cond."); + break; - case midgard_jmp_writeout_op_writeout: - fprintf(fp, "write."); - break; + case midgard_jmp_writeout_op_writeout: + fprintf(fp, "write."); + break; - case midgard_jmp_writeout_op_tilebuffer_pending: - fprintf(fp, "tilebuffer."); - break; + case midgard_jmp_writeout_op_tilebuffer_pending: + fprintf(fp, "tilebuffer."); + break; - case midgard_jmp_writeout_op_discard: - fprintf(fp, "discard."); - break; + case midgard_jmp_writeout_op_discard: + fprintf(fp, 
"discard."); + break; - default: - fprintf(fp, "unk%u.", op); - break; - } + default: + fprintf(fp, "unk%u.", op); + break; + } } static void print_branch_cond(FILE *fp, int cond) { - switch (cond) { - case midgard_condition_write0: - fprintf(fp, "write0"); - break; + switch (cond) { + case midgard_condition_write0: + fprintf(fp, "write0"); + break; - case midgard_condition_false: - fprintf(fp, "false"); - break; + case midgard_condition_false: + fprintf(fp, "false"); + break; - case midgard_condition_true: - fprintf(fp, "true"); - break; + case midgard_condition_true: + fprintf(fp, "true"); + break; - case midgard_condition_always: - fprintf(fp, "always"); - break; + case midgard_condition_always: + fprintf(fp, "always"); + break; - default: - fprintf(fp, "unk%X", cond); - break; - } + default: + fprintf(fp, "unk%X", cond); + break; + } } static const char * function_call_mode(enum midgard_call_mode mode) { - switch (mode) { - case midgard_call_mode_default: return ""; - case midgard_call_mode_call: return ".call"; - case midgard_call_mode_return: return ".return"; - default: return ".reserved"; - } + switch (mode) { + case midgard_call_mode_default: + return ""; + case midgard_call_mode_call: + return ".call"; + case midgard_call_mode_return: + return ".return"; + default: + return ".reserved"; + } } static bool -print_compact_branch_writeout_field(disassemble_context *ctx, FILE *fp, uint16_t word) +print_compact_branch_writeout_field(disassemble_context *ctx, FILE *fp, + uint16_t word) { - midgard_jmp_writeout_op op = word & 0x7; + midgard_jmp_writeout_op op = word & 0x7; - switch (op) { - case midgard_jmp_writeout_op_branch_uncond: { - midgard_branch_uncond br_uncond; - memcpy((char *) &br_uncond, (char *) &word, sizeof(br_uncond)); - fprintf(fp, "br.uncond%s ", function_call_mode(br_uncond.call_mode)); + switch (op) { + case midgard_jmp_writeout_op_branch_uncond: { + midgard_branch_uncond br_uncond; + memcpy((char *)&br_uncond, (char *)&word, sizeof(br_uncond)); + fprintf(fp, "br.uncond%s ", function_call_mode(br_uncond.call_mode)); - if (br_uncond.offset >= 0) - fprintf(fp, "+"); + if (br_uncond.offset >= 0) + fprintf(fp, "+"); - fprintf(fp, "%d -> %s", br_uncond.offset, - midgard_tag_props[br_uncond.dest_tag].name); - fprintf(fp, "\n"); + fprintf(fp, "%d -> %s", br_uncond.offset, + midgard_tag_props[br_uncond.dest_tag].name); + fprintf(fp, "\n"); - return br_uncond.offset >= 0; - } + return br_uncond.offset >= 0; + } - case midgard_jmp_writeout_op_branch_cond: - case midgard_jmp_writeout_op_writeout: - case midgard_jmp_writeout_op_discard: - default: { - midgard_branch_cond br_cond; - memcpy((char *) &br_cond, (char *) &word, sizeof(br_cond)); + case midgard_jmp_writeout_op_branch_cond: + case midgard_jmp_writeout_op_writeout: + case midgard_jmp_writeout_op_discard: + default: { + midgard_branch_cond br_cond; + memcpy((char *)&br_cond, (char *)&word, sizeof(br_cond)); - fprintf(fp, "br."); + fprintf(fp, "br."); - print_branch_op(fp, br_cond.op); - print_branch_cond(fp, br_cond.cond); + print_branch_op(fp, br_cond.op); + print_branch_cond(fp, br_cond.cond); - fprintf(fp, " "); + fprintf(fp, " "); - if (br_cond.offset >= 0) - fprintf(fp, "+"); + if (br_cond.offset >= 0) + fprintf(fp, "+"); - fprintf(fp, "%d -> %s", br_cond.offset, - midgard_tag_props[br_cond.dest_tag].name); - fprintf(fp, "\n"); + fprintf(fp, "%d -> %s", br_cond.offset, + midgard_tag_props[br_cond.dest_tag].name); + fprintf(fp, "\n"); - return br_cond.offset >= 0; - } - } + return br_cond.offset >= 0; + } + } - return 
false; + return false; } static bool -print_extended_branch_writeout_field(disassemble_context *ctx, FILE *fp, uint8_t *words, - unsigned next) +print_extended_branch_writeout_field(disassemble_context *ctx, FILE *fp, + uint8_t *words, unsigned next) { - midgard_branch_extended br; - memcpy((char *) &br, (char *) words, sizeof(br)); + midgard_branch_extended br; + memcpy((char *)&br, (char *)words, sizeof(br)); - fprintf(fp, "brx%s.", function_call_mode(br.call_mode)); + fprintf(fp, "brx%s.", function_call_mode(br.call_mode)); - print_branch_op(fp, br.op); + print_branch_op(fp, br.op); - /* Condition codes are a LUT in the general case, but simply repeated 8 times for single-channel conditions.. Check this. */ + /* Condition codes are a LUT in the general case, but simply repeated 8 times + * for single-channel conditions.. Check this. */ - bool single_channel = true; + bool single_channel = true; - for (unsigned i = 0; i < 16; i += 2) { - single_channel &= (((br.cond >> i) & 0x3) == (br.cond & 0x3)); - } + for (unsigned i = 0; i < 16; i += 2) { + single_channel &= (((br.cond >> i) & 0x3) == (br.cond & 0x3)); + } - if (single_channel) - print_branch_cond(fp, br.cond & 0x3); - else - fprintf(fp, "lut%X", br.cond); + if (single_channel) + print_branch_cond(fp, br.cond & 0x3); + else + fprintf(fp, "lut%X", br.cond); - fprintf(fp, " "); + fprintf(fp, " "); - if (br.offset >= 0) - fprintf(fp, "+"); + if (br.offset >= 0) + fprintf(fp, "+"); - fprintf(fp, "%d -> %s\n", br.offset, - midgard_tag_props[br.dest_tag].name); + fprintf(fp, "%d -> %s\n", br.offset, midgard_tag_props[br.dest_tag].name); - unsigned I = next + br.offset * 4; + unsigned I = next + br.offset * 4; - if (ctx->midg_tags[I] && ctx->midg_tags[I] != br.dest_tag) { - fprintf(fp, "\t/* XXX TAG ERROR: jumping to %s but tagged %s \n", - midgard_tag_props[br.dest_tag].name, - midgard_tag_props[ctx->midg_tags[I]].name); - } + if (ctx->midg_tags[I] && ctx->midg_tags[I] != br.dest_tag) { + fprintf(fp, "\t/* XXX TAG ERROR: jumping to %s but tagged %s \n", + midgard_tag_props[br.dest_tag].name, + midgard_tag_props[ctx->midg_tags[I]].name); + } - ctx->midg_tags[I] = br.dest_tag; + ctx->midg_tags[I] = br.dest_tag; - return br.offset >= 0; + return br.offset >= 0; } static unsigned num_alu_fields_enabled(uint32_t control_word) { - unsigned ret = 0; + unsigned ret = 0; - if ((control_word >> 17) & 1) - ret++; + if ((control_word >> 17) & 1) + ret++; - if ((control_word >> 19) & 1) - ret++; + if ((control_word >> 19) & 1) + ret++; - if ((control_word >> 21) & 1) - ret++; + if ((control_word >> 21) & 1) + ret++; - if ((control_word >> 23) & 1) - ret++; + if ((control_word >> 23) & 1) + ret++; - if ((control_word >> 25) & 1) - ret++; + if ((control_word >> 25) & 1) + ret++; - return ret; + return ret; } static bool @@ -1140,101 +1147,106 @@ print_alu_word(disassemble_context *ctx, FILE *fp, uint32_t *words, unsigned num_quad_words, unsigned tabs, unsigned next, bool verbose) { - uint32_t control_word = words[0]; - uint16_t *beginning_ptr = (uint16_t *)(words + 1); - unsigned num_fields = num_alu_fields_enabled(control_word); - uint16_t *word_ptr = beginning_ptr + num_fields; - unsigned num_words = 2 + num_fields; - const midgard_constants *consts = NULL; - bool branch_forward = false; + uint32_t control_word = words[0]; + uint16_t *beginning_ptr = (uint16_t *)(words + 1); + unsigned num_fields = num_alu_fields_enabled(control_word); + uint16_t *word_ptr = beginning_ptr + num_fields; + unsigned num_words = 2 + num_fields; + const midgard_constants 
*consts = NULL; + bool branch_forward = false; - if ((control_word >> 17) & 1) - num_words += 3; + if ((control_word >> 17) & 1) + num_words += 3; - if ((control_word >> 19) & 1) - num_words += 2; + if ((control_word >> 19) & 1) + num_words += 2; - if ((control_word >> 21) & 1) - num_words += 3; + if ((control_word >> 21) & 1) + num_words += 3; - if ((control_word >> 23) & 1) - num_words += 2; + if ((control_word >> 23) & 1) + num_words += 2; - if ((control_word >> 25) & 1) - num_words += 3; + if ((control_word >> 25) & 1) + num_words += 3; - if ((control_word >> 26) & 1) - num_words += 1; + if ((control_word >> 26) & 1) + num_words += 1; - if ((control_word >> 27) & 1) - num_words += 3; + if ((control_word >> 27) & 1) + num_words += 3; - if (num_quad_words > (num_words + 7) / 8) { - assert(num_quad_words == (num_words + 15) / 8); - //Assume that the extra quadword is constants - consts = (midgard_constants *)(words + (4 * num_quad_words - 4)); - } + if (num_quad_words > (num_words + 7) / 8) { + assert(num_quad_words == (num_words + 15) / 8); + // Assume that the extra quadword is constants + consts = (midgard_constants *)(words + (4 * num_quad_words - 4)); + } - if ((control_word >> 16) & 1) - fprintf(fp, "unknown bit 16 enabled\n"); + if ((control_word >> 16) & 1) + fprintf(fp, "unknown bit 16 enabled\n"); - if ((control_word >> 17) & 1) { - print_vector_field(ctx, fp, "vmul", word_ptr, *beginning_ptr, consts, tabs, verbose); - beginning_ptr += 1; - word_ptr += 3; - } + if ((control_word >> 17) & 1) { + print_vector_field(ctx, fp, "vmul", word_ptr, *beginning_ptr, consts, + tabs, verbose); + beginning_ptr += 1; + word_ptr += 3; + } - if ((control_word >> 18) & 1) - fprintf(fp, "unknown bit 18 enabled\n"); + if ((control_word >> 18) & 1) + fprintf(fp, "unknown bit 18 enabled\n"); - if ((control_word >> 19) & 1) { - print_scalar_field(ctx, fp, "sadd", word_ptr, *beginning_ptr, consts, tabs, verbose); - beginning_ptr += 1; - word_ptr += 2; - } + if ((control_word >> 19) & 1) { + print_scalar_field(ctx, fp, "sadd", word_ptr, *beginning_ptr, consts, + tabs, verbose); + beginning_ptr += 1; + word_ptr += 2; + } - if ((control_word >> 20) & 1) - fprintf(fp, "unknown bit 20 enabled\n"); + if ((control_word >> 20) & 1) + fprintf(fp, "unknown bit 20 enabled\n"); - if ((control_word >> 21) & 1) { - print_vector_field(ctx, fp, "vadd", word_ptr, *beginning_ptr, consts, tabs, verbose); - beginning_ptr += 1; - word_ptr += 3; - } + if ((control_word >> 21) & 1) { + print_vector_field(ctx, fp, "vadd", word_ptr, *beginning_ptr, consts, + tabs, verbose); + beginning_ptr += 1; + word_ptr += 3; + } - if ((control_word >> 22) & 1) - fprintf(fp, "unknown bit 22 enabled\n"); + if ((control_word >> 22) & 1) + fprintf(fp, "unknown bit 22 enabled\n"); - if ((control_word >> 23) & 1) { - print_scalar_field(ctx, fp, "smul", word_ptr, *beginning_ptr, consts, tabs, verbose); - beginning_ptr += 1; - word_ptr += 2; - } + if ((control_word >> 23) & 1) { + print_scalar_field(ctx, fp, "smul", word_ptr, *beginning_ptr, consts, + tabs, verbose); + beginning_ptr += 1; + word_ptr += 2; + } - if ((control_word >> 24) & 1) - fprintf(fp, "unknown bit 24 enabled\n"); + if ((control_word >> 24) & 1) + fprintf(fp, "unknown bit 24 enabled\n"); - if ((control_word >> 25) & 1) { - print_vector_field(ctx, fp, "lut", word_ptr, *beginning_ptr, consts, tabs, verbose); - word_ptr += 3; - } + if ((control_word >> 25) & 1) { + print_vector_field(ctx, fp, "lut", word_ptr, *beginning_ptr, consts, tabs, + verbose); + word_ptr += 3; + } - if 
((control_word >> 26) & 1) { - branch_forward |= print_compact_branch_writeout_field(ctx, fp, *word_ptr); - word_ptr += 1; - } + if ((control_word >> 26) & 1) { + branch_forward |= print_compact_branch_writeout_field(ctx, fp, *word_ptr); + word_ptr += 1; + } - if ((control_word >> 27) & 1) { - branch_forward |= print_extended_branch_writeout_field(ctx, fp, (uint8_t *) word_ptr, next); - word_ptr += 3; - } + if ((control_word >> 27) & 1) { + branch_forward |= print_extended_branch_writeout_field( + ctx, fp, (uint8_t *)word_ptr, next); + word_ptr += 3; + } - if (consts) - fprintf(fp, "uconstants 0x%X, 0x%X, 0x%X, 0x%X\n", - consts->u32[0], consts->u32[1], - consts->u32[2], consts->u32[3]); + if (consts) + fprintf(fp, "uconstants 0x%X, 0x%X, 0x%X, 0x%X\n", consts->u32[0], + consts->u32[1], consts->u32[2], consts->u32[3]); - return branch_forward; + return branch_forward; } /* TODO: how can we use this now that we know that these params can't be known @@ -1242,349 +1254,346 @@ print_alu_word(disassemble_context *ctx, FILE *fp, uint32_t *words, UNUSED static void print_varying_parameters(FILE *fp, midgard_load_store_word *word) { - midgard_varying_params p = midgard_unpack_varying_params(*word); + midgard_varying_params p = midgard_unpack_varying_params(*word); - /* If a varying, there are qualifiers */ - if (p.flat_shading) - fprintf(fp, ".flat"); + /* If a varying, there are qualifiers */ + if (p.flat_shading) + fprintf(fp, ".flat"); - if (p.perspective_correction) - fprintf(fp, ".correction"); + if (p.perspective_correction) + fprintf(fp, ".correction"); - if (p.centroid_mapping) - fprintf(fp, ".centroid"); + if (p.centroid_mapping) + fprintf(fp, ".centroid"); - if (p.interpolate_sample) - fprintf(fp, ".sample"); + if (p.interpolate_sample) + fprintf(fp, ".sample"); - switch (p.modifier) { - case midgard_varying_mod_perspective_y: - fprintf(fp, ".perspectivey"); - break; - case midgard_varying_mod_perspective_z: - fprintf(fp, ".perspectivez"); - break; - case midgard_varying_mod_perspective_w: - fprintf(fp, ".perspectivew"); - break; - default: - unreachable("invalid varying modifier"); - break; - } + switch (p.modifier) { + case midgard_varying_mod_perspective_y: + fprintf(fp, ".perspectivey"); + break; + case midgard_varying_mod_perspective_z: + fprintf(fp, ".perspectivez"); + break; + case midgard_varying_mod_perspective_w: + fprintf(fp, ".perspectivew"); + break; + default: + unreachable("invalid varying modifier"); + break; + } } /* Helper to print integer well-formatted, but only when non-zero. 
*/ static void midgard_print_sint(FILE *fp, int n) { - if (n > 0) - fprintf(fp, " + 0x%X", n); - else if (n < 0) - fprintf(fp, " - 0x%X", -n); + if (n > 0) + fprintf(fp, " + 0x%X", n); + else if (n < 0) + fprintf(fp, " - 0x%X", -n); } static void -print_load_store_instr(disassemble_context *ctx, FILE *fp, uint64_t data, bool verbose) +print_load_store_instr(disassemble_context *ctx, FILE *fp, uint64_t data, + bool verbose) { - midgard_load_store_word *word = (midgard_load_store_word *) &data; + midgard_load_store_word *word = (midgard_load_store_word *)&data; - print_ld_st_opcode(fp, word->op); + print_ld_st_opcode(fp, word->op); - if (word->op == midgard_op_trap) { - fprintf(fp, " 0x%X\n", word->signed_offset); - return; - } + if (word->op == midgard_op_trap) { + fprintf(fp, " 0x%X\n", word->signed_offset); + return; + } - /* Print opcode modifiers */ + /* Print opcode modifiers */ - if (OP_USES_ATTRIB(word->op)) { - /* Print non-default attribute tables */ - bool default_secondary = - (word->op == midgard_op_st_vary_32) || - (word->op == midgard_op_st_vary_16) || - (word->op == midgard_op_st_vary_32u) || - (word->op == midgard_op_st_vary_32i) || - (word->op == midgard_op_ld_vary_32) || - (word->op == midgard_op_ld_vary_16) || - (word->op == midgard_op_ld_vary_32u) || - (word->op == midgard_op_ld_vary_32i); + if (OP_USES_ATTRIB(word->op)) { + /* Print non-default attribute tables */ + bool default_secondary = (word->op == midgard_op_st_vary_32) || + (word->op == midgard_op_st_vary_16) || + (word->op == midgard_op_st_vary_32u) || + (word->op == midgard_op_st_vary_32i) || + (word->op == midgard_op_ld_vary_32) || + (word->op == midgard_op_ld_vary_16) || + (word->op == midgard_op_ld_vary_32u) || + (word->op == midgard_op_ld_vary_32i); - bool default_primary = - (word->op == midgard_op_ld_attr_32) || - (word->op == midgard_op_ld_attr_16) || - (word->op == midgard_op_ld_attr_32u) || - (word->op == midgard_op_ld_attr_32i); + bool default_primary = (word->op == midgard_op_ld_attr_32) || + (word->op == midgard_op_ld_attr_16) || + (word->op == midgard_op_ld_attr_32u) || + (word->op == midgard_op_ld_attr_32i); - bool has_default = (default_secondary || default_primary); - bool auto32 = (word->index_format >> 0) & 1; - bool is_secondary = (word->index_format >> 1) & 1; + bool has_default = (default_secondary || default_primary); + bool auto32 = (word->index_format >> 0) & 1; + bool is_secondary = (word->index_format >> 1) & 1; - if (auto32) - fprintf(fp, ".a32"); + if (auto32) + fprintf(fp, ".a32"); - if (has_default && (is_secondary != default_secondary)) - fprintf(fp, ".%s", is_secondary ? "secondary" : "primary"); - } else if (word->op == midgard_op_ld_cubemap_coords || OP_IS_PROJECTION(word->op)) - fprintf(fp, ".%s", word->bitsize_toggle ? "f32" : "f16"); + if (has_default && (is_secondary != default_secondary)) + fprintf(fp, ".%s", is_secondary ? "secondary" : "primary"); + } else if (word->op == midgard_op_ld_cubemap_coords || + OP_IS_PROJECTION(word->op)) + fprintf(fp, ".%s", word->bitsize_toggle ? "f32" : "f16"); - fprintf(fp, " "); + fprintf(fp, " "); - /* src/dest register */ + /* src/dest register */ - if (!OP_IS_STORE(word->op)) { - print_ldst_write_reg(fp, word->reg); + if (!OP_IS_STORE(word->op)) { + print_ldst_write_reg(fp, word->reg); - /* Some opcodes don't have a swizzable src register, and - * instead the swizzle is applied before the result is written - * to the dest reg. For these ops, we combine the writemask - * with the swizzle to display them in the disasm compactly. 
*/ - unsigned swizzle = word->swizzle; - if ((OP_IS_REG2REG_LDST(word->op) && - word->op != midgard_op_lea && - word->op != midgard_op_lea_image) || OP_IS_ATOMIC(word->op)) - swizzle = 0xE4; - print_ldst_mask(fp, word->mask, swizzle); - } else { - uint8_t mask = - (word->mask & 0x1) | - ((word->mask & 0x2) << 1) | - ((word->mask & 0x4) << 2) | - ((word->mask & 0x8) << 3); - mask |= mask << 1; - print_ldst_read_reg(fp, word->reg); - print_vec_swizzle(fp, word->swizzle, midgard_src_passthrough, - midgard_reg_mode_32, mask); - } + /* Some opcodes don't have a swizzable src register, and + * instead the swizzle is applied before the result is written + * to the dest reg. For these ops, we combine the writemask + * with the swizzle to display them in the disasm compactly. */ + unsigned swizzle = word->swizzle; + if ((OP_IS_REG2REG_LDST(word->op) && word->op != midgard_op_lea && + word->op != midgard_op_lea_image) || + OP_IS_ATOMIC(word->op)) + swizzle = 0xE4; + print_ldst_mask(fp, word->mask, swizzle); + } else { + uint8_t mask = (word->mask & 0x1) | ((word->mask & 0x2) << 1) | + ((word->mask & 0x4) << 2) | ((word->mask & 0x8) << 3); + mask |= mask << 1; + print_ldst_read_reg(fp, word->reg); + print_vec_swizzle(fp, word->swizzle, midgard_src_passthrough, + midgard_reg_mode_32, mask); + } - /* ld_ubo args */ - if (OP_IS_UBO_READ(word->op)) { - if (word->signed_offset & 1) { /* buffer index imm */ - unsigned imm = midgard_unpack_ubo_index_imm(*word); - fprintf(fp, ", %u", imm); - } else { /* buffer index from reg */ - fprintf(fp, ", "); - print_ldst_read_reg(fp, word->arg_reg); - fprintf(fp, ".%c", components[word->arg_comp]); - } + /* ld_ubo args */ + if (OP_IS_UBO_READ(word->op)) { + if (word->signed_offset & 1) { /* buffer index imm */ + unsigned imm = midgard_unpack_ubo_index_imm(*word); + fprintf(fp, ", %u", imm); + } else { /* buffer index from reg */ + fprintf(fp, ", "); + print_ldst_read_reg(fp, word->arg_reg); + fprintf(fp, ".%c", components[word->arg_comp]); + } - fprintf(fp, ", "); - print_ldst_read_reg(fp, word->index_reg); - fprintf(fp, ".%c", components[word->index_comp]); - if (word->index_shift) - fprintf(fp, " << %u", word->index_shift); - midgard_print_sint(fp, UNPACK_LDST_UBO_OFS(word->signed_offset)); - } + fprintf(fp, ", "); + print_ldst_read_reg(fp, word->index_reg); + fprintf(fp, ".%c", components[word->index_comp]); + if (word->index_shift) + fprintf(fp, " << %u", word->index_shift); + midgard_print_sint(fp, UNPACK_LDST_UBO_OFS(word->signed_offset)); + } - /* mem addr expression */ - if (OP_HAS_ADDRESS(word->op)) { - fprintf(fp, ", "); - bool first = true; + /* mem addr expression */ + if (OP_HAS_ADDRESS(word->op)) { + fprintf(fp, ", "); + bool first = true; - /* Skip printing zero */ - if (word->arg_reg != 7 || verbose) { - print_ldst_read_reg(fp, word->arg_reg); - fprintf(fp, ".u%d.%c", - word->bitsize_toggle ? 64 : 32, components[word->arg_comp]); - first = false; - } + /* Skip printing zero */ + if (word->arg_reg != 7 || verbose) { + print_ldst_read_reg(fp, word->arg_reg); + fprintf(fp, ".u%d.%c", word->bitsize_toggle ? 
64 : 32, + components[word->arg_comp]); + first = false; + } - if ((word->op < midgard_op_atomic_cmpxchg || - word->op > midgard_op_atomic_cmpxchg64_be) && - word->index_reg != 0x7) { - if (!first) - fprintf(fp, " + "); + if ((word->op < midgard_op_atomic_cmpxchg || + word->op > midgard_op_atomic_cmpxchg64_be) && + word->index_reg != 0x7) { + if (!first) + fprintf(fp, " + "); - print_ldst_read_reg(fp, word->index_reg); - fprintf(fp, "%s.%c", - index_format_names[word->index_format], - components[word->index_comp]); - if (word->index_shift) - fprintf(fp, " << %u", word->index_shift); - } + print_ldst_read_reg(fp, word->index_reg); + fprintf(fp, "%s.%c", index_format_names[word->index_format], + components[word->index_comp]); + if (word->index_shift) + fprintf(fp, " << %u", word->index_shift); + } - midgard_print_sint(fp, word->signed_offset); - } + midgard_print_sint(fp, word->signed_offset); + } - /* src reg for reg2reg ldst opcodes */ - if (OP_IS_REG2REG_LDST(word->op)) { - fprintf(fp, ", "); - print_ldst_read_reg(fp, word->arg_reg); - print_vec_swizzle(fp, word->swizzle, midgard_src_passthrough, - midgard_reg_mode_32, 0xFF); - } + /* src reg for reg2reg ldst opcodes */ + if (OP_IS_REG2REG_LDST(word->op)) { + fprintf(fp, ", "); + print_ldst_read_reg(fp, word->arg_reg); + print_vec_swizzle(fp, word->swizzle, midgard_src_passthrough, + midgard_reg_mode_32, 0xFF); + } - /* atomic ops encode the source arg where the ldst swizzle would be. */ - if (OP_IS_ATOMIC(word->op)) { - unsigned src = (word->swizzle >> 2) & 0x7; - unsigned src_comp = word->swizzle & 0x3; - fprintf(fp, ", "); - print_ldst_read_reg(fp, src); - fprintf(fp, ".%c", components[src_comp]); - } + /* atomic ops encode the source arg where the ldst swizzle would be. */ + if (OP_IS_ATOMIC(word->op)) { + unsigned src = (word->swizzle >> 2) & 0x7; + unsigned src_comp = word->swizzle & 0x3; + fprintf(fp, ", "); + print_ldst_read_reg(fp, src); + fprintf(fp, ".%c", components[src_comp]); + } - /* CMPXCHG encodes the extra comparison arg where the index reg would be. */ - if (word->op >= midgard_op_atomic_cmpxchg && - word->op <= midgard_op_atomic_cmpxchg64_be) { - fprintf(fp, ", "); - print_ldst_read_reg(fp, word->index_reg); - fprintf(fp, ".%c", components[word->index_comp]); - } + /* CMPXCHG encodes the extra comparison arg where the index reg would be. 
*/ + if (word->op >= midgard_op_atomic_cmpxchg && + word->op <= midgard_op_atomic_cmpxchg64_be) { + fprintf(fp, ", "); + print_ldst_read_reg(fp, word->index_reg); + fprintf(fp, ".%c", components[word->index_comp]); + } - /* index reg for attr/vary/images, selector for ld/st_special */ - if (OP_IS_SPECIAL(word->op) || OP_USES_ATTRIB(word->op)) { - fprintf(fp, ", "); - print_ldst_read_reg(fp, word->index_reg); - fprintf(fp, ".%c", components[word->index_comp]); - if (word->index_shift) - fprintf(fp, " << %u", word->index_shift); - midgard_print_sint(fp, UNPACK_LDST_ATTRIB_OFS(word->signed_offset)); - } + /* index reg for attr/vary/images, selector for ld/st_special */ + if (OP_IS_SPECIAL(word->op) || OP_USES_ATTRIB(word->op)) { + fprintf(fp, ", "); + print_ldst_read_reg(fp, word->index_reg); + fprintf(fp, ".%c", components[word->index_comp]); + if (word->index_shift) + fprintf(fp, " << %u", word->index_shift); + midgard_print_sint(fp, UNPACK_LDST_ATTRIB_OFS(word->signed_offset)); + } - /* vertex reg for attrib/varying ops, coord reg for image ops */ - if (OP_USES_ATTRIB(word->op)) { - fprintf(fp, ", "); - print_ldst_read_reg(fp, word->arg_reg); + /* vertex reg for attrib/varying ops, coord reg for image ops */ + if (OP_USES_ATTRIB(word->op)) { + fprintf(fp, ", "); + print_ldst_read_reg(fp, word->arg_reg); - if (OP_IS_IMAGE(word->op)) - fprintf(fp, ".u%d", word->bitsize_toggle ? 64 : 32); + if (OP_IS_IMAGE(word->op)) + fprintf(fp, ".u%d", word->bitsize_toggle ? 64 : 32); - fprintf(fp, ".%c", components[word->arg_comp]); + fprintf(fp, ".%c", components[word->arg_comp]); - if (word->bitsize_toggle && !OP_IS_IMAGE(word->op)) - midgard_print_sint(fp, UNPACK_LDST_VERTEX_OFS(word->signed_offset)); - } + if (word->bitsize_toggle && !OP_IS_IMAGE(word->op)) + midgard_print_sint(fp, UNPACK_LDST_VERTEX_OFS(word->signed_offset)); + } - /* TODO: properly decode format specifier for PACK/UNPACK ops */ - if (OP_IS_PACK_COLOUR(word->op) || OP_IS_UNPACK_COLOUR(word->op)) { - fprintf(fp, ", "); - unsigned format_specifier = (word->signed_offset << 4) | word->index_shift; - fprintf(fp, "0x%X", format_specifier); - } + /* TODO: properly decode format specifier for PACK/UNPACK ops */ + if (OP_IS_PACK_COLOUR(word->op) || OP_IS_UNPACK_COLOUR(word->op)) { + fprintf(fp, ", "); + unsigned format_specifier = + (word->signed_offset << 4) | word->index_shift; + fprintf(fp, "0x%X", format_specifier); + } - fprintf(fp, "\n"); + fprintf(fp, "\n"); - /* Debugging stuff */ + /* Debugging stuff */ - if (!OP_IS_STORE(word->op)) - update_dest(ctx, word->reg); + if (!OP_IS_STORE(word->op)) + update_dest(ctx, word->reg); } static void -print_load_store_word(disassemble_context *ctx, FILE *fp, uint32_t *word, bool verbose) +print_load_store_word(disassemble_context *ctx, FILE *fp, uint32_t *word, + bool verbose) { - midgard_load_store *load_store = (midgard_load_store *) word; + midgard_load_store *load_store = (midgard_load_store *)word; - if (load_store->word1 != 3) { - print_load_store_instr(ctx, fp, load_store->word1, verbose); - } + if (load_store->word1 != 3) { + print_load_store_instr(ctx, fp, load_store->word1, verbose); + } - if (load_store->word2 != 3) { - print_load_store_instr(ctx, fp, load_store->word2, verbose); - } + if (load_store->word2 != 3) { + print_load_store_instr(ctx, fp, load_store->word2, verbose); + } } static void print_texture_reg_select(FILE *fp, uint8_t u, unsigned base) { - midgard_tex_register_select sel; - memcpy(&sel, &u, sizeof(u)); + midgard_tex_register_select sel; + memcpy(&sel, &u, sizeof(u)); 
-        print_tex_reg(fp, base + sel.select, false);
+   print_tex_reg(fp, base + sel.select, false);

-        unsigned component = sel.component;
+   unsigned component = sel.component;

-        /* Use the upper half in half-reg mode */
-        if (sel.upper) {
-                assert(!sel.full);
-                component += 4;
-        }
+   /* Use the upper half in half-reg mode */
+   if (sel.upper) {
+      assert(!sel.full);
+      component += 4;
+   }

-        fprintf(fp, ".%c.%d", components[component], sel.full ? 32 : 16);
+   fprintf(fp, ".%c.%d", components[component], sel.full ? 32 : 16);

-        assert(sel.zero == 0);
+   assert(sel.zero == 0);
}

static void
print_texture_format(FILE *fp, int format)
{
-        /* Act like a modifier */
-        fprintf(fp, ".");
+   /* Act like a modifier */
+   fprintf(fp, ".");

-        switch (format) {
-                DEFINE_CASE(1, "1d");
-                DEFINE_CASE(2, "2d");
-                DEFINE_CASE(3, "3d");
-                DEFINE_CASE(0, "cube");
+   switch (format) {
+      DEFINE_CASE(1, "1d");
+      DEFINE_CASE(2, "2d");
+      DEFINE_CASE(3, "3d");
+      DEFINE_CASE(0, "cube");

-        default:
-                unreachable("Bad format");
-        }
+   default:
+      unreachable("Bad format");
+   }
}

static void
print_texture_op(FILE *fp, unsigned op)
{
-        if (tex_opcode_props[op].name)
-                fprintf(fp, "%s", tex_opcode_props[op].name);
-        else
-                fprintf(fp, "tex_op_%02X", op);
+   if (tex_opcode_props[op].name)
+      fprintf(fp, "%s", tex_opcode_props[op].name);
+   else
+      fprintf(fp, "tex_op_%02X", op);
}

static bool
texture_op_takes_bias(unsigned op)
{
-        return op == midgard_tex_op_normal;
+   return op == midgard_tex_op_normal;
}

static char
sampler_type_name(enum mali_sampler_type t)
{
-        switch (t) {
-        case MALI_SAMPLER_FLOAT:
-                return 'f';
-        case MALI_SAMPLER_UNSIGNED:
-                return 'u';
-        case MALI_SAMPLER_SIGNED:
-                return 'i';
-        default:
-                return '?';
-        }
-
+   switch (t) {
+   case MALI_SAMPLER_FLOAT:
+      return 'f';
+   case MALI_SAMPLER_UNSIGNED:
+      return 'u';
+   case MALI_SAMPLER_SIGNED:
+      return 'i';
+   default:
+      return '?';
+   }
}

static void
print_texture_barrier(FILE *fp, uint32_t *word)
{
-        midgard_texture_barrier_word *barrier = (midgard_texture_barrier_word *) word;
+   midgard_texture_barrier_word *barrier = (midgard_texture_barrier_word *)word;

-        if (barrier->type != TAG_TEXTURE_4_BARRIER)
-                fprintf(fp, "/* barrier tag %X != tex/bar */ ", barrier->type);
+   if (barrier->type != TAG_TEXTURE_4_BARRIER)
+      fprintf(fp, "/* barrier tag %X != tex/bar */ ", barrier->type);

-        if (!barrier->cont)
-                fprintf(fp, "/* cont missing? */");
+   if (!barrier->cont)
+      fprintf(fp, "/* cont missing? */");

-        if (!barrier->last)
-                fprintf(fp, "/* last missing? */");
+   if (!barrier->last)
+      fprintf(fp, "/* last missing? 
*/"); - if (barrier->zero1) - fprintf(fp, "/* zero1 = 0x%X */ ", barrier->zero1); + if (barrier->zero1) + fprintf(fp, "/* zero1 = 0x%X */ ", barrier->zero1); - if (barrier->zero2) - fprintf(fp, "/* zero2 = 0x%X */ ", barrier->zero2); + if (barrier->zero2) + fprintf(fp, "/* zero2 = 0x%X */ ", barrier->zero2); - if (barrier->zero3) - fprintf(fp, "/* zero3 = 0x%X */ ", barrier->zero3); + if (barrier->zero3) + fprintf(fp, "/* zero3 = 0x%X */ ", barrier->zero3); - if (barrier->zero4) - fprintf(fp, "/* zero4 = 0x%X */ ", barrier->zero4); + if (barrier->zero4) + fprintf(fp, "/* zero4 = 0x%X */ ", barrier->zero4); - if (barrier->zero5) - fprintf(fp, "/* zero4 = 0x%" PRIx64 " */ ", barrier->zero5); + if (barrier->zero5) + fprintf(fp, "/* zero4 = 0x%" PRIx64 " */ ", barrier->zero5); - if (barrier->out_of_order) - fprintf(fp, ".ooo%u", barrier->out_of_order); + if (barrier->out_of_order) + fprintf(fp, ".ooo%u", barrier->out_of_order); - fprintf(fp, "\n"); + fprintf(fp, "\n"); } #undef DEFINE_CASE @@ -1592,334 +1601,352 @@ print_texture_barrier(FILE *fp, uint32_t *word) static const char * texture_mode(enum mali_texture_mode mode) { - switch (mode) { - case TEXTURE_NORMAL: return ""; - case TEXTURE_SHADOW: return ".shadow"; - case TEXTURE_GATHER_SHADOW: return ".gather.shadow"; - case TEXTURE_GATHER_X: return ".gatherX"; - case TEXTURE_GATHER_Y: return ".gatherY"; - case TEXTURE_GATHER_Z: return ".gatherZ"; - case TEXTURE_GATHER_W: return ".gatherW"; - default: return "unk"; - } + switch (mode) { + case TEXTURE_NORMAL: + return ""; + case TEXTURE_SHADOW: + return ".shadow"; + case TEXTURE_GATHER_SHADOW: + return ".gather.shadow"; + case TEXTURE_GATHER_X: + return ".gatherX"; + case TEXTURE_GATHER_Y: + return ".gatherY"; + case TEXTURE_GATHER_Z: + return ".gatherZ"; + case TEXTURE_GATHER_W: + return ".gatherW"; + default: + return "unk"; + } } static const char * derivative_mode(enum mali_derivative_mode mode) { - switch (mode) { - case TEXTURE_DFDX: return ".x"; - case TEXTURE_DFDY: return ".y"; - default: return "unk"; - } + switch (mode) { + case TEXTURE_DFDX: + return ".x"; + case TEXTURE_DFDY: + return ".y"; + default: + return "unk"; + } } static const char * partial_exection_mode(enum midgard_partial_execution mode) { - switch (mode) { - case MIDGARD_PARTIAL_EXECUTION_NONE: return ""; - case MIDGARD_PARTIAL_EXECUTION_SKIP: return ".skip"; - case MIDGARD_PARTIAL_EXECUTION_KILL: return ".kill"; - default: return ".reserved"; - } + switch (mode) { + case MIDGARD_PARTIAL_EXECUTION_NONE: + return ""; + case MIDGARD_PARTIAL_EXECUTION_SKIP: + return ".skip"; + case MIDGARD_PARTIAL_EXECUTION_KILL: + return ".kill"; + default: + return ".reserved"; + } } static void print_texture_word(disassemble_context *ctx, FILE *fp, uint32_t *word, unsigned tabs, unsigned in_reg_base, unsigned out_reg_base) { - midgard_texture_word *texture = (midgard_texture_word *) word; - validate_sampler_type(texture->op, texture->sampler_type); + midgard_texture_word *texture = (midgard_texture_word *)word; + validate_sampler_type(texture->op, texture->sampler_type); - /* Broad category of texture operation in question */ - print_texture_op(fp, texture->op); + /* Broad category of texture operation in question */ + print_texture_op(fp, texture->op); - /* Barriers use a dramatically different code path */ - if (texture->op == midgard_tex_op_barrier) { - print_texture_barrier(fp, word); - return; - } else if (texture->type == TAG_TEXTURE_4_BARRIER) - fprintf (fp, "/* nonbarrier had tex/bar tag */ "); - else if (texture->type == 
TAG_TEXTURE_4_VTX) - fprintf (fp, ".vtx"); + /* Barriers use a dramatically different code path */ + if (texture->op == midgard_tex_op_barrier) { + print_texture_barrier(fp, word); + return; + } else if (texture->type == TAG_TEXTURE_4_BARRIER) + fprintf(fp, "/* nonbarrier had tex/bar tag */ "); + else if (texture->type == TAG_TEXTURE_4_VTX) + fprintf(fp, ".vtx"); - if (texture->op == midgard_tex_op_derivative) - fprintf(fp, "%s", derivative_mode(texture->mode)); - else - fprintf(fp, "%s", texture_mode(texture->mode)); + if (texture->op == midgard_tex_op_derivative) + fprintf(fp, "%s", derivative_mode(texture->mode)); + else + fprintf(fp, "%s", texture_mode(texture->mode)); - /* Specific format in question */ - print_texture_format(fp, texture->format); + /* Specific format in question */ + print_texture_format(fp, texture->format); - /* Instruction "modifiers" parallel the ALU instructions. */ - fputs(partial_exection_mode(texture->exec), fp); + /* Instruction "modifiers" parallel the ALU instructions. */ + fputs(partial_exection_mode(texture->exec), fp); - if (texture->out_of_order) - fprintf(fp, ".ooo%u", texture->out_of_order); + if (texture->out_of_order) + fprintf(fp, ".ooo%u", texture->out_of_order); - fprintf(fp, " "); - print_tex_reg(fp, out_reg_base + texture->out_reg_select, true); - print_tex_mask(fp, texture->mask, texture->out_upper); - fprintf(fp, ".%c%d", texture->sampler_type == MALI_SAMPLER_FLOAT ? 'f' : 'i', - texture->out_full ? 32 : 16); - assert(!(texture->out_full && texture->out_upper)); + fprintf(fp, " "); + print_tex_reg(fp, out_reg_base + texture->out_reg_select, true); + print_tex_mask(fp, texture->mask, texture->out_upper); + fprintf(fp, ".%c%d", texture->sampler_type == MALI_SAMPLER_FLOAT ? 'f' : 'i', + texture->out_full ? 
32 : 16); + assert(!(texture->out_full && texture->out_upper)); - /* Output modifiers are only valid for float texture operations */ - if (texture->sampler_type == MALI_SAMPLER_FLOAT) - mir_print_outmod(fp, texture->outmod, false); + /* Output modifiers are only valid for float texture operations */ + if (texture->sampler_type == MALI_SAMPLER_FLOAT) + mir_print_outmod(fp, texture->outmod, false); - fprintf(fp, ", "); + fprintf(fp, ", "); - /* Depending on whether we read from textures directly or indirectly, - * we may be able to update our analysis */ + /* Depending on whether we read from textures directly or indirectly, + * we may be able to update our analysis */ - if (texture->texture_register) { - fprintf(fp, "texture["); - print_texture_reg_select(fp, texture->texture_handle, in_reg_base); - fprintf(fp, "], "); - } else { - fprintf(fp, "texture%u, ", texture->texture_handle); - } + if (texture->texture_register) { + fprintf(fp, "texture["); + print_texture_reg_select(fp, texture->texture_handle, in_reg_base); + fprintf(fp, "], "); + } else { + fprintf(fp, "texture%u, ", texture->texture_handle); + } - /* Print the type, GL style */ - fprintf(fp, "%csampler", sampler_type_name(texture->sampler_type)); + /* Print the type, GL style */ + fprintf(fp, "%csampler", sampler_type_name(texture->sampler_type)); - if (texture->sampler_register) { - fprintf(fp, "["); - print_texture_reg_select(fp, texture->sampler_handle, in_reg_base); - fprintf(fp, "]"); - } else { - fprintf(fp, "%u", texture->sampler_handle); - } + if (texture->sampler_register) { + fprintf(fp, "["); + print_texture_reg_select(fp, texture->sampler_handle, in_reg_base); + fprintf(fp, "]"); + } else { + fprintf(fp, "%u", texture->sampler_handle); + } - print_vec_swizzle(fp, texture->swizzle, midgard_src_passthrough, midgard_reg_mode_32, 0xFF); + print_vec_swizzle(fp, texture->swizzle, midgard_src_passthrough, + midgard_reg_mode_32, 0xFF); - fprintf(fp, ", "); + fprintf(fp, ", "); - midgard_src_expand_mode exp = - texture->in_reg_upper ? midgard_src_expand_high : midgard_src_passthrough; - print_tex_reg(fp, in_reg_base + texture->in_reg_select, false); - print_vec_swizzle(fp, texture->in_reg_swizzle, exp, midgard_reg_mode_32, 0xFF); - fprintf(fp, ".%d", texture->in_reg_full ? 32 : 16); - assert(!(texture->in_reg_full && texture->in_reg_upper)); + midgard_src_expand_mode exp = + texture->in_reg_upper ? midgard_src_expand_high : midgard_src_passthrough; + print_tex_reg(fp, in_reg_base + texture->in_reg_select, false); + print_vec_swizzle(fp, texture->in_reg_swizzle, exp, midgard_reg_mode_32, + 0xFF); + fprintf(fp, ".%d", texture->in_reg_full ? 32 : 16); + assert(!(texture->in_reg_full && texture->in_reg_upper)); - /* There is *always* an offset attached. Of - * course, that offset is just immediate #0 for a - * GLES call that doesn't take an offset. If there - * is a non-negative non-zero offset, this is - * specified in immediate offset mode, with the - * values in the offset_* fields as immediates. If - * this is a negative offset, we instead switch to - * a register offset mode, where the offset_* - * fields become register triplets */ + /* There is *always* an offset attached. Of + * course, that offset is just immediate #0 for a + * GLES call that doesn't take an offset. If there + * is a non-negative non-zero offset, this is + * specified in immediate offset mode, with the + * values in the offset_* fields as immediates. 
If + * this is a negative offset, we instead switch to + * a register offset mode, where the offset_* + * fields become register triplets */ - if (texture->offset_register) { - fprintf(fp, " + "); + if (texture->offset_register) { + fprintf(fp, " + "); - bool full = texture->offset & 1; - bool select = texture->offset & 2; - bool upper = texture->offset & 4; - unsigned swizzle = texture->offset >> 3; - midgard_src_expand_mode exp = - upper ? midgard_src_expand_high : midgard_src_passthrough; + bool full = texture->offset & 1; + bool select = texture->offset & 2; + bool upper = texture->offset & 4; + unsigned swizzle = texture->offset >> 3; + midgard_src_expand_mode exp = + upper ? midgard_src_expand_high : midgard_src_passthrough; - print_tex_reg(fp, in_reg_base + select, false); - print_vec_swizzle(fp, swizzle, exp, midgard_reg_mode_32, 0xFF); - fprintf(fp, ".%d", full ? 32 : 16); - assert(!(texture->out_full && texture->out_upper)); + print_tex_reg(fp, in_reg_base + select, false); + print_vec_swizzle(fp, swizzle, exp, midgard_reg_mode_32, 0xFF); + fprintf(fp, ".%d", full ? 32 : 16); + assert(!(texture->out_full && texture->out_upper)); - fprintf(fp, ", "); - } else if (texture->offset) { - /* Only select ops allow negative immediate offsets, verify */ + fprintf(fp, ", "); + } else if (texture->offset) { + /* Only select ops allow negative immediate offsets, verify */ - signed offset_x = (texture->offset & 0xF); - signed offset_y = ((texture->offset >> 4) & 0xF); - signed offset_z = ((texture->offset >> 8) & 0xF); + signed offset_x = (texture->offset & 0xF); + signed offset_y = ((texture->offset >> 4) & 0xF); + signed offset_z = ((texture->offset >> 8) & 0xF); - bool neg_x = offset_x < 0; - bool neg_y = offset_y < 0; - bool neg_z = offset_z < 0; - bool any_neg = neg_x || neg_y || neg_z; + bool neg_x = offset_x < 0; + bool neg_y = offset_y < 0; + bool neg_z = offset_z < 0; + bool any_neg = neg_x || neg_y || neg_z; - if (any_neg && texture->op != midgard_tex_op_fetch) - fprintf(fp, "/* invalid negative */ "); + if (any_neg && texture->op != midgard_tex_op_fetch) + fprintf(fp, "/* invalid negative */ "); - /* Regardless, just print the immediate offset */ + /* Regardless, just print the immediate offset */ - fprintf(fp, " + <%d, %d, %d>, ", offset_x, offset_y, offset_z); - } else { - fprintf(fp, ", "); - } + fprintf(fp, " + <%d, %d, %d>, ", offset_x, offset_y, offset_z); + } else { + fprintf(fp, ", "); + } - char lod_operand = texture_op_takes_bias(texture->op) ? '+' : '='; + char lod_operand = texture_op_takes_bias(texture->op) ? '+' : '='; - if (texture->lod_register) { - fprintf(fp, "lod %c ", lod_operand); - print_texture_reg_select(fp, texture->bias, in_reg_base); - fprintf(fp, ", "); + if (texture->lod_register) { + fprintf(fp, "lod %c ", lod_operand); + print_texture_reg_select(fp, texture->bias, in_reg_base); + fprintf(fp, ", "); - if (texture->bias_int) - fprintf(fp, " /* bias_int = 0x%X */", texture->bias_int); - } else if (texture->op == midgard_tex_op_fetch) { - /* For texel fetch, the int LOD is in the fractional place and - * there is no fraction. We *always* have an explicit LOD, even - * if it's zero. */ + if (texture->bias_int) + fprintf(fp, " /* bias_int = 0x%X */", texture->bias_int); + } else if (texture->op == midgard_tex_op_fetch) { + /* For texel fetch, the int LOD is in the fractional place and + * there is no fraction. We *always* have an explicit LOD, even + * if it's zero. 
*/ - if (texture->bias_int) - fprintf(fp, " /* bias_int = 0x%X */ ", texture->bias_int); + if (texture->bias_int) + fprintf(fp, " /* bias_int = 0x%X */ ", texture->bias_int); - fprintf(fp, "lod = %u, ", texture->bias); - } else if (texture->bias || texture->bias_int) { - signed bias_int = texture->bias_int; - float bias_frac = texture->bias / 256.0f; - float bias = bias_int + bias_frac; + fprintf(fp, "lod = %u, ", texture->bias); + } else if (texture->bias || texture->bias_int) { + signed bias_int = texture->bias_int; + float bias_frac = texture->bias / 256.0f; + float bias = bias_int + bias_frac; - bool is_bias = texture_op_takes_bias(texture->op); - char sign = (bias >= 0.0) ? '+' : '-'; - char operand = is_bias ? sign : '='; + bool is_bias = texture_op_takes_bias(texture->op); + char sign = (bias >= 0.0) ? '+' : '-'; + char operand = is_bias ? sign : '='; - fprintf(fp, "lod %c %f, ", operand, fabsf(bias)); - } + fprintf(fp, "lod %c %f, ", operand, fabsf(bias)); + } - fprintf(fp, "\n"); + fprintf(fp, "\n"); - /* While not zero in general, for these simple instructions the - * following unknowns are zero, so we don't include them */ + /* While not zero in general, for these simple instructions the + * following unknowns are zero, so we don't include them */ - if (texture->unknown4 || - texture->unknown8) { - fprintf(fp, "// unknown4 = 0x%x\n", texture->unknown4); - fprintf(fp, "// unknown8 = 0x%x\n", texture->unknown8); - } + if (texture->unknown4 || texture->unknown8) { + fprintf(fp, "// unknown4 = 0x%x\n", texture->unknown4); + fprintf(fp, "// unknown8 = 0x%x\n", texture->unknown8); + } } void -disassemble_midgard(FILE *fp, uint8_t *code, size_t size, unsigned gpu_id, bool verbose) +disassemble_midgard(FILE *fp, uint8_t *code, size_t size, unsigned gpu_id, + bool verbose) { - uint32_t *words = (uint32_t *) code; - unsigned num_words = size / 4; - int tabs = 0; + uint32_t *words = (uint32_t *)code; + unsigned num_words = size / 4; + int tabs = 0; - bool branch_forward = false; + bool branch_forward = false; - int last_next_tag = -1; + int last_next_tag = -1; - unsigned i = 0; + unsigned i = 0; - disassemble_context ctx = { - .midg_tags = calloc(sizeof(ctx.midg_tags[0]), num_words), - .midg_ever_written = 0, - }; + disassemble_context ctx = { + .midg_tags = calloc(sizeof(ctx.midg_tags[0]), num_words), + .midg_ever_written = 0, + }; - while (i < num_words) { - unsigned tag = words[i] & 0xF; - unsigned next_tag = (words[i] >> 4) & 0xF; - unsigned num_quad_words = midgard_tag_props[tag].size; + while (i < num_words) { + unsigned tag = words[i] & 0xF; + unsigned next_tag = (words[i] >> 4) & 0xF; + unsigned num_quad_words = midgard_tag_props[tag].size; - if (ctx.midg_tags[i] && ctx.midg_tags[i] != tag) { - fprintf(fp, "\t/* XXX: TAG ERROR branch, got %s expected %s */\n", - midgard_tag_props[tag].name, - midgard_tag_props[ctx.midg_tags[i]].name); - } + if (ctx.midg_tags[i] && ctx.midg_tags[i] != tag) { + fprintf(fp, "\t/* XXX: TAG ERROR branch, got %s expected %s */\n", + midgard_tag_props[tag].name, + midgard_tag_props[ctx.midg_tags[i]].name); + } - ctx.midg_tags[i] = tag; + ctx.midg_tags[i] = tag; - /* Check the tag. The idea is to ensure that next_tag is - * *always* recoverable from the disassembly, such that we may - * safely omit printing next_tag. To show this, we first - * consider that next tags are semantically off-byone -- we end - * up parsing tag n during step n+1. So, we ensure after we're - * done disassembling the next tag of the final bundle is BREAK - * and warn otherwise. 
We also ensure that the next tag is - * never INVALID. Beyond that, since the last tag is checked - * outside the loop, we can check one tag prior. If equal to - * the current tag (which is unique), we're done. Otherwise, we - * print if that tag was > TAG_BREAK, which implies the tag was - * not TAG_BREAK or TAG_INVALID. But we already checked for - * TAG_INVALID, so it's just if the last tag was TAG_BREAK that - * we're silent. So we throw in a print for break-next on at - * the end of the bundle (if it's not the final bundle, which - * we already check for above), disambiguating this case as - * well. Hence in all cases we are unambiguous, QED. */ + /* Check the tag. The idea is to ensure that next_tag is + * *always* recoverable from the disassembly, such that we may + * safely omit printing next_tag. To show this, we first + * consider that next tags are semantically off-byone -- we end + * up parsing tag n during step n+1. So, we ensure after we're + * done disassembling the next tag of the final bundle is BREAK + * and warn otherwise. We also ensure that the next tag is + * never INVALID. Beyond that, since the last tag is checked + * outside the loop, we can check one tag prior. If equal to + * the current tag (which is unique), we're done. Otherwise, we + * print if that tag was > TAG_BREAK, which implies the tag was + * not TAG_BREAK or TAG_INVALID. But we already checked for + * TAG_INVALID, so it's just if the last tag was TAG_BREAK that + * we're silent. So we throw in a print for break-next on at + * the end of the bundle (if it's not the final bundle, which + * we already check for above), disambiguating this case as + * well. Hence in all cases we are unambiguous, QED. */ - if (next_tag == TAG_INVALID) - fprintf(fp, "\t/* XXX: invalid next tag */\n"); + if (next_tag == TAG_INVALID) + fprintf(fp, "\t/* XXX: invalid next tag */\n"); - if (last_next_tag > TAG_BREAK && last_next_tag != tag) { - fprintf(fp, "\t/* XXX: TAG ERROR sequence, got %s expexted %s */\n", - midgard_tag_props[tag].name, - midgard_tag_props[last_next_tag].name); - } + if (last_next_tag > TAG_BREAK && last_next_tag != tag) { + fprintf(fp, "\t/* XXX: TAG ERROR sequence, got %s expexted %s */\n", + midgard_tag_props[tag].name, + midgard_tag_props[last_next_tag].name); + } - last_next_tag = next_tag; + last_next_tag = next_tag; - /* Tags are unique in the following way: - * - * INVALID, BREAK, UNKNOWN_*: verbosely printed - * TEXTURE_4_BARRIER: verified by barrier/!barrier op - * TEXTURE_4_VTX: .vtx tag printed - * TEXTURE_4: tetxure lack of barriers or .vtx - * TAG_LOAD_STORE_4: only load/store - * TAG_ALU_4/8/12/16: by number of instructions/constants - * TAG_ALU_4_8/12/16_WRITEOUT: ^^ with .writeout tag - */ + /* Tags are unique in the following way: + * + * INVALID, BREAK, UNKNOWN_*: verbosely printed + * TEXTURE_4_BARRIER: verified by barrier/!barrier op + * TEXTURE_4_VTX: .vtx tag printed + * TEXTURE_4: tetxure lack of barriers or .vtx + * TAG_LOAD_STORE_4: only load/store + * TAG_ALU_4/8/12/16: by number of instructions/constants + * TAG_ALU_4_8/12/16_WRITEOUT: ^^ with .writeout tag + */ - switch (tag) { - case TAG_TEXTURE_4_VTX ... TAG_TEXTURE_4_BARRIER: { - bool interpipe_aliasing = - midgard_get_quirks(gpu_id) & MIDGARD_INTERPIPE_REG_ALIASING; + switch (tag) { + case TAG_TEXTURE_4_VTX ... TAG_TEXTURE_4_BARRIER: { + bool interpipe_aliasing = + midgard_get_quirks(gpu_id) & MIDGARD_INTERPIPE_REG_ALIASING; - print_texture_word(&ctx, fp, &words[i], tabs, - interpipe_aliasing ? 
0 : REG_TEX_BASE, - interpipe_aliasing ? REGISTER_LDST_BASE : REG_TEX_BASE); - break; - } + print_texture_word( + &ctx, fp, &words[i], tabs, interpipe_aliasing ? 0 : REG_TEX_BASE, + interpipe_aliasing ? REGISTER_LDST_BASE : REG_TEX_BASE); + break; + } - case TAG_LOAD_STORE_4: - print_load_store_word(&ctx, fp, &words[i], verbose); - break; + case TAG_LOAD_STORE_4: + print_load_store_word(&ctx, fp, &words[i], verbose); + break; - case TAG_ALU_4 ... TAG_ALU_16_WRITEOUT: - branch_forward = print_alu_word(&ctx, fp, &words[i], num_quad_words, tabs, i + 4*num_quad_words, verbose); + case TAG_ALU_4 ... TAG_ALU_16_WRITEOUT: + branch_forward = print_alu_word(&ctx, fp, &words[i], num_quad_words, + tabs, i + 4 * num_quad_words, verbose); - /* TODO: infer/verify me */ - if (tag >= TAG_ALU_4_WRITEOUT) - fprintf(fp, "writeout\n"); + /* TODO: infer/verify me */ + if (tag >= TAG_ALU_4_WRITEOUT) + fprintf(fp, "writeout\n"); - break; + break; - default: - fprintf(fp, "Unknown word type %u:\n", words[i] & 0xF); - num_quad_words = 1; - print_quad_word(fp, &words[i], tabs); - fprintf(fp, "\n"); - break; - } + default: + fprintf(fp, "Unknown word type %u:\n", words[i] & 0xF); + num_quad_words = 1; + print_quad_word(fp, &words[i], tabs); + fprintf(fp, "\n"); + break; + } - /* Include a synthetic "break" instruction at the end of the - * bundle to signify that if, absent a branch, the shader - * execution will stop here. Stop disassembly at such a break - * based on a heuristic */ + /* Include a synthetic "break" instruction at the end of the + * bundle to signify that if, absent a branch, the shader + * execution will stop here. Stop disassembly at such a break + * based on a heuristic */ - if (next_tag == TAG_BREAK) { - if (branch_forward) { - fprintf(fp, "break\n"); - } else { - fprintf(fp, "\n"); - break; - } - } + if (next_tag == TAG_BREAK) { + if (branch_forward) { + fprintf(fp, "break\n"); + } else { + fprintf(fp, "\n"); + break; + } + } - fprintf(fp, "\n"); + fprintf(fp, "\n"); - i += 4 * num_quad_words; - } + i += 4 * num_quad_words; + } - if (last_next_tag != TAG_BREAK) { - fprintf(fp, "/* XXX: shader ended with tag %s */\n", - midgard_tag_props[last_next_tag].name); - } + if (last_next_tag != TAG_BREAK) { + fprintf(fp, "/* XXX: shader ended with tag %s */\n", + midgard_tag_props[last_next_tag].name); + } - free(ctx.midg_tags); + free(ctx.midg_tags); } diff --git a/src/panfrost/midgard/disassemble.h b/src/panfrost/midgard/disassemble.h index 6aaaf8c6bc5..7145c7c47a7 100644 --- a/src/panfrost/midgard/disassemble.h +++ b/src/panfrost/midgard/disassemble.h @@ -1,7 +1,7 @@ -#include -#include #include #include +#include +#include -void -disassemble_midgard(FILE *fp, uint8_t *code, size_t size, unsigned gpu_id, bool verbose); +void disassemble_midgard(FILE *fp, uint8_t *code, size_t size, unsigned gpu_id, + bool verbose); diff --git a/src/panfrost/midgard/helpers.h b/src/panfrost/midgard/helpers.h index 436641b5201..f2161c7a650 100644 --- a/src/panfrost/midgard/helpers.h +++ b/src/panfrost/midgard/helpers.h @@ -25,98 +25,74 @@ #include #include -#include "midgard.h" #include "util/macros.h" +#include "midgard.h" -#define OP_IS_LOAD_VARY_F(op) (\ - op == midgard_op_ld_vary_16 || \ - op == midgard_op_ld_vary_32 \ - ) +#define OP_IS_LOAD_VARY_F(op) \ + (op == midgard_op_ld_vary_16 || op == midgard_op_ld_vary_32) -#define OP_IS_PROJECTION(op) ( \ - op == midgard_op_ldst_perspective_div_y || \ - op == midgard_op_ldst_perspective_div_z || \ - op == midgard_op_ldst_perspective_div_w \ - ) +#define 
OP_IS_PROJECTION(op) \ + (op == midgard_op_ldst_perspective_div_y || \ + op == midgard_op_ldst_perspective_div_z || \ + op == midgard_op_ldst_perspective_div_w) -#define OP_IS_VEC4_ONLY(op) ( \ - OP_IS_PROJECTION(op) || \ - op == midgard_op_ld_cubemap_coords \ - ) +#define OP_IS_VEC4_ONLY(op) \ + (OP_IS_PROJECTION(op) || op == midgard_op_ld_cubemap_coords) -#define OP_IS_MOVE(op) ( \ - (op >= midgard_alu_op_fmov && op <= midgard_alu_op_fmov_rtp) || \ - op == midgard_alu_op_imov \ - ) +#define OP_IS_MOVE(op) \ + ((op >= midgard_alu_op_fmov && op <= midgard_alu_op_fmov_rtp) || \ + op == midgard_alu_op_imov) -#define OP_IS_UBO_READ(op) ( \ - op >= midgard_op_ld_ubo_u8 && \ - op <= midgard_op_ld_ubo_128_bswap8 \ - ) +#define OP_IS_UBO_READ(op) \ + (op >= midgard_op_ld_ubo_u8 && op <= midgard_op_ld_ubo_128_bswap8) -#define OP_IS_CSEL_V(op) ( \ - op == midgard_alu_op_icsel_v || \ - op == midgard_alu_op_fcsel_v \ - ) +#define OP_IS_CSEL_V(op) \ + (op == midgard_alu_op_icsel_v || op == midgard_alu_op_fcsel_v) -#define OP_IS_CSEL(op) ( \ - OP_IS_CSEL_V(op) || \ - op == midgard_alu_op_icsel || \ - op == midgard_alu_op_fcsel \ - ) +#define OP_IS_CSEL(op) \ + (OP_IS_CSEL_V(op) || op == midgard_alu_op_icsel || \ + op == midgard_alu_op_fcsel) -#define OP_IS_UNSIGNED_CMP(op) ( \ - op == midgard_alu_op_ult || \ - op == midgard_alu_op_ule \ - ) +#define OP_IS_UNSIGNED_CMP(op) \ + (op == midgard_alu_op_ult || op == midgard_alu_op_ule) -#define OP_IS_INTEGER_CMP(op) ( \ - op == midgard_alu_op_ieq || \ - op == midgard_alu_op_ine || \ - op == midgard_alu_op_ilt || \ - op == midgard_alu_op_ile || \ - OP_IS_UNSIGNED_CMP(op) \ - ) +#define OP_IS_INTEGER_CMP(op) \ + (op == midgard_alu_op_ieq || op == midgard_alu_op_ine || \ + op == midgard_alu_op_ilt || op == midgard_alu_op_ile || \ + OP_IS_UNSIGNED_CMP(op)) -#define OP_IS_COMMON_STORE(op) ( \ - op >= midgard_op_st_u8 && \ - op <= midgard_op_st_128_bswap8 \ - ) +#define OP_IS_COMMON_STORE(op) \ + (op >= midgard_op_st_u8 && op <= midgard_op_st_128_bswap8) -#define OP_IS_IMAGE(op) ( \ - (op >= midgard_op_ld_image_32f && op <= midgard_op_ld_image_32i) || \ - (op >= midgard_op_st_image_32f && op <= midgard_op_st_image_32i) || \ - op == midgard_op_lea_image \ - ) +#define OP_IS_IMAGE(op) \ + ((op >= midgard_op_ld_image_32f && op <= midgard_op_ld_image_32i) || \ + (op >= midgard_op_st_image_32f && op <= midgard_op_st_image_32i) || \ + op == midgard_op_lea_image) -#define OP_IS_SPECIAL(op) ( \ - (op >= midgard_op_ld_special_32f && op <= midgard_op_ld_special_32i) || \ - (op >= midgard_op_st_special_32f && op <= midgard_op_st_special_32i) \ - ) +#define OP_IS_SPECIAL(op) \ + ((op >= midgard_op_ld_special_32f && op <= midgard_op_ld_special_32i) || \ + (op >= midgard_op_st_special_32f && op <= midgard_op_st_special_32i)) -#define OP_IS_PACK_COLOUR(op) ( \ - (op >= midgard_op_pack_colour_f32 && op <= midgard_op_pack_colour_s32) \ - ) +#define OP_IS_PACK_COLOUR(op) \ + ((op >= midgard_op_pack_colour_f32 && op <= midgard_op_pack_colour_s32)) -#define OP_IS_UNPACK_COLOUR(op) ( \ - (op >= midgard_op_unpack_colour_f32 && op <= midgard_op_unpack_colour_s32) \ - ) +#define OP_IS_UNPACK_COLOUR(op) \ + ((op >= midgard_op_unpack_colour_f32 && op <= midgard_op_unpack_colour_s32)) /* Instructions that are on the load/store unit but don't access memory */ -#define OP_IS_REG2REG_LDST(op) ( \ - op >= midgard_op_unpack_colour_f32 && \ - op <= midgard_op_ldst_perspective_div_w \ - ) +#define OP_IS_REG2REG_LDST(op) \ + (op >= midgard_op_unpack_colour_f32 && \ + op <= 
midgard_op_ldst_perspective_div_w) /* ALU control words are single bit fields with a lot of space */ -#define ALU_ENAB_VEC_MUL (1 << 17) -#define ALU_ENAB_SCAL_ADD (1 << 19) -#define ALU_ENAB_VEC_ADD (1 << 21) -#define ALU_ENAB_SCAL_MUL (1 << 23) -#define ALU_ENAB_VEC_LUT (1 << 25) +#define ALU_ENAB_VEC_MUL (1 << 17) +#define ALU_ENAB_SCAL_ADD (1 << 19) +#define ALU_ENAB_VEC_ADD (1 << 21) +#define ALU_ENAB_SCAL_MUL (1 << 23) +#define ALU_ENAB_VEC_LUT (1 << 25) #define ALU_ENAB_BR_COMPACT (1 << 26) -#define ALU_ENAB_BRANCH (1 << 27) +#define ALU_ENAB_BRANCH (1 << 27) /* Other opcode properties that don't conflict with the ALU_ENABs, non-ISA */ @@ -128,7 +104,7 @@ * make sense (since then why are we quirked?), so that corresponds to "no * count set" */ -#define OP_CHANNEL_COUNT(c) ((c - 1) << 0) +#define OP_CHANNEL_COUNT(c) ((c - 1) << 0) #define GET_CHANNEL_COUNT(c) ((c & (0x3 << 0)) ? ((c & (0x3 << 0)) + 1) : 0) /* For instructions that take a single argument, normally the first argument @@ -171,11 +147,11 @@ /* r24 and r25 are special registers that only exist during the pipeline, * by using them when we don't care about the register we skip a roundtrip * to the register file. */ -#define REGISTER_UNUSED 24 -#define REGISTER_CONSTANT 26 -#define REGISTER_LDST_BASE 26 +#define REGISTER_UNUSED 24 +#define REGISTER_CONSTANT 26 +#define REGISTER_LDST_BASE 26 #define REGISTER_TEXTURE_BASE 28 -#define REGISTER_SELECT 31 +#define REGISTER_SELECT 31 /* The following registers are read-only */ @@ -185,8 +161,8 @@ /* XY is Thread Local Storage pointer, ZW is Workgroup Local Storage pointer */ #define REGISTER_LDST_LOCAL_STORAGE_PTR 3 -#define REGISTER_LDST_LOCAL_THREAD_ID 4 -#define REGISTER_LDST_GROUP_ID 5 +#define REGISTER_LDST_LOCAL_THREAD_ID 4 +#define REGISTER_LDST_GROUP_ID 5 #define REGISTER_LDST_GLOBAL_THREAD_ID 6 /* This register is always zeroed when read. */ @@ -194,34 +170,38 @@ /* SSA helper aliases to mimic the registers. 
*/ -#define SSA_FIXED_SHIFT 24 +#define SSA_FIXED_SHIFT 24 #define SSA_FIXED_REGISTER(reg) (((1 + (reg)) << SSA_FIXED_SHIFT) | 1) #define SSA_REG_FROM_FIXED(reg) ((((reg) & ~1) >> SSA_FIXED_SHIFT) - 1) -#define SSA_FIXED_MINIMUM SSA_FIXED_REGISTER(0) +#define SSA_FIXED_MINIMUM SSA_FIXED_REGISTER(0) #define COMPONENT_X 0x0 #define COMPONENT_Y 0x1 #define COMPONENT_Z 0x2 #define COMPONENT_W 0x3 -#define SWIZZLE_IDENTITY { \ - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, \ - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, \ - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, \ - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } \ -} +#define SWIZZLE_IDENTITY \ + { \ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, \ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, \ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, \ + { \ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 \ + } \ + } -#define SWIZZLE_IDENTITY_4 { \ - { 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, \ - { 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, \ - { 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, \ - { 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, \ -} +#define SWIZZLE_IDENTITY_4 \ + { \ + {0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \ + {0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \ + {0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \ + {0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \ + } static inline unsigned mask_of(unsigned nr_comp) { - return (1 << nr_comp) - 1; + return (1 << nr_comp) - 1; } /* See ISA notes */ @@ -242,34 +222,34 @@ mask_of(unsigned nr_comp) /* Shorthands for usual combinations of units */ -#define UNITS_MUL (UNIT_VMUL | UNIT_SMUL) -#define UNITS_ADD (UNIT_VADD | UNIT_SADD) -#define UNITS_MOST (UNITS_MUL | UNITS_ADD) -#define UNITS_ALL (UNITS_MOST | UNIT_VLUT) -#define UNITS_SCALAR (UNIT_SADD | UNIT_SMUL) -#define UNITS_VECTOR (UNIT_VMUL | UNIT_VADD) +#define UNITS_MUL (UNIT_VMUL | UNIT_SMUL) +#define UNITS_ADD (UNIT_VADD | UNIT_SADD) +#define UNITS_MOST (UNITS_MUL | UNITS_ADD) +#define UNITS_ALL (UNITS_MOST | UNIT_VLUT) +#define UNITS_SCALAR (UNIT_SADD | UNIT_SMUL) +#define UNITS_VECTOR (UNIT_VMUL | UNIT_VADD) #define UNITS_ANY_VECTOR (UNITS_VECTOR | UNIT_VLUT) struct mir_op_props { - const char *name; - unsigned props; + const char *name; + unsigned props; }; /* For load/store */ struct mir_ldst_op_props { - const char *name; - unsigned props; + const char *name; + unsigned props; }; struct mir_tex_op_props { - const char *name; - unsigned props; + const char *name; + unsigned props; }; struct mir_tag_props { - const char *name; - unsigned size; + const char *name; + unsigned size; }; /* Lower 2-bits are a midgard_reg_mode */ @@ -303,15 +283,15 @@ struct mir_tag_props { static inline unsigned expand_writemask(unsigned mask, unsigned log2_channels) { - unsigned o = 0; - unsigned factor = 8 >> log2_channels; - unsigned expanded = (1 << factor) - 1; + unsigned o = 0; + unsigned factor = 8 >> log2_channels; + unsigned expanded = (1 << factor) - 1; - for (unsigned i = 0; i < (1 << log2_channels); ++i) - if (mask & (1 << i)) - o |= (expanded << (factor * i)); + for (unsigned i = 0; i < (1 << log2_channels); ++i) + if (mask & (1 << i)) + o |= (expanded << (factor * i)); - return o; + return o; } /* Coerce structs to integer */ @@ -319,28 +299,28 @@ expand_writemask(unsigned mask, unsigned log2_channels) static inline unsigned vector_alu_srco_unsigned(midgard_vector_alu_src src) { - unsigned u; - memcpy(&u, 
&src, sizeof(src)); - return u; + unsigned u; + memcpy(&u, &src, sizeof(src)); + return u; } static inline midgard_vector_alu_src vector_alu_from_unsigned(unsigned u) { - midgard_vector_alu_src s; - memcpy(&s, &u, sizeof(s)); - return s; + midgard_vector_alu_src s; + memcpy(&s, &u, sizeof(s)); + return s; } static inline void mir_compose_swizzle(unsigned *left, unsigned *right, unsigned *final_out) { - unsigned out[16]; + unsigned out[16]; - for (unsigned c = 0; c < 16; ++c) - out[c] = right[left[c]]; + for (unsigned c = 0; c < 16; ++c) + out[c] = right[left[c]]; - memcpy(final_out, out, sizeof(out)); + memcpy(final_out, out, sizeof(out)); } /* Checks for an xyzw.. swizzle, given a mask */ @@ -348,14 +328,15 @@ mir_compose_swizzle(unsigned *left, unsigned *right, unsigned *final_out) static inline bool mir_is_simple_swizzle(unsigned *swizzle, unsigned mask) { - for (unsigned i = 0; i < 16; ++i) { - if (!(mask & (1 << i))) continue; + for (unsigned i = 0; i < 16; ++i) { + if (!(mask & (1 << i))) + continue; - if (swizzle[i] != i) - return false; - } + if (swizzle[i] != i) + return false; + } - return true; + return true; } /* Packs a load/store argument */ @@ -363,19 +344,19 @@ mir_is_simple_swizzle(unsigned *swizzle, unsigned mask) static inline uint8_t midgard_ldst_comp(unsigned reg, unsigned component, unsigned size) { - assert((reg & ~1) == 0); - assert(size == 16 || size == 32 || size == 64); + assert((reg & ~1) == 0); + assert(size == 16 || size == 32 || size == 64); - /* Shift so everything is in terms of 32-bit units */ - if (size == 64) { - assert(component < 2); - component <<= 1; - } else if (size == 16) { - assert((component & 1) == 0); - component >>= 1; - } + /* Shift so everything is in terms of 32-bit units */ + if (size == 64) { + assert(component < 2); + component <<= 1; + } else if (size == 16) { + assert((component & 1) == 0); + component >>= 1; + } - return component; + return component; } /* Packs/unpacks a ubo index immediate. The unpack must be defined here so it @@ -388,55 +369,52 @@ void midgard_pack_ubo_index_imm(midgard_load_store_word *word, unsigned index); static inline unsigned midgard_unpack_ubo_index_imm(midgard_load_store_word word) { - unsigned ubo = word.arg_comp | - (word.arg_reg << 2) | - (word.bitsize_toggle << 5) | - (word.index_format << 6); + unsigned ubo = word.arg_comp | (word.arg_reg << 2) | + (word.bitsize_toggle << 5) | (word.index_format << 6); - return ubo; + return ubo; } - /* Packs/unpacks varying parameters. * FIXME: IMPORTANT: We currently handle varying mode weirdly, by passing all * parameters via an offset and using REGISTER_LDST_ZERO as base. This works * for most parameters, but does not allow us to encode/decode direct sample * position. */ -void midgard_pack_varying_params(midgard_load_store_word *word, midgard_varying_params p); -midgard_varying_params midgard_unpack_varying_params(midgard_load_store_word word); +void midgard_pack_varying_params(midgard_load_store_word *word, + midgard_varying_params p); +midgard_varying_params +midgard_unpack_varying_params(midgard_load_store_word word); /* Load/store ops' displacement helpers. * This is useful because different types of load/store ops have different * displacement bitsize. 
*/ -#define UNPACK_LDST_ATTRIB_OFS(a) ((a) >> 9) -#define UNPACK_LDST_VERTEX_OFS(a) util_sign_extend((a) & 0x1FF, 9) +#define UNPACK_LDST_ATTRIB_OFS(a) ((a) >> 9) +#define UNPACK_LDST_VERTEX_OFS(a) util_sign_extend((a)&0x1FF, 9) #define UNPACK_LDST_SELECTOR_OFS(a) ((a) >> 9) -#define UNPACK_LDST_UBO_OFS(a) ((a) >> 2) -#define UNPACK_LDST_MEM_OFS(a) ((a)) +#define UNPACK_LDST_UBO_OFS(a) ((a) >> 2) +#define UNPACK_LDST_MEM_OFS(a) ((a)) -#define PACK_LDST_ATTRIB_OFS(a) ((a) << 9) -#define PACK_LDST_VERTEX_OFS(a) ((a) & 0x1FF) +#define PACK_LDST_ATTRIB_OFS(a) ((a) << 9) +#define PACK_LDST_VERTEX_OFS(a) ((a)&0x1FF) #define PACK_LDST_SELECTOR_OFS(a) ((a) << 9) -#define PACK_LDST_UBO_OFS(a) ((a) << 2) -#define PACK_LDST_MEM_OFS(a) ((a)) +#define PACK_LDST_UBO_OFS(a) ((a) << 2) +#define PACK_LDST_MEM_OFS(a) ((a)) static inline bool midgard_is_branch_unit(unsigned unit) { - return (unit == ALU_ENAB_BRANCH) || (unit == ALU_ENAB_BR_COMPACT); + return (unit == ALU_ENAB_BRANCH) || (unit == ALU_ENAB_BR_COMPACT); } /* Packs ALU mod argument */ struct midgard_instruction; unsigned mir_pack_mod(struct midgard_instruction *ins, unsigned i, bool scalar); -void -mir_print_constant_component(FILE *fp, const midgard_constants *consts, - unsigned c, midgard_reg_mode reg_mode, bool half, - unsigned mod, midgard_alu_op op); +void mir_print_constant_component(FILE *fp, const midgard_constants *consts, + unsigned c, midgard_reg_mode reg_mode, + bool half, unsigned mod, midgard_alu_op op); -void -mir_print_outmod(FILE *fp, unsigned outmod, bool is_int); +void mir_print_outmod(FILE *fp, unsigned outmod, bool is_int); #endif diff --git a/src/panfrost/midgard/midgard.h b/src/panfrost/midgard/midgard.h index 2121dc4790b..89772854ba8 100644 --- a/src/panfrost/midgard/midgard.h +++ b/src/panfrost/midgard/midgard.h @@ -28,49 +28,49 @@ #ifndef __midgard_h__ #define __midgard_h__ -#include #include +#include -#define MIDGARD_DBG_MSGS 0x0001 -#define MIDGARD_DBG_SHADERS 0x0002 -#define MIDGARD_DBG_SHADERDB 0x0004 -#define MIDGARD_DBG_INORDER 0x0008 -#define MIDGARD_DBG_VERBOSE 0x0010 -#define MIDGARD_DBG_INTERNAL 0x0020 +#define MIDGARD_DBG_MSGS 0x0001 +#define MIDGARD_DBG_SHADERS 0x0002 +#define MIDGARD_DBG_SHADERDB 0x0004 +#define MIDGARD_DBG_INORDER 0x0008 +#define MIDGARD_DBG_VERBOSE 0x0010 +#define MIDGARD_DBG_INTERNAL 0x0020 extern int midgard_debug; typedef enum { - midgard_word_type_alu, - midgard_word_type_load_store, - midgard_word_type_texture + midgard_word_type_alu, + midgard_word_type_load_store, + midgard_word_type_texture } midgard_word_type; typedef enum { - midgard_alu_vmul, - midgard_alu_sadd, - midgard_alu_smul, - midgard_alu_vadd, - midgard_alu_lut + midgard_alu_vmul, + midgard_alu_sadd, + midgard_alu_smul, + midgard_alu_vadd, + midgard_alu_lut } midgard_alu; enum { - TAG_INVALID = 0x0, - TAG_BREAK = 0x1, - TAG_TEXTURE_4_VTX = 0x2, - TAG_TEXTURE_4 = 0x3, - TAG_TEXTURE_4_BARRIER = 0x4, - TAG_LOAD_STORE_4 = 0x5, - TAG_UNKNOWN_1 = 0x6, - TAG_UNKNOWN_2 = 0x7, - TAG_ALU_4 = 0x8, - TAG_ALU_8 = 0x9, - TAG_ALU_12 = 0xA, - TAG_ALU_16 = 0xB, - TAG_ALU_4_WRITEOUT = 0xC, - TAG_ALU_8_WRITEOUT = 0xD, - TAG_ALU_12_WRITEOUT = 0xE, - TAG_ALU_16_WRITEOUT = 0xF + TAG_INVALID = 0x0, + TAG_BREAK = 0x1, + TAG_TEXTURE_4_VTX = 0x2, + TAG_TEXTURE_4 = 0x3, + TAG_TEXTURE_4_BARRIER = 0x4, + TAG_LOAD_STORE_4 = 0x5, + TAG_UNKNOWN_1 = 0x6, + TAG_UNKNOWN_2 = 0x7, + TAG_ALU_4 = 0x8, + TAG_ALU_8 = 0x9, + TAG_ALU_12 = 0xA, + TAG_ALU_16 = 0xB, + TAG_ALU_4_WRITEOUT = 0xC, + TAG_ALU_8_WRITEOUT = 0xD, + TAG_ALU_12_WRITEOUT = 0xE, + 
TAG_ALU_16_WRITEOUT = 0xF }; /* @@ -78,200 +78,202 @@ enum { */ typedef enum { - midgard_alu_op_fadd = 0x10, /* round to even */ - midgard_alu_op_fadd_rtz = 0x11, - midgard_alu_op_fadd_rtn = 0x12, - midgard_alu_op_fadd_rtp = 0x13, - midgard_alu_op_fmul = 0x14, /* round to even */ - midgard_alu_op_fmul_rtz = 0x15, - midgard_alu_op_fmul_rtn = 0x16, - midgard_alu_op_fmul_rtp = 0x17, + midgard_alu_op_fadd = 0x10, /* round to even */ + midgard_alu_op_fadd_rtz = 0x11, + midgard_alu_op_fadd_rtn = 0x12, + midgard_alu_op_fadd_rtp = 0x13, + midgard_alu_op_fmul = 0x14, /* round to even */ + midgard_alu_op_fmul_rtz = 0x15, + midgard_alu_op_fmul_rtn = 0x16, + midgard_alu_op_fmul_rtp = 0x17, - midgard_alu_op_fmin = 0x28, /* if an operand is NaN, propagate the other */ - midgard_alu_op_fmin_nan = 0x29, /* if an operand is NaN, propagate it */ - midgard_alu_op_fabsmin = 0x2A, /* min(abs(a,b)) */ - midgard_alu_op_fabsmin_nan = 0x2B, /* min_nan(abs(a,b)) */ - midgard_alu_op_fmax = 0x2C, /* if an operand is NaN, propagate the other */ - midgard_alu_op_fmax_nan = 0x2D, /* if an operand is NaN, propagate it */ - midgard_alu_op_fabsmax = 0x2E, /* max(abs(a,b)) */ - midgard_alu_op_fabsmax_nan = 0x2F, /* max_nan(abs(a,b)) */ + midgard_alu_op_fmin = 0x28, /* if an operand is NaN, propagate the other */ + midgard_alu_op_fmin_nan = 0x29, /* if an operand is NaN, propagate it */ + midgard_alu_op_fabsmin = 0x2A, /* min(abs(a,b)) */ + midgard_alu_op_fabsmin_nan = 0x2B, /* min_nan(abs(a,b)) */ + midgard_alu_op_fmax = 0x2C, /* if an operand is NaN, propagate the other */ + midgard_alu_op_fmax_nan = 0x2D, /* if an operand is NaN, propagate it */ + midgard_alu_op_fabsmax = 0x2E, /* max(abs(a,b)) */ + midgard_alu_op_fabsmax_nan = 0x2F, /* max_nan(abs(a,b)) */ - midgard_alu_op_fmov = 0x30, /* fmov_rte */ - midgard_alu_op_fmov_rtz = 0x31, - midgard_alu_op_fmov_rtn = 0x32, - midgard_alu_op_fmov_rtp = 0x33, - midgard_alu_op_froundeven = 0x34, - midgard_alu_op_ftrunc = 0x35, - midgard_alu_op_ffloor = 0x36, - midgard_alu_op_fceil = 0x37, - midgard_alu_op_ffma = 0x38, /* rte */ - midgard_alu_op_ffma_rtz = 0x39, - midgard_alu_op_ffma_rtn = 0x3A, - midgard_alu_op_ffma_rtp = 0x3B, - midgard_alu_op_fdot3 = 0x3C, - midgard_alu_op_fdot3r = 0x3D, - midgard_alu_op_fdot4 = 0x3E, - midgard_alu_op_freduce = 0x3F, + midgard_alu_op_fmov = 0x30, /* fmov_rte */ + midgard_alu_op_fmov_rtz = 0x31, + midgard_alu_op_fmov_rtn = 0x32, + midgard_alu_op_fmov_rtp = 0x33, + midgard_alu_op_froundeven = 0x34, + midgard_alu_op_ftrunc = 0x35, + midgard_alu_op_ffloor = 0x36, + midgard_alu_op_fceil = 0x37, + midgard_alu_op_ffma = 0x38, /* rte */ + midgard_alu_op_ffma_rtz = 0x39, + midgard_alu_op_ffma_rtn = 0x3A, + midgard_alu_op_ffma_rtp = 0x3B, + midgard_alu_op_fdot3 = 0x3C, + midgard_alu_op_fdot3r = 0x3D, + midgard_alu_op_fdot4 = 0x3E, + midgard_alu_op_freduce = 0x3F, - midgard_alu_op_iadd = 0x40, - midgard_alu_op_ishladd = 0x41, /* (a<<1) + b */ - midgard_alu_op_isub = 0x46, - midgard_alu_op_ishlsub = 0x47, /* (a<<1) - b */ - midgard_alu_op_iaddsat = 0x48, - midgard_alu_op_uaddsat = 0x49, - midgard_alu_op_isubsat = 0x4E, - midgard_alu_op_usubsat = 0x4F, + midgard_alu_op_iadd = 0x40, + midgard_alu_op_ishladd = 0x41, /* (a<<1) + b */ + midgard_alu_op_isub = 0x46, + midgard_alu_op_ishlsub = 0x47, /* (a<<1) - b */ + midgard_alu_op_iaddsat = 0x48, + midgard_alu_op_uaddsat = 0x49, + midgard_alu_op_isubsat = 0x4E, + midgard_alu_op_usubsat = 0x4F, - midgard_alu_op_imul = 0x58, - /* Multiplies two ints and stores the result in the next larger datasize. 
*/ - midgard_alu_op_iwmul = 0x59, /* sint * sint = sint */ - midgard_alu_op_uwmul = 0x5A, /* uint * uint = uint */ - midgard_alu_op_iuwmul = 0x5B, /* sint * uint = sint */ + midgard_alu_op_imul = 0x58, + /* Multiplies two ints and stores the result in the next larger datasize. */ + midgard_alu_op_iwmul = 0x59, /* sint * sint = sint */ + midgard_alu_op_uwmul = 0x5A, /* uint * uint = uint */ + midgard_alu_op_iuwmul = 0x5B, /* sint * uint = sint */ - midgard_alu_op_imin = 0x60, - midgard_alu_op_umin = 0x61, - midgard_alu_op_imax = 0x62, - midgard_alu_op_umax = 0x63, - midgard_alu_op_iavg = 0x64, - midgard_alu_op_uavg = 0x65, - midgard_alu_op_iravg = 0x66, - midgard_alu_op_uravg = 0x67, - midgard_alu_op_iasr = 0x68, - midgard_alu_op_ilsr = 0x69, - midgard_alu_op_ishlsat = 0x6C, - midgard_alu_op_ushlsat = 0x6D, - midgard_alu_op_ishl = 0x6E, + midgard_alu_op_imin = 0x60, + midgard_alu_op_umin = 0x61, + midgard_alu_op_imax = 0x62, + midgard_alu_op_umax = 0x63, + midgard_alu_op_iavg = 0x64, + midgard_alu_op_uavg = 0x65, + midgard_alu_op_iravg = 0x66, + midgard_alu_op_uravg = 0x67, + midgard_alu_op_iasr = 0x68, + midgard_alu_op_ilsr = 0x69, + midgard_alu_op_ishlsat = 0x6C, + midgard_alu_op_ushlsat = 0x6D, + midgard_alu_op_ishl = 0x6E, - midgard_alu_op_iand = 0x70, - midgard_alu_op_ior = 0x71, - midgard_alu_op_inand = 0x72, /* ~(a & b), for inot let a = b */ - midgard_alu_op_inor = 0x73, /* ~(a | b) */ - midgard_alu_op_iandnot = 0x74, /* (a & ~b), used for not/b2f */ - midgard_alu_op_iornot = 0x75, /* (a | ~b) */ - midgard_alu_op_ixor = 0x76, - midgard_alu_op_inxor = 0x77, /* ~(a ^ b) */ - midgard_alu_op_iclz = 0x78, /* Number of zeroes on left */ - midgard_alu_op_ipopcnt = 0x7A, /* Population count */ - midgard_alu_op_imov = 0x7B, - midgard_alu_op_iabsdiff = 0x7C, - midgard_alu_op_uabsdiff = 0x7D, - midgard_alu_op_ichoose = 0x7E, /* vector, component number - dupe for shuffle() */ + midgard_alu_op_iand = 0x70, + midgard_alu_op_ior = 0x71, + midgard_alu_op_inand = 0x72, /* ~(a & b), for inot let a = b */ + midgard_alu_op_inor = 0x73, /* ~(a | b) */ + midgard_alu_op_iandnot = 0x74, /* (a & ~b), used for not/b2f */ + midgard_alu_op_iornot = 0x75, /* (a | ~b) */ + midgard_alu_op_ixor = 0x76, + midgard_alu_op_inxor = 0x77, /* ~(a ^ b) */ + midgard_alu_op_iclz = 0x78, /* Number of zeroes on left */ + midgard_alu_op_ipopcnt = 0x7A, /* Population count */ + midgard_alu_op_imov = 0x7B, + midgard_alu_op_iabsdiff = 0x7C, + midgard_alu_op_uabsdiff = 0x7D, + midgard_alu_op_ichoose = + 0x7E, /* vector, component number - dupe for shuffle() */ - midgard_alu_op_feq = 0x80, - midgard_alu_op_fne = 0x81, - midgard_alu_op_flt = 0x82, - midgard_alu_op_fle = 0x83, - midgard_alu_op_fball_eq = 0x88, - midgard_alu_op_fball_neq = 0x89, - midgard_alu_op_fball_lt = 0x8A, /* all(lessThan(.., ..)) */ - midgard_alu_op_fball_lte = 0x8B, /* all(lessThanEqual(.., ..)) */ + midgard_alu_op_feq = 0x80, + midgard_alu_op_fne = 0x81, + midgard_alu_op_flt = 0x82, + midgard_alu_op_fle = 0x83, + midgard_alu_op_fball_eq = 0x88, + midgard_alu_op_fball_neq = 0x89, + midgard_alu_op_fball_lt = 0x8A, /* all(lessThan(.., ..)) */ + midgard_alu_op_fball_lte = 0x8B, /* all(lessThanEqual(.., ..)) */ - midgard_alu_op_fbany_eq = 0x90, - midgard_alu_op_fbany_neq = 0x91, - midgard_alu_op_fbany_lt = 0x92, /* any(lessThan(.., ..)) */ - midgard_alu_op_fbany_lte = 0x93, /* any(lessThanEqual(.., ..)) */ + midgard_alu_op_fbany_eq = 0x90, + midgard_alu_op_fbany_neq = 0x91, + midgard_alu_op_fbany_lt = 0x92, /* any(lessThan(.., ..)) */ + midgard_alu_op_fbany_lte = 
0x93, /* any(lessThanEqual(.., ..)) */ - midgard_alu_op_f2i_rte = 0x98, - midgard_alu_op_f2i_rtz = 0x99, - midgard_alu_op_f2i_rtn = 0x9A, - midgard_alu_op_f2i_rtp = 0x9B, - midgard_alu_op_f2u_rte = 0x9C, - midgard_alu_op_f2u_rtz = 0x9D, - midgard_alu_op_f2u_rtn = 0x9E, - midgard_alu_op_f2u_rtp = 0x9F, + midgard_alu_op_f2i_rte = 0x98, + midgard_alu_op_f2i_rtz = 0x99, + midgard_alu_op_f2i_rtn = 0x9A, + midgard_alu_op_f2i_rtp = 0x9B, + midgard_alu_op_f2u_rte = 0x9C, + midgard_alu_op_f2u_rtz = 0x9D, + midgard_alu_op_f2u_rtn = 0x9E, + midgard_alu_op_f2u_rtp = 0x9F, - midgard_alu_op_ieq = 0xA0, - midgard_alu_op_ine = 0xA1, - midgard_alu_op_ult = 0xA2, - midgard_alu_op_ule = 0xA3, - midgard_alu_op_ilt = 0xA4, - midgard_alu_op_ile = 0xA5, - midgard_alu_op_iball_eq = 0xA8, - midgard_alu_op_iball_neq = 0xA9, - midgard_alu_op_uball_lt = 0xAA, - midgard_alu_op_uball_lte = 0xAB, - midgard_alu_op_iball_lt = 0xAC, - midgard_alu_op_iball_lte = 0xAD, + midgard_alu_op_ieq = 0xA0, + midgard_alu_op_ine = 0xA1, + midgard_alu_op_ult = 0xA2, + midgard_alu_op_ule = 0xA3, + midgard_alu_op_ilt = 0xA4, + midgard_alu_op_ile = 0xA5, + midgard_alu_op_iball_eq = 0xA8, + midgard_alu_op_iball_neq = 0xA9, + midgard_alu_op_uball_lt = 0xAA, + midgard_alu_op_uball_lte = 0xAB, + midgard_alu_op_iball_lt = 0xAC, + midgard_alu_op_iball_lte = 0xAD, - midgard_alu_op_ibany_eq = 0xB0, - midgard_alu_op_ibany_neq = 0xB1, - midgard_alu_op_ubany_lt = 0xB2, - midgard_alu_op_ubany_lte = 0xB3, - midgard_alu_op_ibany_lt = 0xB4, /* any(lessThan(.., ..)) */ - midgard_alu_op_ibany_lte = 0xB5, /* any(lessThanEqual(.., ..)) */ - midgard_alu_op_i2f_rte = 0xB8, - midgard_alu_op_i2f_rtz = 0xB9, - midgard_alu_op_i2f_rtn = 0xBA, - midgard_alu_op_i2f_rtp = 0xBB, - midgard_alu_op_u2f_rte = 0xBC, - midgard_alu_op_u2f_rtz = 0xBD, - midgard_alu_op_u2f_rtn = 0xBE, - midgard_alu_op_u2f_rtp = 0xBF, + midgard_alu_op_ibany_eq = 0xB0, + midgard_alu_op_ibany_neq = 0xB1, + midgard_alu_op_ubany_lt = 0xB2, + midgard_alu_op_ubany_lte = 0xB3, + midgard_alu_op_ibany_lt = 0xB4, /* any(lessThan(.., ..)) */ + midgard_alu_op_ibany_lte = 0xB5, /* any(lessThanEqual(.., ..)) */ + midgard_alu_op_i2f_rte = 0xB8, + midgard_alu_op_i2f_rtz = 0xB9, + midgard_alu_op_i2f_rtn = 0xBA, + midgard_alu_op_i2f_rtp = 0xBB, + midgard_alu_op_u2f_rte = 0xBC, + midgard_alu_op_u2f_rtz = 0xBD, + midgard_alu_op_u2f_rtn = 0xBE, + midgard_alu_op_u2f_rtp = 0xBF, - /* All csel* instructions use as a condition the output of the previous - * vector or scalar unit, thus it must run on the second pipeline stage - * and be scheduled to the same bundle as the opcode that it uses as a - * condition. */ - midgard_alu_op_icsel_v = 0xC0, - midgard_alu_op_icsel = 0xC1, - midgard_alu_op_fcsel_v = 0xC4, - midgard_alu_op_fcsel = 0xC5, - midgard_alu_op_froundaway = 0xC6, /* round to nearest away */ + /* All csel* instructions use as a condition the output of the previous + * vector or scalar unit, thus it must run on the second pipeline stage + * and be scheduled to the same bundle as the opcode that it uses as a + * condition. 
*/ + midgard_alu_op_icsel_v = 0xC0, + midgard_alu_op_icsel = 0xC1, + midgard_alu_op_fcsel_v = 0xC4, + midgard_alu_op_fcsel = 0xC5, + midgard_alu_op_froundaway = 0xC6, /* round to nearest away */ - midgard_alu_op_fatan2_pt2 = 0xE8, - midgard_alu_op_fpow_pt1 = 0xEC, - midgard_alu_op_fpown_pt1 = 0xED, - midgard_alu_op_fpowr_pt1 = 0xEE, + midgard_alu_op_fatan2_pt2 = 0xE8, + midgard_alu_op_fpow_pt1 = 0xEC, + midgard_alu_op_fpown_pt1 = 0xED, + midgard_alu_op_fpowr_pt1 = 0xEE, - midgard_alu_op_frcp = 0xF0, - midgard_alu_op_frsqrt = 0xF2, - midgard_alu_op_fsqrt = 0xF3, - midgard_alu_op_fexp2 = 0xF4, - midgard_alu_op_flog2 = 0xF5, - midgard_alu_op_fsinpi = 0xF6, /* sin(pi * x) */ - midgard_alu_op_fcospi = 0xF7, /* cos(pi * x) */ - midgard_alu_op_fatan2_pt1 = 0xF9, + midgard_alu_op_frcp = 0xF0, + midgard_alu_op_frsqrt = 0xF2, + midgard_alu_op_fsqrt = 0xF3, + midgard_alu_op_fexp2 = 0xF4, + midgard_alu_op_flog2 = 0xF5, + midgard_alu_op_fsinpi = 0xF6, /* sin(pi * x) */ + midgard_alu_op_fcospi = 0xF7, /* cos(pi * x) */ + midgard_alu_op_fatan2_pt1 = 0xF9, } midgard_alu_op; typedef enum { - midgard_outmod_none = 0, - midgard_outmod_clamp_0_inf = 1, /* max(x, 0.0), NaNs become +0.0 */ - midgard_outmod_clamp_m1_1 = 2, /* clamp(x, -1.0, 1.0), NaNs become -1.0 */ - midgard_outmod_clamp_0_1 = 3 /* clamp(x, 0.0, 1.0), NaNs become +0.0 */ + midgard_outmod_none = 0, + midgard_outmod_clamp_0_inf = 1, /* max(x, 0.0), NaNs become +0.0 */ + midgard_outmod_clamp_m1_1 = 2, /* clamp(x, -1.0, 1.0), NaNs become -1.0 */ + midgard_outmod_clamp_0_1 = 3 /* clamp(x, 0.0, 1.0), NaNs become +0.0 */ } midgard_outmod_float; -/* These are applied to the resulting value that's going to be stored in the dest reg. - * This should be set to midgard_outmod_keeplo when shrink_mode is midgard_shrink_mode_none. */ +/* These are applied to the resulting value that's going to be stored in the + * dest reg. This should be set to midgard_outmod_keeplo when shrink_mode is + * midgard_shrink_mode_none. */ typedef enum { - midgard_outmod_ssat = 0, - midgard_outmod_usat = 1, - midgard_outmod_keeplo = 2, /* Keep low half */ - midgard_outmod_keephi = 3, /* Keep high half */ + midgard_outmod_ssat = 0, + midgard_outmod_usat = 1, + midgard_outmod_keeplo = 2, /* Keep low half */ + midgard_outmod_keephi = 3, /* Keep high half */ } midgard_outmod_int; typedef enum { - midgard_reg_mode_8 = 0, - midgard_reg_mode_16 = 1, - midgard_reg_mode_32 = 2, - midgard_reg_mode_64 = 3 + midgard_reg_mode_8 = 0, + midgard_reg_mode_16 = 1, + midgard_reg_mode_32 = 2, + midgard_reg_mode_64 = 3 } midgard_reg_mode; typedef enum { - midgard_shrink_mode_lower = 0, - midgard_shrink_mode_upper = 1, - midgard_shrink_mode_none = 2 + midgard_shrink_mode_lower = 0, + midgard_shrink_mode_upper = 1, + midgard_shrink_mode_none = 2 } midgard_shrink_mode; /* Only used if midgard_src_expand_mode is set to one of midgard_src_expand_*. */ typedef enum { - midgard_int_sign_extend = 0, - midgard_int_zero_extend = 1, - midgard_int_replicate = 2, - midgard_int_left_shift = 3 + midgard_int_sign_extend = 0, + midgard_int_zero_extend = 1, + midgard_int_replicate = 2, + midgard_int_left_shift = 3 } midgard_int_mod; -/* Unlike midgard_int_mod, fload modifiers are applied after the expansion happens, so - * they don't depend on midgard_src_expand_mode. */ +/* Unlike midgard_int_mod, fload modifiers are applied after the expansion + * happens, so they don't depend on midgard_src_expand_mode. 
*/ #define MIDGARD_FLOAT_MOD_ABS (1 << 0) #define MIDGARD_FLOAT_MOD_NEG (1 << 1) @@ -281,78 +283,63 @@ typedef enum { * extended, resulting in a vec4 where each 32-bit element corresponds to a * 16-bit element from the low 64-bits of the input vector. */ typedef enum { - midgard_src_passthrough = 0, - midgard_src_rep_low = 1, /* replicate lower 64 bits to higher 64 bits */ - midgard_src_rep_high = 2, /* replicate higher 64 bits to lower 64 bits */ - midgard_src_swap = 3, /* swap lower 64 bits with higher 64 bits */ - midgard_src_expand_low = 4, /* expand low 64 bits */ - midgard_src_expand_high = 5, /* expand high 64 bits */ - midgard_src_expand_low_swap = 6, /* expand low 64 bits, then swap */ - midgard_src_expand_high_swap = 7, /* expand high 64 bits, then swap */ + midgard_src_passthrough = 0, + midgard_src_rep_low = 1, /* replicate lower 64 bits to higher 64 bits */ + midgard_src_rep_high = 2, /* replicate higher 64 bits to lower 64 bits */ + midgard_src_swap = 3, /* swap lower 64 bits with higher 64 bits */ + midgard_src_expand_low = 4, /* expand low 64 bits */ + midgard_src_expand_high = 5, /* expand high 64 bits */ + midgard_src_expand_low_swap = 6, /* expand low 64 bits, then swap */ + midgard_src_expand_high_swap = 7, /* expand high 64 bits, then swap */ } midgard_src_expand_mode; -#define INPUT_EXPANDS(a) \ - (a >= midgard_src_expand_low && a <= midgard_src_expand_high_swap) +#define INPUT_EXPANDS(a) \ + (a >= midgard_src_expand_low && a <= midgard_src_expand_high_swap) -#define INPUT_SWAPS(a) \ - (a == midgard_src_swap || a >= midgard_src_expand_low_swap) +#define INPUT_SWAPS(a) \ + (a == midgard_src_swap || a >= midgard_src_expand_low_swap) -typedef struct -__attribute__((__packed__)) -{ - /* Either midgard_int_mod or from midgard_float_mod_*, depending on the - * type of op */ - unsigned mod : 2; - midgard_src_expand_mode expand_mode : 3; - unsigned swizzle : 8; -} -midgard_vector_alu_src; +typedef struct __attribute__((__packed__)) { + /* Either midgard_int_mod or from midgard_float_mod_*, depending on the + * type of op */ + unsigned mod : 2; + midgard_src_expand_mode expand_mode : 3; + unsigned swizzle : 8; +} midgard_vector_alu_src; -typedef struct -__attribute__((__packed__)) -{ - midgard_alu_op op : 8; - midgard_reg_mode reg_mode : 2; - unsigned src1 : 13; - unsigned src2 : 13; - midgard_shrink_mode shrink_mode : 2; - unsigned outmod : 2; - unsigned mask : 8; -} -midgard_vector_alu; +typedef struct __attribute__((__packed__)) { + midgard_alu_op op : 8; + midgard_reg_mode reg_mode : 2; + unsigned src1 : 13; + unsigned src2 : 13; + midgard_shrink_mode shrink_mode : 2; + unsigned outmod : 2; + unsigned mask : 8; +} midgard_vector_alu; -typedef struct -__attribute__((__packed__)) -{ - unsigned mod : 2; - bool full : 1; /* 0 = 16-bit, 1 = 32-bit */ - unsigned component : 3; -} -midgard_scalar_alu_src; +typedef struct __attribute__((__packed__)) { + unsigned mod : 2; + bool full : 1; /* 0 = 16-bit, 1 = 32-bit */ + unsigned component : 3; +} midgard_scalar_alu_src; -typedef struct -__attribute__((__packed__)) -{ - midgard_alu_op op : 8; - unsigned src1 : 6; - /* last 5 bits are used when src2 is an immediate */ - unsigned src2 : 11; - unsigned reserved : 1; - unsigned outmod : 2; - bool output_full : 1; - unsigned output_component : 3; -} -midgard_scalar_alu; +typedef struct __attribute__((__packed__)) { + midgard_alu_op op : 8; + unsigned src1 : 6; + /* last 5 bits are used when src2 is an immediate */ + unsigned src2 : 11; + unsigned reserved : 1; + unsigned outmod : 2; + 
bool output_full : 1; + unsigned output_component : 3; +} midgard_scalar_alu; -typedef struct -__attribute__((__packed__)) -{ - unsigned src1_reg : 5; - unsigned src2_reg : 5; - unsigned out_reg : 5; - bool src2_imm : 1; -} -midgard_reg_info; +typedef struct __attribute__((__packed__)) { + unsigned src1_reg : 5; + unsigned src2_reg : 5; + unsigned out_reg : 5; + bool src2_imm : 1; +} midgard_reg_info; /* In addition to conditional branches and jumps (unconditional branches), * Midgard implements a bit of fixed function functionality used in fragment @@ -361,679 +348,647 @@ midgard_reg_info; * fixed-function operation as the branch condition. */ typedef enum { - /* Regular branches */ - midgard_jmp_writeout_op_branch_uncond = 1, - midgard_jmp_writeout_op_branch_cond = 2, + /* Regular branches */ + midgard_jmp_writeout_op_branch_uncond = 1, + midgard_jmp_writeout_op_branch_cond = 2, - /* In a fragment shader, execute a discard_if instruction, with the - * corresponding condition code. Terminates the shader, so generally - * set the branch target to out of the shader */ - midgard_jmp_writeout_op_discard = 4, + /* In a fragment shader, execute a discard_if instruction, with the + * corresponding condition code. Terminates the shader, so generally + * set the branch target to out of the shader */ + midgard_jmp_writeout_op_discard = 4, - /* Branch if the tilebuffer is not yet ready. At the beginning of a - * fragment shader that reads from the tile buffer, for instance via - * ARM_shader_framebuffer_fetch or EXT_pixel_local_storage, this branch - * operation should be used as a loop. An instruction like - * "br.tilebuffer.always -1" does the trick, corresponding to - * "while(!is_tilebuffer_ready) */ - midgard_jmp_writeout_op_tilebuffer_pending = 6, + /* Branch if the tilebuffer is not yet ready. At the beginning of a + * fragment shader that reads from the tile buffer, for instance via + * ARM_shader_framebuffer_fetch or EXT_pixel_local_storage, this branch + * operation should be used as a loop. An instruction like + * "br.tilebuffer.always -1" does the trick, corresponding to + * "while(!is_tilebuffer_ready) */ + midgard_jmp_writeout_op_tilebuffer_pending = 6, - /* In a fragment shader, try to write out the value pushed to r0 to the - * tilebuffer, subject to state in r1.z and r1.w. If this - * succeeds, the shader terminates. If it fails, it branches to the - * specified branch target. Generally, this should be used in a loop to - * itself, acting as "do { write(r0); } while(!write_successful);" */ - midgard_jmp_writeout_op_writeout = 7, + /* In a fragment shader, try to write out the value pushed to r0 to the + * tilebuffer, subject to state in r1.z and r1.w. If this + * succeeds, the shader terminates. If it fails, it branches to the + * specified branch target. Generally, this should be used in a loop to + * itself, acting as "do { write(r0); } while(!write_successful);" */ + midgard_jmp_writeout_op_writeout = 7, } midgard_jmp_writeout_op; typedef enum { - midgard_condition_write0 = 0, + midgard_condition_write0 = 0, - /* These condition codes denote a conditional branch on FALSE and on - * TRUE respectively */ - midgard_condition_false = 1, - midgard_condition_true = 2, + /* These condition codes denote a conditional branch on FALSE and on + * TRUE respectively */ + midgard_condition_false = 1, + midgard_condition_true = 2, - /* This condition code always branches. 
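Side note (illustration only, not part of this patch): size sanity checks for the packed encodings above. The bitfield widths sum to 16 bits for midgard_reg_info, 32 bits for midgard_scalar_alu and 48 bits for midgard_vector_alu, so with GCC/Clang's packed-bitfield layout the following C11 assertions should hold; treat them as a sketch rather than something asserted by this header.

   #include <assert.h>

   #include "midgard.h"

   static_assert(sizeof(midgard_reg_info) == 2, "reg word is 16 bits");
   static_assert(sizeof(midgard_scalar_alu) == 4, "scalar ALU word is 32 bits");
   static_assert(sizeof(midgard_vector_alu) == 6, "vector ALU word is 48 bits");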
For a pure branch, the - * unconditional branch coding should be used instead, but for - * fixed-function branch opcodes, this is still useful */ - midgard_condition_always = 3, + /* This condition code always branches. For a pure branch, the + * unconditional branch coding should be used instead, but for + * fixed-function branch opcodes, this is still useful */ + midgard_condition_always = 3, } midgard_condition; enum midgard_call_mode { - midgard_call_mode_default = 1, - midgard_call_mode_call = 2, - midgard_call_mode_return = 3 + midgard_call_mode_default = 1, + midgard_call_mode_call = 2, + midgard_call_mode_return = 3 }; -typedef struct -__attribute__((__packed__)) -{ - midgard_jmp_writeout_op op : 3; /* == branch_uncond */ - unsigned dest_tag : 4; /* tag of branch destination */ - enum midgard_call_mode call_mode : 2; - int offset : 7; -} -midgard_branch_uncond; +typedef struct __attribute__((__packed__)) { + midgard_jmp_writeout_op op : 3; /* == branch_uncond */ + unsigned dest_tag : 4; /* tag of branch destination */ + enum midgard_call_mode call_mode : 2; + int offset : 7; +} midgard_branch_uncond; -typedef struct -__attribute__((__packed__)) -{ - midgard_jmp_writeout_op op : 3; /* == branch_cond */ - unsigned dest_tag : 4; /* tag of branch destination */ - int offset : 7; - midgard_condition cond : 2; -} -midgard_branch_cond; +typedef struct __attribute__((__packed__)) { + midgard_jmp_writeout_op op : 3; /* == branch_cond */ + unsigned dest_tag : 4; /* tag of branch destination */ + int offset : 7; + midgard_condition cond : 2; +} midgard_branch_cond; -typedef struct -__attribute__((__packed__)) -{ - midgard_jmp_writeout_op op : 3; /* == branch_cond */ - unsigned dest_tag : 4; /* tag of branch destination */ - enum midgard_call_mode call_mode : 2; - signed offset : 23; +typedef struct __attribute__((__packed__)) { + midgard_jmp_writeout_op op : 3; /* == branch_cond */ + unsigned dest_tag : 4; /* tag of branch destination */ + enum midgard_call_mode call_mode : 2; + signed offset : 23; - /* Extended branches permit inputting up to 4 conditions loaded into - * r31 (two in r31.w and two in r31.x). In the most general case, we - * specify a function f(A, B, C, D) mapping 4 1-bit conditions to a - * single 1-bit branch criteria. Note that the domain of f has 2^(2^4) - * elements, each mapping to 1-bit of output, so we can trivially - * construct a Godel numbering of f as a (2^4)=16-bit integer. This - * 16-bit integer serves as a lookup table to compute f, subject to - * some swaps for ordering. - * - * Interesting, the standard 2-bit condition codes are also a LUT with - * the same format (2^1-bit), but it's usually easier to use enums. */ + /* Extended branches permit inputting up to 4 conditions loaded into + * r31 (two in r31.w and two in r31.x). In the most general case, we + * specify a function f(A, B, C, D) mapping 4 1-bit conditions to a + * single 1-bit branch criteria. Note that the domain of f has 2^(2^4) + * elements, each mapping to 1-bit of output, so we can trivially + * construct a Godel numbering of f as a (2^4)=16-bit integer. This + * 16-bit integer serves as a lookup table to compute f, subject to + * some swaps for ordering. + * + * Interesting, the standard 2-bit condition codes are also a LUT with + * the same format (2^1-bit), but it's usually easier to use enums. 
*/ - unsigned cond : 16; -} -midgard_branch_extended; + unsigned cond : 16; +} midgard_branch_extended; -typedef struct -__attribute__((__packed__)) -{ - midgard_jmp_writeout_op op : 3; /* == writeout */ - unsigned unknown : 13; -} -midgard_writeout; +typedef struct __attribute__((__packed__)) { + midgard_jmp_writeout_op op : 3; /* == writeout */ + unsigned unknown : 13; +} midgard_writeout; /* * Load/store words */ typedef enum { - midgard_op_ld_st_noop = 0x03, + midgard_op_ld_st_noop = 0x03, - /* Unpacks a colour from a native format to */ - midgard_op_unpack_colour_f32 = 0x04, - midgard_op_unpack_colour_f16 = 0x05, - midgard_op_unpack_colour_u32 = 0x06, - midgard_op_unpack_colour_s32 = 0x07, + /* Unpacks a colour from a native format to */ + midgard_op_unpack_colour_f32 = 0x04, + midgard_op_unpack_colour_f16 = 0x05, + midgard_op_unpack_colour_u32 = 0x06, + midgard_op_unpack_colour_s32 = 0x07, - /* Packs a colour from to a native format */ - midgard_op_pack_colour_f32 = 0x08, - midgard_op_pack_colour_f16 = 0x09, - midgard_op_pack_colour_u32 = 0x0A, - midgard_op_pack_colour_s32 = 0x0B, + /* Packs a colour from to a native format */ + midgard_op_pack_colour_f32 = 0x08, + midgard_op_pack_colour_f16 = 0x09, + midgard_op_pack_colour_u32 = 0x0A, + midgard_op_pack_colour_s32 = 0x0B, - /* Computes the effective address of a mem address expression */ - midgard_op_lea = 0x0C, + /* Computes the effective address of a mem address expression */ + midgard_op_lea = 0x0C, - /* Converts image coordinates into mem address */ - midgard_op_lea_image = 0x0D, + /* Converts image coordinates into mem address */ + midgard_op_lea_image = 0x0D, - /* Unclear why this is on the L/S unit, but moves fp32 cube map - * coordinates in r27 to its cube map texture coordinate destination - * (e.g r29). */ + /* Unclear why this is on the L/S unit, but moves fp32 cube map + * coordinates in r27 to its cube map texture coordinate destination + * (e.g r29). */ - midgard_op_ld_cubemap_coords = 0x0E, + midgard_op_ld_cubemap_coords = 0x0E, - /* A mov between registers that the ldst pipeline can access */ - midgard_op_ldst_mov = 0x10, + /* A mov between registers that the ldst pipeline can access */ + midgard_op_ldst_mov = 0x10, - /* The L/S unit can do perspective division a clock faster than the ALU - * if you're lucky. Put the vec4 in r27, and call with 0x24 as the - * unknown state; the output will be . Replace w with - * z for the z version */ - midgard_op_ldst_perspective_div_y = 0x11, - midgard_op_ldst_perspective_div_z = 0x12, - midgard_op_ldst_perspective_div_w = 0x13, + /* The L/S unit can do perspective division a clock faster than the ALU + * if you're lucky. Put the vec4 in r27, and call with 0x24 as the + * unknown state; the output will be . Replace w with + * z for the z version */ + midgard_op_ldst_perspective_div_y = 0x11, + midgard_op_ldst_perspective_div_z = 0x12, + midgard_op_ldst_perspective_div_w = 0x13, - /* val in r27.y, address embedded, outputs result to argument. Invert val for sub. Let val = +-1 for inc/dec. */ - midgard_op_atomic_add = 0x40, - midgard_op_atomic_add64 = 0x41, - midgard_op_atomic_add_be = 0x42, - midgard_op_atomic_add64_be = 0x43, + /* val in r27.y, address embedded, outputs result to argument. Invert val for + sub. Let val = +-1 for inc/dec. 
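Side note (illustration only, not part of this patch): a sketch of building the 16-bit truth table that midgard_branch_extended::cond expects, per the comment above. The index convention below -- condition combination i selects bit i of the LUT, with A as the least significant input -- is an assumption; the hardware additionally applies the ordering swaps the comment mentions. branch_cond_lut is a made-up helper name.

   #include <stdbool.h>
   #include <stdint.h>

   static uint16_t
   branch_cond_lut(bool (*f)(bool, bool, bool, bool))
   {
      uint16_t lut = 0;

      /* Enumerate all 2^4 combinations of the four 1-bit conditions and
       * record f's output for each, giving the 16-bit Godel number. */
      for (unsigned i = 0; i < 16; ++i) {
         bool a = i & 1, b = i & 2, c = i & 4, d = i & 8;

         if (f(a, b, c, d))
            lut |= 1u << i;
      }

      return lut;
   }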
*/ + midgard_op_atomic_add = 0x40, + midgard_op_atomic_add64 = 0x41, + midgard_op_atomic_add_be = 0x42, + midgard_op_atomic_add64_be = 0x43, - midgard_op_atomic_and = 0x44, - midgard_op_atomic_and64 = 0x45, - midgard_op_atomic_and_be = 0x46, - midgard_op_atomic_and64_be = 0x47, - midgard_op_atomic_or = 0x48, - midgard_op_atomic_or64 = 0x49, - midgard_op_atomic_or_be = 0x4A, - midgard_op_atomic_or64_be = 0x4B, - midgard_op_atomic_xor = 0x4C, - midgard_op_atomic_xor64 = 0x4D, - midgard_op_atomic_xor_be = 0x4E, - midgard_op_atomic_xor64_be = 0x4F, + midgard_op_atomic_and = 0x44, + midgard_op_atomic_and64 = 0x45, + midgard_op_atomic_and_be = 0x46, + midgard_op_atomic_and64_be = 0x47, + midgard_op_atomic_or = 0x48, + midgard_op_atomic_or64 = 0x49, + midgard_op_atomic_or_be = 0x4A, + midgard_op_atomic_or64_be = 0x4B, + midgard_op_atomic_xor = 0x4C, + midgard_op_atomic_xor64 = 0x4D, + midgard_op_atomic_xor_be = 0x4E, + midgard_op_atomic_xor64_be = 0x4F, - midgard_op_atomic_imin = 0x50, - midgard_op_atomic_imin64 = 0x51, - midgard_op_atomic_imin_be = 0x52, - midgard_op_atomic_imin64_be = 0x53, - midgard_op_atomic_umin = 0x54, - midgard_op_atomic_umin64 = 0x55, - midgard_op_atomic_umin_be = 0x56, - midgard_op_atomic_umin64_be = 0x57, - midgard_op_atomic_imax = 0x58, - midgard_op_atomic_imax64 = 0x59, - midgard_op_atomic_imax_be = 0x5A, - midgard_op_atomic_imax64_be = 0x5B, - midgard_op_atomic_umax = 0x5C, - midgard_op_atomic_umax64 = 0x5D, - midgard_op_atomic_umax_be = 0x5E, - midgard_op_atomic_umax64_be = 0x5F, + midgard_op_atomic_imin = 0x50, + midgard_op_atomic_imin64 = 0x51, + midgard_op_atomic_imin_be = 0x52, + midgard_op_atomic_imin64_be = 0x53, + midgard_op_atomic_umin = 0x54, + midgard_op_atomic_umin64 = 0x55, + midgard_op_atomic_umin_be = 0x56, + midgard_op_atomic_umin64_be = 0x57, + midgard_op_atomic_imax = 0x58, + midgard_op_atomic_imax64 = 0x59, + midgard_op_atomic_imax_be = 0x5A, + midgard_op_atomic_imax64_be = 0x5B, + midgard_op_atomic_umax = 0x5C, + midgard_op_atomic_umax64 = 0x5D, + midgard_op_atomic_umax_be = 0x5E, + midgard_op_atomic_umax64_be = 0x5F, - midgard_op_atomic_xchg = 0x60, - midgard_op_atomic_xchg64 = 0x61, - midgard_op_atomic_xchg_be = 0x62, - midgard_op_atomic_xchg64_be = 0x63, + midgard_op_atomic_xchg = 0x60, + midgard_op_atomic_xchg64 = 0x61, + midgard_op_atomic_xchg_be = 0x62, + midgard_op_atomic_xchg64_be = 0x63, - midgard_op_atomic_cmpxchg = 0x64, - midgard_op_atomic_cmpxchg64 = 0x65, - midgard_op_atomic_cmpxchg_be = 0x66, - midgard_op_atomic_cmpxchg64_be = 0x67, + midgard_op_atomic_cmpxchg = 0x64, + midgard_op_atomic_cmpxchg64 = 0x65, + midgard_op_atomic_cmpxchg_be = 0x66, + midgard_op_atomic_cmpxchg64_be = 0x67, - /* Used for compute shader's __global arguments, __local - * variables (or for register spilling) */ + /* Used for compute shader's __global arguments, __local + * variables (or for register spilling) */ - midgard_op_ld_u8 = 0x80, /* zero extends */ - midgard_op_ld_i8 = 0x81, /* sign extends */ - midgard_op_ld_u16 = 0x84, /* zero extends */ - midgard_op_ld_i16 = 0x85, /* sign extends */ - midgard_op_ld_u16_be = 0x86, /* zero extends, big endian */ - midgard_op_ld_i16_be = 0x87, /* sign extends, big endian */ - midgard_op_ld_32 = 0x88, /* short2, int, float */ - midgard_op_ld_32_bswap2 = 0x89, /* 16-bit big endian vector */ - midgard_op_ld_32_bswap4 = 0x8A, /* 32-bit big endian scalar */ - midgard_op_ld_64 = 0x8C, /* int2, float2, long */ - midgard_op_ld_64_bswap2 = 0x8D, /* 16-bit big endian vector */ - midgard_op_ld_64_bswap4 = 0x8E, /* 32-bit big 
endian vector */ - midgard_op_ld_64_bswap8 = 0x8F, /* 64-bit big endian scalar */ - midgard_op_ld_128 = 0x90, /* float4, long2 */ - midgard_op_ld_128_bswap2 = 0x91, /* 16-bit big endian vector */ - midgard_op_ld_128_bswap4 = 0x92, /* 32-bit big endian vector */ - midgard_op_ld_128_bswap8 = 0x93, /* 64-bit big endian vector */ + midgard_op_ld_u8 = 0x80, /* zero extends */ + midgard_op_ld_i8 = 0x81, /* sign extends */ + midgard_op_ld_u16 = 0x84, /* zero extends */ + midgard_op_ld_i16 = 0x85, /* sign extends */ + midgard_op_ld_u16_be = 0x86, /* zero extends, big endian */ + midgard_op_ld_i16_be = 0x87, /* sign extends, big endian */ + midgard_op_ld_32 = 0x88, /* short2, int, float */ + midgard_op_ld_32_bswap2 = 0x89, /* 16-bit big endian vector */ + midgard_op_ld_32_bswap4 = 0x8A, /* 32-bit big endian scalar */ + midgard_op_ld_64 = 0x8C, /* int2, float2, long */ + midgard_op_ld_64_bswap2 = 0x8D, /* 16-bit big endian vector */ + midgard_op_ld_64_bswap4 = 0x8E, /* 32-bit big endian vector */ + midgard_op_ld_64_bswap8 = 0x8F, /* 64-bit big endian scalar */ + midgard_op_ld_128 = 0x90, /* float4, long2 */ + midgard_op_ld_128_bswap2 = 0x91, /* 16-bit big endian vector */ + midgard_op_ld_128_bswap4 = 0x92, /* 32-bit big endian vector */ + midgard_op_ld_128_bswap8 = 0x93, /* 64-bit big endian vector */ - midgard_op_ld_attr_32 = 0x94, - midgard_op_ld_attr_16 = 0x95, - midgard_op_ld_attr_32u = 0x96, - midgard_op_ld_attr_32i = 0x97, - midgard_op_ld_vary_32 = 0x98, - midgard_op_ld_vary_16 = 0x99, - midgard_op_ld_vary_32u = 0x9A, - midgard_op_ld_vary_32i = 0x9B, + midgard_op_ld_attr_32 = 0x94, + midgard_op_ld_attr_16 = 0x95, + midgard_op_ld_attr_32u = 0x96, + midgard_op_ld_attr_32i = 0x97, + midgard_op_ld_vary_32 = 0x98, + midgard_op_ld_vary_16 = 0x99, + midgard_op_ld_vary_32u = 0x9A, + midgard_op_ld_vary_32i = 0x9B, - /* This instruction behaves differently depending if the gpu is a v4 - * or a newer gpu. The main difference hinges on which values of the - * second argument are valid for each gpu. - * TODO: properly document and decode each possible value for the - * second argument. */ - midgard_op_ld_special_32f = 0x9C, - midgard_op_ld_special_16f = 0x9D, - midgard_op_ld_special_32u = 0x9E, - midgard_op_ld_special_32i = 0x9F, + /* This instruction behaves differently depending if the gpu is a v4 + * or a newer gpu. The main difference hinges on which values of the + * second argument are valid for each gpu. + * TODO: properly document and decode each possible value for the + * second argument. */ + midgard_op_ld_special_32f = 0x9C, + midgard_op_ld_special_16f = 0x9D, + midgard_op_ld_special_32u = 0x9E, + midgard_op_ld_special_32i = 0x9F, - /* The distinction between these ops is the alignment - * requirement / accompanying shift. Thus, the offset to - * ld_ubo_128 is in 16-byte units and can load 128-bit. The - * offset to ld_ubo_64 is in 8-byte units; ld_ubo_32 in 4-byte - * units. 
*/ - midgard_op_ld_ubo_u8 = 0xA0, /* theoretical */ - midgard_op_ld_ubo_i8 = 0xA1, /* theoretical */ - midgard_op_ld_ubo_u16 = 0xA4, /* theoretical */ - midgard_op_ld_ubo_i16 = 0xA5, /* theoretical */ - midgard_op_ld_ubo_u16_be = 0xA6, /* theoretical */ - midgard_op_ld_ubo_i16_be = 0xA7, /* theoretical */ - midgard_op_ld_ubo_32 = 0xA8, - midgard_op_ld_ubo_32_bswap2 = 0xA9, - midgard_op_ld_ubo_32_bswap4 = 0xAA, - midgard_op_ld_ubo_64 = 0xAC, - midgard_op_ld_ubo_64_bswap2 = 0xAD, - midgard_op_ld_ubo_64_bswap4 = 0xAE, - midgard_op_ld_ubo_64_bswap8 = 0xAF, - midgard_op_ld_ubo_128 = 0xB0, - midgard_op_ld_ubo_128_bswap2 = 0xB1, - midgard_op_ld_ubo_128_bswap4 = 0xB2, - midgard_op_ld_ubo_128_bswap8 = 0xB3, + /* The distinction between these ops is the alignment + * requirement / accompanying shift. Thus, the offset to + * ld_ubo_128 is in 16-byte units and can load 128-bit. The + * offset to ld_ubo_64 is in 8-byte units; ld_ubo_32 in 4-byte + * units. */ + midgard_op_ld_ubo_u8 = 0xA0, /* theoretical */ + midgard_op_ld_ubo_i8 = 0xA1, /* theoretical */ + midgard_op_ld_ubo_u16 = 0xA4, /* theoretical */ + midgard_op_ld_ubo_i16 = 0xA5, /* theoretical */ + midgard_op_ld_ubo_u16_be = 0xA6, /* theoretical */ + midgard_op_ld_ubo_i16_be = 0xA7, /* theoretical */ + midgard_op_ld_ubo_32 = 0xA8, + midgard_op_ld_ubo_32_bswap2 = 0xA9, + midgard_op_ld_ubo_32_bswap4 = 0xAA, + midgard_op_ld_ubo_64 = 0xAC, + midgard_op_ld_ubo_64_bswap2 = 0xAD, + midgard_op_ld_ubo_64_bswap4 = 0xAE, + midgard_op_ld_ubo_64_bswap8 = 0xAF, + midgard_op_ld_ubo_128 = 0xB0, + midgard_op_ld_ubo_128_bswap2 = 0xB1, + midgard_op_ld_ubo_128_bswap4 = 0xB2, + midgard_op_ld_ubo_128_bswap8 = 0xB3, - midgard_op_ld_image_32f = 0xB4, - midgard_op_ld_image_16f = 0xB5, - midgard_op_ld_image_32u = 0xB6, - midgard_op_ld_image_32i = 0xB7, + midgard_op_ld_image_32f = 0xB4, + midgard_op_ld_image_16f = 0xB5, + midgard_op_ld_image_32u = 0xB6, + midgard_op_ld_image_32i = 0xB7, - /* Only works on v5 or newer. - * Older cards must use ld_special with tilebuffer selectors. */ - midgard_op_ld_tilebuffer_32f = 0xB8, - midgard_op_ld_tilebuffer_16f = 0xB9, - midgard_op_ld_tilebuffer_raw = 0xBA, + /* Only works on v5 or newer. + * Older cards must use ld_special with tilebuffer selectors. 
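Side note (illustration only, not part of this patch): the ld_ubo_* offset scaling described above, written out. ld_ubo_offset_units is a made-up helper; it assumes midgard.h is on the include path and only handles the plain little-endian 32/64/128-bit variants.

   #include <assert.h>

   #include "midgard.h"

   static unsigned
   ld_ubo_offset_units(midgard_load_store_op op, unsigned byte_offset)
   {
      switch (op) {
      case midgard_op_ld_ubo_32:
         assert((byte_offset & 0x3) == 0);
         return byte_offset >> 2; /* 4-byte units */
      case midgard_op_ld_ubo_64:
         assert((byte_offset & 0x7) == 0);
         return byte_offset >> 3; /* 8-byte units */
      case midgard_op_ld_ubo_128:
         assert((byte_offset & 0xF) == 0);
         return byte_offset >> 4; /* 16-byte units */
      default:
         assert(!"only sketched for ld_ubo_32/64/128");
         return 0;
      }
   }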
*/ + midgard_op_ld_tilebuffer_32f = 0xB8, + midgard_op_ld_tilebuffer_16f = 0xB9, + midgard_op_ld_tilebuffer_raw = 0xBA, - midgard_op_st_u8 = 0xC0, /* zero extends */ - midgard_op_st_i8 = 0xC1, /* sign extends */ - midgard_op_st_u16 = 0xC4, /* zero extends */ - midgard_op_st_i16 = 0xC5, /* sign extends */ - midgard_op_st_u16_be = 0xC6, /* zero extends, big endian */ - midgard_op_st_i16_be = 0xC7, /* sign extends, big endian */ - midgard_op_st_32 = 0xC8, /* short2, int, float */ - midgard_op_st_32_bswap2 = 0xC9, /* 16-bit big endian vector */ - midgard_op_st_32_bswap4 = 0xCA, /* 32-bit big endian scalar */ - midgard_op_st_64 = 0xCC, /* int2, float2, long */ - midgard_op_st_64_bswap2 = 0xCD, /* 16-bit big endian vector */ - midgard_op_st_64_bswap4 = 0xCE, /* 32-bit big endian vector */ - midgard_op_st_64_bswap8 = 0xCF, /* 64-bit big endian scalar */ - midgard_op_st_128 = 0xD0, /* float4, long2 */ - midgard_op_st_128_bswap2 = 0xD1, /* 16-bit big endian vector */ - midgard_op_st_128_bswap4 = 0xD2, /* 32-bit big endian vector */ - midgard_op_st_128_bswap8 = 0xD3, /* 64-bit big endian vector */ + midgard_op_st_u8 = 0xC0, /* zero extends */ + midgard_op_st_i8 = 0xC1, /* sign extends */ + midgard_op_st_u16 = 0xC4, /* zero extends */ + midgard_op_st_i16 = 0xC5, /* sign extends */ + midgard_op_st_u16_be = 0xC6, /* zero extends, big endian */ + midgard_op_st_i16_be = 0xC7, /* sign extends, big endian */ + midgard_op_st_32 = 0xC8, /* short2, int, float */ + midgard_op_st_32_bswap2 = 0xC9, /* 16-bit big endian vector */ + midgard_op_st_32_bswap4 = 0xCA, /* 32-bit big endian scalar */ + midgard_op_st_64 = 0xCC, /* int2, float2, long */ + midgard_op_st_64_bswap2 = 0xCD, /* 16-bit big endian vector */ + midgard_op_st_64_bswap4 = 0xCE, /* 32-bit big endian vector */ + midgard_op_st_64_bswap8 = 0xCF, /* 64-bit big endian scalar */ + midgard_op_st_128 = 0xD0, /* float4, long2 */ + midgard_op_st_128_bswap2 = 0xD1, /* 16-bit big endian vector */ + midgard_op_st_128_bswap4 = 0xD2, /* 32-bit big endian vector */ + midgard_op_st_128_bswap8 = 0xD3, /* 64-bit big endian vector */ - midgard_op_st_vary_32 = 0xD4, - midgard_op_st_vary_16 = 0xD5, - midgard_op_st_vary_32u = 0xD6, - midgard_op_st_vary_32i = 0xD7, + midgard_op_st_vary_32 = 0xD4, + midgard_op_st_vary_16 = 0xD5, + midgard_op_st_vary_32u = 0xD6, + midgard_op_st_vary_32i = 0xD7, - /* Value to st in r27, location r26.w as short2 */ - midgard_op_st_image_32f = 0xD8, - midgard_op_st_image_16f = 0xD9, - midgard_op_st_image_32u = 0xDA, - midgard_op_st_image_32i = 0xDB, + /* Value to st in r27, location r26.w as short2 */ + midgard_op_st_image_32f = 0xD8, + midgard_op_st_image_16f = 0xD9, + midgard_op_st_image_32u = 0xDA, + midgard_op_st_image_32i = 0xDB, - midgard_op_st_special_32f = 0xDC, - midgard_op_st_special_16f = 0xDD, - midgard_op_st_special_32u = 0xDE, - midgard_op_st_special_32i = 0xDF, + midgard_op_st_special_32f = 0xDC, + midgard_op_st_special_16f = 0xDD, + midgard_op_st_special_32u = 0xDE, + midgard_op_st_special_32i = 0xDF, - /* Only works on v5 or newer. - * Older cards must use ld_special with tilebuffer selectors. */ - midgard_op_st_tilebuffer_32f = 0xE8, - midgard_op_st_tilebuffer_16f = 0xE9, - midgard_op_st_tilebuffer_raw = 0xEA, - midgard_op_trap = 0xFC, + /* Only works on v5 or newer. + * Older cards must use ld_special with tilebuffer selectors. 
*/ + midgard_op_st_tilebuffer_32f = 0xE8, + midgard_op_st_tilebuffer_16f = 0xE9, + midgard_op_st_tilebuffer_raw = 0xEA, + midgard_op_trap = 0xFC, } midgard_load_store_op; typedef enum { - midgard_interp_sample = 0, - midgard_interp_centroid = 1, - midgard_interp_default = 2 + midgard_interp_sample = 0, + midgard_interp_centroid = 1, + midgard_interp_default = 2 } midgard_interpolation; typedef enum { - midgard_varying_mod_none = 0, + midgard_varying_mod_none = 0, - /* Take the would-be result and divide all components by its y/z/w - * (perspective division baked in with the load) */ - midgard_varying_mod_perspective_y = 1, - midgard_varying_mod_perspective_z = 2, - midgard_varying_mod_perspective_w = 3, + /* Take the would-be result and divide all components by its y/z/w + * (perspective division baked in with the load) */ + midgard_varying_mod_perspective_y = 1, + midgard_varying_mod_perspective_z = 2, + midgard_varying_mod_perspective_w = 3, - /* The result is a 64-bit cubemap descriptor to use with - * midgard_tex_op_normal or midgard_tex_op_gradient */ - midgard_varying_mod_cubemap = 4, + /* The result is a 64-bit cubemap descriptor to use with + * midgard_tex_op_normal or midgard_tex_op_gradient */ + midgard_varying_mod_cubemap = 4, } midgard_varying_modifier; -typedef struct -__attribute__((__packed__)) -{ - midgard_varying_modifier modifier : 3; +typedef struct __attribute__((__packed__)) { + midgard_varying_modifier modifier : 3; - bool flat_shading : 1; + bool flat_shading : 1; - /* These are ignored if flat_shading is enabled. */ - bool perspective_correction : 1; - bool centroid_mapping : 1; + /* These are ignored if flat_shading is enabled. */ + bool perspective_correction : 1; + bool centroid_mapping : 1; - /* This is ignored if the shader only runs once per pixel. */ - bool interpolate_sample : 1; + /* This is ignored if the shader only runs once per pixel. */ + bool interpolate_sample : 1; - bool zero0 : 1; /* Always zero */ + bool zero0 : 1; /* Always zero */ - unsigned direct_sample_pos_x : 4; - unsigned direct_sample_pos_y : 4; -} -midgard_varying_params; + unsigned direct_sample_pos_x : 4; + unsigned direct_sample_pos_y : 4; +} midgard_varying_params; /* 8-bit register/etc selector for load/store ops */ -typedef struct -__attribute__((__packed__)) -{ - /* Indexes into the register */ - unsigned component : 2; +typedef struct __attribute__((__packed__)) { + /* Indexes into the register */ + unsigned component : 2; - /* Register select between r26/r27 */ - unsigned select : 1; + /* Register select between r26/r27 */ + unsigned select : 1; - unsigned unknown : 2; + unsigned unknown : 2; - /* Like any good Arm instruction set, load/store arguments can be - * implicitly left-shifted... but only the second argument. Zero for no - * shifting, up to <<7 possible though. This is useful for indexing. - * - * For the first argument, it's unknown what these bits mean */ - unsigned shift : 3; -} -midgard_ldst_register_select; + /* Like any good Arm instruction set, load/store arguments can be + * implicitly left-shifted... but only the second argument. Zero for no + * shifting, up to <<7 possible though. This is useful for indexing. 
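Side note (illustration only, not part of this patch): reference behaviour of the perspective varying modifiers above. The comment says all components are divided by the selected y/z/w component; whether the divisor lane itself is special-cased is not spelled out here, so this sketch divides all four as written. apply_varying_perspective is a made-up name and assumes midgard.h is included.

   #include "midgard.h"

   static void
   apply_varying_perspective(float v[4], midgard_varying_modifier mod)
   {
      float d;

      switch (mod) {
      case midgard_varying_mod_perspective_y: d = v[1]; break;
      case midgard_varying_mod_perspective_z: d = v[2]; break;
      case midgard_varying_mod_perspective_w: d = v[3]; break;
      default: return; /* none / cubemap: no division */
      }

      for (unsigned c = 0; c < 4; ++c)
         v[c] /= d;
   }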
+ * + * For the first argument, it's unknown what these bits mean */ + unsigned shift : 3; +} midgard_ldst_register_select; typedef enum { - /* 0 is reserved */ - midgard_index_address_u64 = 1, - midgard_index_address_u32 = 2, - midgard_index_address_s32 = 3, + /* 0 is reserved */ + midgard_index_address_u64 = 1, + midgard_index_address_u32 = 2, + midgard_index_address_s32 = 3, } midgard_index_address_format; -typedef struct -__attribute__((__packed__)) -{ - midgard_load_store_op op : 8; +typedef struct __attribute__((__packed__)) { + midgard_load_store_op op : 8; - /* Source/dest reg */ - unsigned reg : 5; + /* Source/dest reg */ + unsigned reg : 5; - /* Generally is a writemask. - * For ST_ATTR and ST_TEX, unused. - * For other stores, each bit masks 1/4th of the output. */ - unsigned mask : 4; + /* Generally is a writemask. + * For ST_ATTR and ST_TEX, unused. + * For other stores, each bit masks 1/4th of the output. */ + unsigned mask : 4; - /* Swizzle for stores, but for atomics it encodes also the source - * register. This fits because atomics dont need a swizzle since they - * are not vectorized instructions. */ - unsigned swizzle : 8; + /* Swizzle for stores, but for atomics it encodes also the source + * register. This fits because atomics dont need a swizzle since they + * are not vectorized instructions. */ + unsigned swizzle : 8; - /* Arg reg, meaning changes according to each opcode */ - unsigned arg_comp : 2; - unsigned arg_reg : 3; + /* Arg reg, meaning changes according to each opcode */ + unsigned arg_comp : 2; + unsigned arg_reg : 3; - /* 64-bit address enable - * 32-bit data type enable for CUBEMAP and perspective div. - * Explicit indexing enable for LD_ATTR. - * 64-bit coordinate enable for LD_IMAGE. */ - bool bitsize_toggle : 1; + /* 64-bit address enable + * 32-bit data type enable for CUBEMAP and perspective div. + * Explicit indexing enable for LD_ATTR. + * 64-bit coordinate enable for LD_IMAGE. */ + bool bitsize_toggle : 1; - /* These are mainly used for opcodes that have addresses. - * For cmpxchg, index_reg is used for the comparison value. - * For ops that access the attrib table, bit 1 encodes which table. - * For LD_VAR and LD/ST_ATTR, bit 0 enables dest/src type inferral. */ - midgard_index_address_format index_format : 2; - unsigned index_comp : 2; - unsigned index_reg : 3; - unsigned index_shift : 4; + /* These are mainly used for opcodes that have addresses. + * For cmpxchg, index_reg is used for the comparison value. + * For ops that access the attrib table, bit 1 encodes which table. + * For LD_VAR and LD/ST_ATTR, bit 0 enables dest/src type inferral. */ + midgard_index_address_format index_format : 2; + unsigned index_comp : 2; + unsigned index_reg : 3; + unsigned index_shift : 4; - /* Generaly is a signed offset, but has different bitsize and starts at - * different bits depending on the opcode, LDST_*_DISPLACEMENT helpers - * are recommended when packing/unpacking this attribute. - * For LD_UBO, bit 0 enables ubo index immediate. - * For LD_TILEBUFFER_RAW, bit 0 disables sample index immediate. */ - int signed_offset : 18; -} -midgard_load_store_word; + /* Generaly is a signed offset, but has different bitsize and starts at + * different bits depending on the opcode, LDST_*_DISPLACEMENT helpers + * are recommended when packing/unpacking this attribute. + * For LD_UBO, bit 0 enables ubo index immediate. + * For LD_TILEBUFFER_RAW, bit 0 disables sample index immediate. 
*/ + int signed_offset : 18; +} midgard_load_store_word; -typedef struct -__attribute__((__packed__)) -{ - unsigned type : 4; - unsigned next_type : 4; - uint64_t word1 : 60; - uint64_t word2 : 60; -} -midgard_load_store; +typedef struct __attribute__((__packed__)) { + unsigned type : 4; + unsigned next_type : 4; + uint64_t word1 : 60; + uint64_t word2 : 60; +} midgard_load_store; /* 8-bit register selector used in texture ops to select a bias/LOD/gradient * register, shoved into the `bias` field */ -typedef struct -__attribute__((__packed__)) -{ - /* 32-bit register, clear for half-register */ - unsigned full : 1; +typedef struct __attribute__((__packed__)) { + /* 32-bit register, clear for half-register */ + unsigned full : 1; - /* Register select between r28/r29 */ - unsigned select : 1; + /* Register select between r28/r29 */ + unsigned select : 1; - /* For a half-register, selects the upper half */ - unsigned upper : 1; + /* For a half-register, selects the upper half */ + unsigned upper : 1; - /* Indexes into the register */ - unsigned component : 2; + /* Indexes into the register */ + unsigned component : 2; - /* Padding to make this 8-bit */ - unsigned zero : 3; -} -midgard_tex_register_select; + /* Padding to make this 8-bit */ + unsigned zero : 3; +} midgard_tex_register_select; /* Texture pipeline results are in r28-r29 */ #define REG_TEX_BASE 28 enum mali_texture_op { - /* [texture + LOD bias] - * If the texture is mipmapped, barriers must be enabled in the - * instruction word in order for this opcode to compute the output - * correctly. */ - midgard_tex_op_normal = 1, + /* [texture + LOD bias] + * If the texture is mipmapped, barriers must be enabled in the + * instruction word in order for this opcode to compute the output + * correctly. */ + midgard_tex_op_normal = 1, - /* [texture + gradient for LOD and anisotropy] - * Unlike midgard_tex_op_normal, this opcode does not require barriers - * to compute the output correctly. */ - midgard_tex_op_gradient = 2, + /* [texture + gradient for LOD and anisotropy] + * Unlike midgard_tex_op_normal, this opcode does not require barriers + * to compute the output correctly. */ + midgard_tex_op_gradient = 2, - /* [unfiltered texturing] - * Unlike midgard_tex_op_normal, this opcode does not require barriers - * to compute the output correctly. */ - midgard_tex_op_fetch = 4, + /* [unfiltered texturing] + * Unlike midgard_tex_op_normal, this opcode does not require barriers + * to compute the output correctly. */ + midgard_tex_op_fetch = 4, - /* [gradient from derivative] */ - midgard_tex_op_grad_from_derivative = 9, + /* [gradient from derivative] */ + midgard_tex_op_grad_from_derivative = 9, - /* [mov] */ - midgard_tex_op_mov = 10, + /* [mov] */ + midgard_tex_op_mov = 10, - /* [noop] - * Mostly used for barriers. */ - midgard_tex_op_barrier = 11, + /* [noop] + * Mostly used for barriers. */ + midgard_tex_op_barrier = 11, - /* [gradient from coords] */ - midgard_tex_op_grad_from_coords = 12, + /* [gradient from coords] */ + midgard_tex_op_grad_from_coords = 12, - /* [derivative] - * Computes derivatives in 2x2 fragment blocks. */ - midgard_tex_op_derivative = 13 + /* [derivative] + * Computes derivatives in 2x2 fragment blocks. 
*/ + midgard_tex_op_derivative = 13 }; enum mali_sampler_type { - /* 0 is reserved */ - MALI_SAMPLER_FLOAT = 0x1, /* sampler */ - MALI_SAMPLER_UNSIGNED = 0x2, /* usampler */ - MALI_SAMPLER_SIGNED = 0x3, /* isampler */ + /* 0 is reserved */ + MALI_SAMPLER_FLOAT = 0x1, /* sampler */ + MALI_SAMPLER_UNSIGNED = 0x2, /* usampler */ + MALI_SAMPLER_SIGNED = 0x3, /* isampler */ }; /* Texture modes */ enum mali_texture_mode { - TEXTURE_NORMAL = 1, - TEXTURE_SHADOW = 5, - TEXTURE_GATHER_SHADOW = 6, - TEXTURE_GATHER_X = 8, - TEXTURE_GATHER_Y = 9, - TEXTURE_GATHER_Z = 10, - TEXTURE_GATHER_W = 11, + TEXTURE_NORMAL = 1, + TEXTURE_SHADOW = 5, + TEXTURE_GATHER_SHADOW = 6, + TEXTURE_GATHER_X = 8, + TEXTURE_GATHER_Y = 9, + TEXTURE_GATHER_Z = 10, + TEXTURE_GATHER_W = 11, }; enum mali_derivative_mode { - TEXTURE_DFDX = 0, - TEXTURE_DFDY = 1, + TEXTURE_DFDX = 0, + TEXTURE_DFDY = 1, }; enum midgard_partial_execution { - MIDGARD_PARTIAL_EXECUTION_SKIP = 1, - MIDGARD_PARTIAL_EXECUTION_KILL = 2, - MIDGARD_PARTIAL_EXECUTION_NONE = 3 + MIDGARD_PARTIAL_EXECUTION_SKIP = 1, + MIDGARD_PARTIAL_EXECUTION_KILL = 2, + MIDGARD_PARTIAL_EXECUTION_NONE = 3 }; -typedef struct -__attribute__((__packed__)) -{ - unsigned type : 4; - unsigned next_type : 4; +typedef struct __attribute__((__packed__)) { + unsigned type : 4; + unsigned next_type : 4; - enum mali_texture_op op : 4; - unsigned mode : 4; - enum midgard_partial_execution exec : 2; + enum mali_texture_op op : 4; + unsigned mode : 4; + enum midgard_partial_execution exec : 2; - unsigned format : 2; + unsigned format : 2; - /* Are sampler_handle/texture_handler respectively set by registers? If - * true, the lower 8-bits of the respective field is a register word. - * If false, they are an immediate */ + /* Are sampler_handle/texture_handler respectively set by registers? If + * true, the lower 8-bits of the respective field is a register word. + * If false, they are an immediate */ - unsigned sampler_register : 1; - unsigned texture_register : 1; + unsigned sampler_register : 1; + unsigned texture_register : 1; - /* Is a register used to specify the - * LOD/bias/offset? If set, use the `bias` field as - * a register index. If clear, use the `bias` field - * as an immediate. */ - unsigned lod_register : 1; + /* Is a register used to specify the + * LOD/bias/offset? If set, use the `bias` field as + * a register index. If clear, use the `bias` field + * as an immediate. */ + unsigned lod_register : 1; - /* Is a register used to specify an offset? If set, use the - * offset_reg_* fields to encode this, duplicated for each of the - * components. If clear, there is implcitly always an immediate offst - * specificed in offset_imm_* */ - unsigned offset_register : 1; + /* Is a register used to specify an offset? If set, use the + * offset_reg_* fields to encode this, duplicated for each of the + * components. 
If clear, there is implcitly always an immediate offst + * specificed in offset_imm_* */ + unsigned offset_register : 1; - unsigned in_reg_full : 1; - unsigned in_reg_select : 1; - unsigned in_reg_upper : 1; - unsigned in_reg_swizzle : 8; + unsigned in_reg_full : 1; + unsigned in_reg_select : 1; + unsigned in_reg_upper : 1; + unsigned in_reg_swizzle : 8; - unsigned unknown8 : 2; + unsigned unknown8 : 2; - unsigned out_full : 1; + unsigned out_full : 1; - enum mali_sampler_type sampler_type : 2; + enum mali_sampler_type sampler_type : 2; - unsigned out_reg_select : 1; - unsigned out_upper : 1; + unsigned out_reg_select : 1; + unsigned out_upper : 1; - unsigned mask : 4; + unsigned mask : 4; - /* Intriguingly, textures can take an outmod just like alu ops. Int - * outmods are not supported as far as I can tell, so this is only - * meaningful for float samplers */ - midgard_outmod_float outmod : 2; + /* Intriguingly, textures can take an outmod just like alu ops. Int + * outmods are not supported as far as I can tell, so this is only + * meaningful for float samplers */ + midgard_outmod_float outmod : 2; - unsigned swizzle : 8; + unsigned swizzle : 8; - /* These indicate how many bundles after this texture op may be - * executed in parallel with this op. We may execute only ALU and - * ld/st in parallel (not other textures), and obviously there cannot - * be any dependency (the blob appears to forbid even accessing other - * channels of a given texture register). */ + /* These indicate how many bundles after this texture op may be + * executed in parallel with this op. We may execute only ALU and + * ld/st in parallel (not other textures), and obviously there cannot + * be any dependency (the blob appears to forbid even accessing other + * channels of a given texture register). */ - unsigned out_of_order : 4; - unsigned unknown4 : 8; + unsigned out_of_order : 4; + unsigned unknown4 : 8; - /* In immediate mode, each offset field is an immediate range [0, 7]. - * - * In register mode, offset_x becomes a register (full, select, upper) - * triplet followed by a vec3 swizzle is splattered across - * offset_y/offset_z in a genuinely bizarre way. - * - * For texel fetches in immediate mode, the range is the full [-8, 7], - * but for normal texturing the top bit must be zero and a register - * used instead. It's not clear where this limitation is from. - * - * union { - * struct { - * signed offset_x : 4; - * signed offset_y : 4; - * signed offset_z : 4; - * } immediate; - * struct { - * bool full : 1; - * bool select : 1; - * bool upper : 1; - * unsigned swizzle : 8; - * unsigned zero : 1; - * } register; - * } - */ + /* In immediate mode, each offset field is an immediate range [0, 7]. + * + * In register mode, offset_x becomes a register (full, select, upper) + * triplet followed by a vec3 swizzle is splattered across + * offset_y/offset_z in a genuinely bizarre way. + * + * For texel fetches in immediate mode, the range is the full [-8, 7], + * but for normal texturing the top bit must be zero and a register + * used instead. It's not clear where this limitation is from. 
+ * + * union { + * struct { + * signed offset_x : 4; + * signed offset_y : 4; + * signed offset_z : 4; + * } immediate; + * struct { + * bool full : 1; + * bool select : 1; + * bool upper : 1; + * unsigned swizzle : 8; + * unsigned zero : 1; + * } register; + * } + */ - unsigned offset : 12; + unsigned offset : 12; - /* In immediate bias mode, for a normal texture op, this is - * texture bias, computed as int(2^8 * frac(biasf)), with - * bias_int = floor(bias). For a textureLod, it's that, but - * s/bias/lod. For a texel fetch, this is the LOD as-is. - * - * In register mode, this is a midgard_tex_register_select - * structure and bias_int is zero */ + /* In immediate bias mode, for a normal texture op, this is + * texture bias, computed as int(2^8 * frac(biasf)), with + * bias_int = floor(bias). For a textureLod, it's that, but + * s/bias/lod. For a texel fetch, this is the LOD as-is. + * + * In register mode, this is a midgard_tex_register_select + * structure and bias_int is zero */ - unsigned bias : 8; - signed bias_int : 8; + unsigned bias : 8; + signed bias_int : 8; - /* If sampler/texture_register is set, the bottom 8-bits are - * midgard_tex_register_select and the top 8-bits are zero. If they are - * clear, they are immediate texture indices */ + /* If sampler/texture_register is set, the bottom 8-bits are + * midgard_tex_register_select and the top 8-bits are zero. If they are + * clear, they are immediate texture indices */ - unsigned sampler_handle : 16; - unsigned texture_handle : 16; -} -midgard_texture_word; + unsigned sampler_handle : 16; + unsigned texture_handle : 16; +} midgard_texture_word; /* Technically barriers are texture instructions but it's less work to add them * as an explicitly zeroed special case, since most fields are forced to go to * zero */ -typedef struct -__attribute__((__packed__)) -{ - unsigned type : 4; - unsigned next_type : 4; +typedef struct __attribute__((__packed__)) { + unsigned type : 4; + unsigned next_type : 4; - /* op = TEXTURE_OP_BARRIER */ - unsigned op : 6; - unsigned zero1 : 2; + /* op = TEXTURE_OP_BARRIER */ + unsigned op : 6; + unsigned zero1 : 2; - /* Since helper invocations don't make any sense, these are forced to one */ - unsigned cont : 1; - unsigned last : 1; - unsigned zero2 : 14; + /* Since helper invocations don't make any sense, these are forced to one */ + unsigned cont : 1; + unsigned last : 1; + unsigned zero2 : 14; - unsigned zero3 : 24; - unsigned out_of_order : 4; - unsigned zero4 : 4; + unsigned zero3 : 24; + unsigned out_of_order : 4; + unsigned zero4 : 4; - uint64_t zero5; + uint64_t zero5; } midgard_texture_barrier_word; typedef union midgard_constants { - double f64[2]; - uint64_t u64[2]; - int64_t i64[2]; - float f32[4]; - uint32_t u32[4]; - int32_t i32[4]; - uint16_t f16[8]; - uint16_t u16[8]; - int16_t i16[8]; - uint8_t u8[16]; - int8_t i8[16]; -} -midgard_constants; + double f64[2]; + uint64_t u64[2]; + int64_t i64[2]; + float f32[4]; + uint32_t u32[4]; + int32_t i32[4]; + uint16_t f16[8]; + uint16_t u16[8]; + int16_t i16[8]; + uint8_t u8[16]; + int8_t i8[16]; +} midgard_constants; enum midgard_roundmode { - MIDGARD_RTE = 0x0, /* round to even */ - MIDGARD_RTZ = 0x1, /* round to zero */ - MIDGARD_RTN = 0x2, /* round to negative */ - MIDGARD_RTP = 0x3, /* round to positive */ + MIDGARD_RTE = 0x0, /* round to even */ + MIDGARD_RTZ = 0x1, /* round to zero */ + MIDGARD_RTN = 0x2, /* round to negative */ + MIDGARD_RTP = 0x3, /* round to positive */ }; #endif diff --git a/src/panfrost/midgard/midgard_address.c 
b/src/panfrost/midgard/midgard_address.c index 8b80f042ad0..6b514e5aa61 100644 --- a/src/panfrost/midgard/midgard_address.c +++ b/src/panfrost/midgard/midgard_address.c @@ -33,32 +33,33 @@ * * A + (zext?(B) << #s) + #c * - * This allows for fast indexing into arrays. This file tries to pattern match the offset in NIR with this form to reduce pressure on the ALU pipe. + * This allows for fast indexing into arrays. This file tries to pattern match + * the offset in NIR with this form to reduce pressure on the ALU pipe. */ struct mir_address { - nir_ssa_scalar A; - nir_ssa_scalar B; + nir_ssa_scalar A; + nir_ssa_scalar B; - midgard_index_address_format type; - unsigned shift; - unsigned bias; + midgard_index_address_format type; + unsigned shift; + unsigned bias; }; static bool mir_args_ssa(nir_ssa_scalar s, unsigned count) { - nir_alu_instr *alu = nir_instr_as_alu(s.def->parent_instr); + nir_alu_instr *alu = nir_instr_as_alu(s.def->parent_instr); - if (count > nir_op_infos[alu->op].num_inputs) - return false; + if (count > nir_op_infos[alu->op].num_inputs) + return false; - for (unsigned i = 0; i < count; ++i) { - if (!alu->src[i].src.is_ssa) - return false; - } + for (unsigned i = 0; i < count; ++i) { + if (!alu->src[i].src.is_ssa) + return false; + } - return true; + return true; } /* Matches a constant in either slot and moves it to the bias */ @@ -66,15 +67,15 @@ mir_args_ssa(nir_ssa_scalar s, unsigned count) static void mir_match_constant(struct mir_address *address) { - if (address->A.def && nir_ssa_scalar_is_const(address->A)) { - address->bias += nir_ssa_scalar_as_uint(address->A); - address->A.def = NULL; - } + if (address->A.def && nir_ssa_scalar_is_const(address->A)) { + address->bias += nir_ssa_scalar_as_uint(address->A); + address->A.def = NULL; + } - if (address->B.def && nir_ssa_scalar_is_const(address->B)) { - address->bias += nir_ssa_scalar_as_uint(address->B); - address->B.def = NULL; - } + if (address->B.def && nir_ssa_scalar_is_const(address->B)) { + address->bias += nir_ssa_scalar_as_uint(address->B); + address->B.def = NULL; + } } /* Matches an iadd when there is a free slot or constant */ @@ -85,33 +86,33 @@ mir_match_constant(struct mir_address *address) static void mir_match_iadd(struct mir_address *address, bool first_free) { - if (!address->B.def || !nir_ssa_scalar_is_alu(address->B)) - return; + if (!address->B.def || !nir_ssa_scalar_is_alu(address->B)) + return; - if (!mir_args_ssa(address->B, 2)) - return; + if (!mir_args_ssa(address->B, 2)) + return; - nir_op op = nir_ssa_scalar_alu_op(address->B); + nir_op op = nir_ssa_scalar_alu_op(address->B); - if (op != nir_op_iadd) return; + if (op != nir_op_iadd) + return; - nir_ssa_scalar op1 = nir_ssa_scalar_chase_alu_src(address->B, 0); - nir_ssa_scalar op2 = nir_ssa_scalar_chase_alu_src(address->B, 1); + nir_ssa_scalar op1 = nir_ssa_scalar_chase_alu_src(address->B, 0); + nir_ssa_scalar op2 = nir_ssa_scalar_chase_alu_src(address->B, 1); - if (nir_ssa_scalar_is_const(op1) && - nir_ssa_scalar_as_uint(op1) <= MAX_POSITIVE_OFFSET) { - address->bias += nir_ssa_scalar_as_uint(op1); - address->B = op2; - } else if (nir_ssa_scalar_is_const(op2) && - nir_ssa_scalar_as_uint(op2) <= MAX_POSITIVE_OFFSET) { - address->bias += nir_ssa_scalar_as_uint(op2); - address->B = op1; - } else if (!nir_ssa_scalar_is_const(op1) && - !nir_ssa_scalar_is_const(op2) && - first_free && !address->A.def) { - address->A = op1; - address->B = op2; - } + if (nir_ssa_scalar_is_const(op1) && + nir_ssa_scalar_as_uint(op1) <= MAX_POSITIVE_OFFSET) { + 
address->bias += nir_ssa_scalar_as_uint(op1); + address->B = op2; + } else if (nir_ssa_scalar_is_const(op2) && + nir_ssa_scalar_as_uint(op2) <= MAX_POSITIVE_OFFSET) { + address->bias += nir_ssa_scalar_as_uint(op2); + address->B = op1; + } else if (!nir_ssa_scalar_is_const(op1) && !nir_ssa_scalar_is_const(op2) && + first_free && !address->A.def) { + address->A = op1; + address->B = op2; + } } /* Matches u2u64 and sets type */ @@ -119,18 +120,19 @@ mir_match_iadd(struct mir_address *address, bool first_free) static void mir_match_u2u64(struct mir_address *address) { - if (!address->B.def || !nir_ssa_scalar_is_alu(address->B)) - return; + if (!address->B.def || !nir_ssa_scalar_is_alu(address->B)) + return; - if (!mir_args_ssa(address->B, 1)) - return; + if (!mir_args_ssa(address->B, 1)) + return; - nir_op op = nir_ssa_scalar_alu_op(address->B); - if (op != nir_op_u2u64) return; - nir_ssa_scalar arg = nir_ssa_scalar_chase_alu_src(address->B, 0); + nir_op op = nir_ssa_scalar_alu_op(address->B); + if (op != nir_op_u2u64) + return; + nir_ssa_scalar arg = nir_ssa_scalar_chase_alu_src(address->B, 0); - address->B = arg; - address->type = midgard_index_address_u32; + address->B = arg; + address->type = midgard_index_address_u32; } /* Matches i2i64 and sets type */ @@ -138,18 +140,19 @@ mir_match_u2u64(struct mir_address *address) static void mir_match_i2i64(struct mir_address *address) { - if (!address->B.def || !nir_ssa_scalar_is_alu(address->B)) - return; + if (!address->B.def || !nir_ssa_scalar_is_alu(address->B)) + return; - if (!mir_args_ssa(address->B, 1)) - return; + if (!mir_args_ssa(address->B, 1)) + return; - nir_op op = nir_ssa_scalar_alu_op(address->B); - if (op != nir_op_i2i64) return; - nir_ssa_scalar arg = nir_ssa_scalar_chase_alu_src(address->B, 0); + nir_op op = nir_ssa_scalar_alu_op(address->B); + if (op != nir_op_i2i64) + return; + nir_ssa_scalar arg = nir_ssa_scalar_chase_alu_src(address->B, 0); - address->B = arg; - address->type = midgard_index_address_s32; + address->B = arg; + address->type = midgard_index_address_s32; } /* Matches ishl to shift */ @@ -157,24 +160,27 @@ mir_match_i2i64(struct mir_address *address) static void mir_match_ishl(struct mir_address *address) { - if (!address->B.def || !nir_ssa_scalar_is_alu(address->B)) - return; + if (!address->B.def || !nir_ssa_scalar_is_alu(address->B)) + return; - if (!mir_args_ssa(address->B, 2)) - return; + if (!mir_args_ssa(address->B, 2)) + return; - nir_op op = nir_ssa_scalar_alu_op(address->B); - if (op != nir_op_ishl) return; - nir_ssa_scalar op1 = nir_ssa_scalar_chase_alu_src(address->B, 0); - nir_ssa_scalar op2 = nir_ssa_scalar_chase_alu_src(address->B, 1); + nir_op op = nir_ssa_scalar_alu_op(address->B); + if (op != nir_op_ishl) + return; + nir_ssa_scalar op1 = nir_ssa_scalar_chase_alu_src(address->B, 0); + nir_ssa_scalar op2 = nir_ssa_scalar_chase_alu_src(address->B, 1); - if (!nir_ssa_scalar_is_const(op2)) return; + if (!nir_ssa_scalar_is_const(op2)) + return; - unsigned shift = nir_ssa_scalar_as_uint(op2); - if (shift > 0x7) return; + unsigned shift = nir_ssa_scalar_as_uint(op2); + if (shift > 0x7) + return; - address->B = op1; - address->shift = shift; + address->B = op1; + address->shift = shift; } /* Strings through mov which can happen from NIR vectorization */ @@ -182,19 +188,19 @@ mir_match_ishl(struct mir_address *address) static void mir_match_mov(struct mir_address *address) { - if (address->A.def && nir_ssa_scalar_is_alu(address->A)) { - nir_op op = nir_ssa_scalar_alu_op(address->A); + if (address->A.def 
&& nir_ssa_scalar_is_alu(address->A)) { + nir_op op = nir_ssa_scalar_alu_op(address->A); - if (op == nir_op_mov && mir_args_ssa(address->A, 1)) - address->A = nir_ssa_scalar_chase_alu_src(address->A, 0); - } + if (op == nir_op_mov && mir_args_ssa(address->A, 1)) + address->A = nir_ssa_scalar_chase_alu_src(address->A, 0); + } - if (address->B.def && nir_ssa_scalar_is_alu(address->B)) { - nir_op op = nir_ssa_scalar_alu_op(address->B); + if (address->B.def && nir_ssa_scalar_is_alu(address->B)) { + nir_op op = nir_ssa_scalar_alu_op(address->B); - if (op == nir_op_mov && mir_args_ssa(address->B, 1)) - address->B = nir_ssa_scalar_chase_alu_src(address->B, 0); - } + if (op == nir_op_mov && mir_args_ssa(address->B, 1)) + address->B = nir_ssa_scalar_chase_alu_src(address->B, 0); + } } /* Tries to pattern match into mir_address */ @@ -202,105 +208,105 @@ mir_match_mov(struct mir_address *address) static struct mir_address mir_match_offset(nir_ssa_def *offset, bool first_free, bool extend) { - struct mir_address address = { - .B = { .def = offset }, - .type = extend ? midgard_index_address_u64 : midgard_index_address_u32, - }; + struct mir_address address = { + .B = {.def = offset}, + .type = extend ? midgard_index_address_u64 : midgard_index_address_u32, + }; - mir_match_mov(&address); - mir_match_constant(&address); - mir_match_mov(&address); - mir_match_iadd(&address, first_free); - mir_match_mov(&address); + mir_match_mov(&address); + mir_match_constant(&address); + mir_match_mov(&address); + mir_match_iadd(&address, first_free); + mir_match_mov(&address); - if (extend) { - mir_match_u2u64(&address); - mir_match_i2i64(&address); - mir_match_mov(&address); - } + if (extend) { + mir_match_u2u64(&address); + mir_match_i2i64(&address); + mir_match_mov(&address); + } - mir_match_ishl(&address); + mir_match_ishl(&address); - return address; + return address; } void -mir_set_offset(compiler_context *ctx, midgard_instruction *ins, nir_src *offset, unsigned seg) +mir_set_offset(compiler_context *ctx, midgard_instruction *ins, nir_src *offset, + unsigned seg) { - for(unsigned i = 0; i < 16; ++i) { - ins->swizzle[1][i] = 0; - ins->swizzle[2][i] = 0; - } + for (unsigned i = 0; i < 16; ++i) { + ins->swizzle[1][i] = 0; + ins->swizzle[2][i] = 0; + } - /* Sign extend instead of zero extend in case the address is something - * like `base + offset + 20`, where offset could be negative. */ - bool force_sext = (nir_src_bit_size(*offset) < 64); + /* Sign extend instead of zero extend in case the address is something + * like `base + offset + 20`, where offset could be negative. 
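Side note (illustration only, not part of this patch): a toy check of the decomposition mir_match_offset performs. For an offset expression like base + (u2u64(idx) << 4) + 20, the matchers above can reduce to A = base, B = idx, shift = 4, bias = 20 and type = midgard_index_address_u32 -- the A + (zext(B) << #s) + #c form from the file comment. The snippet only verifies the arithmetic identity; it does not touch NIR, and the variable names are illustrative.

   #include <assert.h>
   #include <stdint.h>

   int
   main(void)
   {
      uint64_t base = 0x1000, idx = 7;

      /* Original byte offset as the shader would compute it */
      uint64_t offset = base + ((idx << 4) + 20);

      /* What the matcher extracts: A, B, shift, bias */
      uint64_t A = base, B = idx, shift = 4, bias = 20;

      assert(offset == A + (B << shift) + bias);
      return 0;
   }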
*/ + bool force_sext = (nir_src_bit_size(*offset) < 64); - if (!offset->is_ssa) { - ins->load_store.bitsize_toggle = true; - ins->load_store.arg_comp = seg & 0x3; - ins->load_store.arg_reg = (seg >> 2) & 0x7; - ins->src[2] = nir_src_index(ctx, offset); - ins->src_types[2] = nir_type_uint | nir_src_bit_size(*offset); + if (!offset->is_ssa) { + ins->load_store.bitsize_toggle = true; + ins->load_store.arg_comp = seg & 0x3; + ins->load_store.arg_reg = (seg >> 2) & 0x7; + ins->src[2] = nir_src_index(ctx, offset); + ins->src_types[2] = nir_type_uint | nir_src_bit_size(*offset); - if (force_sext) - ins->load_store.index_format = midgard_index_address_s32; - else - ins->load_store.index_format = midgard_index_address_u64; + if (force_sext) + ins->load_store.index_format = midgard_index_address_s32; + else + ins->load_store.index_format = midgard_index_address_u64; - return; - } + return; + } - bool first_free = (seg == LDST_GLOBAL); + bool first_free = (seg == LDST_GLOBAL); - struct mir_address match = mir_match_offset(offset->ssa, first_free, true); + struct mir_address match = mir_match_offset(offset->ssa, first_free, true); - if (match.A.def) { - unsigned bitsize = match.A.def->bit_size; - assert(bitsize == 32 || bitsize == 64); + if (match.A.def) { + unsigned bitsize = match.A.def->bit_size; + assert(bitsize == 32 || bitsize == 64); - ins->src[1] = nir_ssa_index(match.A.def); - ins->swizzle[1][0] = match.A.comp; - ins->src_types[1] = nir_type_uint | bitsize; - ins->load_store.bitsize_toggle = (bitsize == 64); - } else { - ins->load_store.bitsize_toggle = true; - ins->load_store.arg_comp = seg & 0x3; - ins->load_store.arg_reg = (seg >> 2) & 0x7; - } + ins->src[1] = nir_ssa_index(match.A.def); + ins->swizzle[1][0] = match.A.comp; + ins->src_types[1] = nir_type_uint | bitsize; + ins->load_store.bitsize_toggle = (bitsize == 64); + } else { + ins->load_store.bitsize_toggle = true; + ins->load_store.arg_comp = seg & 0x3; + ins->load_store.arg_reg = (seg >> 2) & 0x7; + } - if (match.B.def) { - ins->src[2] = nir_ssa_index(match.B.def); - ins->swizzle[2][0] = match.B.comp; - ins->src_types[2] = nir_type_uint | match.B.def->bit_size; - } else - ins->load_store.index_reg = REGISTER_LDST_ZERO; + if (match.B.def) { + ins->src[2] = nir_ssa_index(match.B.def); + ins->swizzle[2][0] = match.B.comp; + ins->src_types[2] = nir_type_uint | match.B.def->bit_size; + } else + ins->load_store.index_reg = REGISTER_LDST_ZERO; - if (force_sext) - match.type = midgard_index_address_s32; + if (force_sext) + match.type = midgard_index_address_s32; - ins->load_store.index_format = match.type; + ins->load_store.index_format = match.type; - assert(match.shift <= 7); - ins->load_store.index_shift = match.shift; + assert(match.shift <= 7); + ins->load_store.index_shift = match.shift; - ins->constants.u32[0] = match.bias; + ins->constants.u32[0] = match.bias; } - void mir_set_ubo_offset(midgard_instruction *ins, nir_src *src, unsigned bias) { - assert(src->is_ssa); - struct mir_address match = mir_match_offset(src->ssa, false, false); + assert(src->is_ssa); + struct mir_address match = mir_match_offset(src->ssa, false, false); - if (match.B.def) { - ins->src[2] = nir_ssa_index(match.B.def); + if (match.B.def) { + ins->src[2] = nir_ssa_index(match.B.def); - for (unsigned i = 0; i < ARRAY_SIZE(ins->swizzle[2]); ++i) - ins->swizzle[2][i] = match.B.comp; - } + for (unsigned i = 0; i < ARRAY_SIZE(ins->swizzle[2]); ++i) + ins->swizzle[2][i] = match.B.comp; + } - ins->load_store.index_shift = match.shift; - ins->constants.u32[0] = 
match.bias + bias; + ins->load_store.index_shift = match.shift; + ins->constants.u32[0] = match.bias + bias; } diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c index 10e05fbf454..c95bb2414a5 100644 --- a/src/panfrost/midgard/midgard_compile.c +++ b/src/panfrost/midgard/midgard_compile.c @@ -22,78 +22,78 @@ * SOFTWARE. */ -#include -#include -#include +#include #include #include -#include #include -#include +#include +#include +#include +#include #include "compiler/glsl/glsl_to_nir.h" -#include "compiler/nir_types.h" #include "compiler/nir/nir_builder.h" +#include "compiler/nir_types.h" #include "util/half_float.h" -#include "util/u_math.h" +#include "util/list.h" #include "util/u_debug.h" #include "util/u_dynarray.h" -#include "util/list.h" +#include "util/u_math.h" -#include "midgard.h" -#include "midgard_nir.h" -#include "midgard_compile.h" -#include "midgard_ops.h" -#include "helpers.h" -#include "compiler.h" -#include "midgard_quirks.h" #include "panfrost/util/pan_lower_framebuffer.h" +#include "compiler.h" +#include "helpers.h" +#include "midgard.h" +#include "midgard_compile.h" +#include "midgard_nir.h" +#include "midgard_ops.h" +#include "midgard_quirks.h" #include "disassemble.h" static const struct debug_named_value midgard_debug_options[] = { - {"msgs", MIDGARD_DBG_MSGS, "Print debug messages"}, - {"shaders", MIDGARD_DBG_SHADERS, "Dump shaders in NIR and MIR"}, - {"shaderdb", MIDGARD_DBG_SHADERDB, "Prints shader-db statistics"}, - {"inorder", MIDGARD_DBG_INORDER, "Disables out-of-order scheduling"}, - {"verbose", MIDGARD_DBG_VERBOSE, "Dump shaders verbosely"}, - {"internal", MIDGARD_DBG_INTERNAL, "Dump internal shaders"}, - DEBUG_NAMED_VALUE_END -}; + {"msgs", MIDGARD_DBG_MSGS, "Print debug messages"}, + {"shaders", MIDGARD_DBG_SHADERS, "Dump shaders in NIR and MIR"}, + {"shaderdb", MIDGARD_DBG_SHADERDB, "Prints shader-db statistics"}, + {"inorder", MIDGARD_DBG_INORDER, "Disables out-of-order scheduling"}, + {"verbose", MIDGARD_DBG_VERBOSE, "Dump shaders verbosely"}, + {"internal", MIDGARD_DBG_INTERNAL, "Dump internal shaders"}, + DEBUG_NAMED_VALUE_END}; -DEBUG_GET_ONCE_FLAGS_OPTION(midgard_debug, "MIDGARD_MESA_DEBUG", midgard_debug_options, 0) +DEBUG_GET_ONCE_FLAGS_OPTION(midgard_debug, "MIDGARD_MESA_DEBUG", + midgard_debug_options, 0) int midgard_debug = 0; -#define DBG(fmt, ...) \ - do { if (midgard_debug & MIDGARD_DBG_MSGS) \ - fprintf(stderr, "%s:%d: "fmt, \ - __func__, __LINE__, ##__VA_ARGS__); } while (0) +#define DBG(fmt, ...) 
\ + do { \ + if (midgard_debug & MIDGARD_DBG_MSGS) \ + fprintf(stderr, "%s:%d: " fmt, __func__, __LINE__, ##__VA_ARGS__); \ + } while (0) static midgard_block * create_empty_block(compiler_context *ctx) { - midgard_block *blk = rzalloc(ctx, midgard_block); + midgard_block *blk = rzalloc(ctx, midgard_block); - blk->base.predecessors = _mesa_set_create(blk, - _mesa_hash_pointer, - _mesa_key_pointer_equal); + blk->base.predecessors = + _mesa_set_create(blk, _mesa_hash_pointer, _mesa_key_pointer_equal); - blk->base.name = ctx->block_source_count++; + blk->base.name = ctx->block_source_count++; - return blk; + return blk; } static void schedule_barrier(compiler_context *ctx) { - midgard_block *temp = ctx->after_block; - ctx->after_block = create_empty_block(ctx); - ctx->block_count++; - list_addtail(&ctx->after_block->base.link, &ctx->blocks); - list_inithead(&ctx->after_block->base.instructions); - pan_block_add_successor(&ctx->current_block->base, &ctx->after_block->base); - ctx->current_block = ctx->after_block; - ctx->after_block = temp; + midgard_block *temp = ctx->after_block; + ctx->after_block = create_empty_block(ctx); + ctx->block_count++; + list_addtail(&ctx->after_block->base.link, &ctx->blocks); + list_inithead(&ctx->after_block->base.instructions); + pan_block_add_successor(&ctx->current_block->base, &ctx->after_block->base); + ctx->current_block = ctx->after_block; + ctx->after_block = temp; } /* Helpers to generate midgard_instruction's using macro magic, since every @@ -101,32 +101,34 @@ schedule_barrier(compiler_context *ctx) #define EMIT(op, ...) emit_mir_instruction(ctx, v_##op(__VA_ARGS__)); -#define M_LOAD_STORE(name, store, T) \ - static midgard_instruction m_##name(unsigned ssa, unsigned address) { \ - midgard_instruction i = { \ - .type = TAG_LOAD_STORE_4, \ - .mask = 0xF, \ - .dest = ~0, \ - .src = { ~0, ~0, ~0, ~0 }, \ - .swizzle = SWIZZLE_IDENTITY_4, \ - .op = midgard_op_##name, \ - .load_store = { \ - .signed_offset = address, \ - }, \ - }; \ - \ - if (store) { \ - i.src[0] = ssa; \ - i.src_types[0] = T; \ - i.dest_type = T; \ - } else { \ - i.dest = ssa; \ - i.dest_type = T; \ - } \ - return i; \ - } +#define M_LOAD_STORE(name, store, T) \ + static midgard_instruction m_##name(unsigned ssa, unsigned address) \ + { \ + midgard_instruction i = { \ + .type = TAG_LOAD_STORE_4, \ + .mask = 0xF, \ + .dest = ~0, \ + .src = {~0, ~0, ~0, ~0}, \ + .swizzle = SWIZZLE_IDENTITY_4, \ + .op = midgard_op_##name, \ + .load_store = \ + { \ + .signed_offset = address, \ + }, \ + }; \ + \ + if (store) { \ + i.src[0] = ssa; \ + i.src_types[0] = T; \ + i.dest_type = T; \ + } else { \ + i.dest = ssa; \ + i.dest_type = T; \ + } \ + return i; \ + } -#define M_LOAD(name, T) M_LOAD_STORE(name, false, T) +#define M_LOAD(name, T) M_LOAD_STORE(name, false, T) #define M_STORE(name, T) M_LOAD_STORE(name, true, T) M_LOAD(ld_attr_32, nir_type_uint32); @@ -162,23 +164,23 @@ M_STORE(st_image_32u, nir_type_uint32); M_STORE(st_image_32i, nir_type_int32); M_LOAD(lea_image, nir_type_uint64); -#define M_IMAGE(op) \ -static midgard_instruction \ -op ## _image(nir_alu_type type, unsigned val, unsigned address) \ -{ \ - switch (type) { \ - case nir_type_float32: \ - return m_ ## op ## _image_32f(val, address); \ - case nir_type_float16: \ - return m_ ## op ## _image_16f(val, address); \ - case nir_type_uint32: \ - return m_ ## op ## _image_32u(val, address); \ - case nir_type_int32: \ - return m_ ## op ## _image_32i(val, address); \ - default: \ - unreachable("Invalid image type"); \ - } \ -} +#define 
M_IMAGE(op) \ + static midgard_instruction op##_image(nir_alu_type type, unsigned val, \ + unsigned address) \ + { \ + switch (type) { \ + case nir_type_float32: \ + return m_##op##_image_32f(val, address); \ + case nir_type_float16: \ + return m_##op##_image_16f(val, address); \ + case nir_type_uint32: \ + return m_##op##_image_32u(val, address); \ + case nir_type_int32: \ + return m_##op##_image_32i(val, address); \ + default: \ + unreachable("Invalid image type"); \ + } \ + } M_IMAGE(ld); M_IMAGE(st); @@ -186,284 +188,280 @@ M_IMAGE(st); static midgard_instruction v_branch(bool conditional, bool invert) { - midgard_instruction ins = { - .type = TAG_ALU_4, - .unit = ALU_ENAB_BRANCH, - .compact_branch = true, - .branch = { - .conditional = conditional, - .invert_conditional = invert, - }, - .dest = ~0, - .src = { ~0, ~0, ~0, ~0 }, - }; + midgard_instruction ins = { + .type = TAG_ALU_4, + .unit = ALU_ENAB_BRANCH, + .compact_branch = true, + .branch = + { + .conditional = conditional, + .invert_conditional = invert, + }, + .dest = ~0, + .src = {~0, ~0, ~0, ~0}, + }; - return ins; + return ins; } static void -attach_constants(compiler_context *ctx, midgard_instruction *ins, void *constants, int name) +attach_constants(compiler_context *ctx, midgard_instruction *ins, + void *constants, int name) { - ins->has_constants = true; - memcpy(&ins->constants, constants, 16); + ins->has_constants = true; + memcpy(&ins->constants, constants, 16); } static int glsl_type_size(const struct glsl_type *type, bool bindless) { - return glsl_count_attribute_slots(type, false); + return glsl_count_attribute_slots(type, false); } static bool -midgard_nir_lower_global_load_instr(nir_builder *b, nir_instr *instr, void *data) +midgard_nir_lower_global_load_instr(nir_builder *b, nir_instr *instr, + void *data) { - if (instr->type != nir_instr_type_intrinsic) - return false; + if (instr->type != nir_instr_type_intrinsic) + return false; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_load_global && - intr->intrinsic != nir_intrinsic_load_shared) - return false; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_load_global && + intr->intrinsic != nir_intrinsic_load_shared) + return false; - unsigned compsz = nir_dest_bit_size(intr->dest); - unsigned totalsz = compsz * nir_dest_num_components(intr->dest); - /* 8, 16, 32, 64 and 128 bit loads don't need to be lowered */ - if (util_bitcount(totalsz) < 2 && totalsz <= 128) - return false; + unsigned compsz = nir_dest_bit_size(intr->dest); + unsigned totalsz = compsz * nir_dest_num_components(intr->dest); + /* 8, 16, 32, 64 and 128 bit loads don't need to be lowered */ + if (util_bitcount(totalsz) < 2 && totalsz <= 128) + return false; - b->cursor = nir_before_instr(instr); + b->cursor = nir_before_instr(instr); - assert(intr->src[0].is_ssa); - nir_ssa_def *addr = intr->src[0].ssa; + assert(intr->src[0].is_ssa); + nir_ssa_def *addr = intr->src[0].ssa; - nir_ssa_def *comps[MIR_VEC_COMPONENTS]; - unsigned ncomps = 0; + nir_ssa_def *comps[MIR_VEC_COMPONENTS]; + unsigned ncomps = 0; - while (totalsz) { - unsigned loadsz = MIN2(1 << (util_last_bit(totalsz) - 1), 128); - unsigned loadncomps = loadsz / compsz; + while (totalsz) { + unsigned loadsz = MIN2(1 << (util_last_bit(totalsz) - 1), 128); + unsigned loadncomps = loadsz / compsz; - nir_ssa_def *load; - if (intr->intrinsic == nir_intrinsic_load_global) { - load = nir_load_global(b, addr, compsz / 8, loadncomps, compsz); - } 
else { - assert(intr->intrinsic == nir_intrinsic_load_shared); - nir_intrinsic_instr *shared_load = - nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_shared); - shared_load->num_components = loadncomps; - shared_load->src[0] = nir_src_for_ssa(addr); - nir_intrinsic_set_align(shared_load, compsz / 8, 0); - nir_intrinsic_set_base(shared_load, nir_intrinsic_base(intr)); - nir_ssa_dest_init(&shared_load->instr, &shared_load->dest, - shared_load->num_components, compsz, NULL); - nir_builder_instr_insert(b, &shared_load->instr); - load = &shared_load->dest.ssa; - } + nir_ssa_def *load; + if (intr->intrinsic == nir_intrinsic_load_global) { + load = nir_load_global(b, addr, compsz / 8, loadncomps, compsz); + } else { + assert(intr->intrinsic == nir_intrinsic_load_shared); + nir_intrinsic_instr *shared_load = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_shared); + shared_load->num_components = loadncomps; + shared_load->src[0] = nir_src_for_ssa(addr); + nir_intrinsic_set_align(shared_load, compsz / 8, 0); + nir_intrinsic_set_base(shared_load, nir_intrinsic_base(intr)); + nir_ssa_dest_init(&shared_load->instr, &shared_load->dest, + shared_load->num_components, compsz, NULL); + nir_builder_instr_insert(b, &shared_load->instr); + load = &shared_load->dest.ssa; + } - for (unsigned i = 0; i < loadncomps; i++) - comps[ncomps++] = nir_channel(b, load, i); + for (unsigned i = 0; i < loadncomps; i++) + comps[ncomps++] = nir_channel(b, load, i); - totalsz -= loadsz; - addr = nir_iadd(b, addr, nir_imm_intN_t(b, loadsz / 8, addr->bit_size)); - } + totalsz -= loadsz; + addr = nir_iadd(b, addr, nir_imm_intN_t(b, loadsz / 8, addr->bit_size)); + } - assert(ncomps == nir_dest_num_components(intr->dest)); - nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_vec(b, comps, ncomps)); + assert(ncomps == nir_dest_num_components(intr->dest)); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_vec(b, comps, ncomps)); - return true; + return true; } static bool midgard_nir_lower_global_load(nir_shader *shader) { - return nir_shader_instructions_pass(shader, - midgard_nir_lower_global_load_instr, - nir_metadata_block_index | nir_metadata_dominance, - NULL); + return nir_shader_instructions_pass( + shader, midgard_nir_lower_global_load_instr, + nir_metadata_block_index | nir_metadata_dominance, NULL); } static bool mdg_should_scalarize(const nir_instr *instr, const void *_unused) { - const nir_alu_instr *alu = nir_instr_as_alu(instr); + const nir_alu_instr *alu = nir_instr_as_alu(instr); - if (nir_src_bit_size(alu->src[0].src) == 64) - return true; + if (nir_src_bit_size(alu->src[0].src) == 64) + return true; - if (nir_dest_bit_size(alu->dest.dest) == 64) - return true; + if (nir_dest_bit_size(alu->dest.dest) == 64) + return true; - switch (alu->op) { - case nir_op_fdot2: - case nir_op_umul_high: - case nir_op_imul_high: - case nir_op_pack_half_2x16: - case nir_op_unpack_half_2x16: - return true; - default: - return false; - } + switch (alu->op) { + case nir_op_fdot2: + case nir_op_umul_high: + case nir_op_imul_high: + case nir_op_pack_half_2x16: + case nir_op_unpack_half_2x16: + return true; + default: + return false; + } } /* Only vectorize int64 up to vec2 */ static uint8_t midgard_vectorize_filter(const nir_instr *instr, const void *data) { - if (instr->type != nir_instr_type_alu) - return 0; + if (instr->type != nir_instr_type_alu) + return 0; - const nir_alu_instr *alu = nir_instr_as_alu(instr); - int src_bit_size = nir_src_bit_size(alu->src[0].src); - int dst_bit_size = nir_dest_bit_size(alu->dest.dest); 
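The global/shared load lowering above splits any oversized or non-power-of-two load into power-of-two chunks of at most 128 bits. Below is a minimal standalone sketch of that size-selection rule, assuming only the C standard library; last_bit is a local stand-in for util_last_bit, and the 96-bit input is purely illustrative, not taken from the patch.

#include <stdio.h>

/* 1-based index of the most significant set bit; 0 when v == 0.
 * Local stand-in for util_last_bit(). */
static unsigned
last_bit(unsigned v)
{
   unsigned i = 0;
   while (v) {
      v >>= 1;
      i++;
   }
   return i;
}

int
main(void)
{
   unsigned totalsz = 96; /* e.g. three 32-bit components */

   while (totalsz) {
      /* Largest power of two not exceeding the remainder, capped at 128,
       * mirroring MIN2(1 << (util_last_bit(totalsz) - 1), 128). */
      unsigned loadsz = 1u << (last_bit(totalsz) - 1);
      if (loadsz > 128)
         loadsz = 128;

      printf("emit a %u-bit load\n", loadsz);
      totalsz -= loadsz;
   }

   /* Prints a 64-bit load followed by a 32-bit load, which is how the
    * pass decomposes a 96-bit (vec3 of 32-bit) load. */
   return 0;
}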
+ const nir_alu_instr *alu = nir_instr_as_alu(instr); + int src_bit_size = nir_src_bit_size(alu->src[0].src); + int dst_bit_size = nir_dest_bit_size(alu->dest.dest); - if (src_bit_size == 64 || dst_bit_size == 64) - return 2; + if (src_bit_size == 64 || dst_bit_size == 64) + return 2; - return 4; + return 4; } static void optimise_nir(nir_shader *nir, unsigned quirks, bool is_blend, bool is_blit) { - bool progress; - unsigned lower_flrp = - (nir->options->lower_flrp16 ? 16 : 0) | - (nir->options->lower_flrp32 ? 32 : 0) | - (nir->options->lower_flrp64 ? 64 : 0); + bool progress; + unsigned lower_flrp = (nir->options->lower_flrp16 ? 16 : 0) | + (nir->options->lower_flrp32 ? 32 : 0) | + (nir->options->lower_flrp64 ? 64 : 0); - NIR_PASS(progress, nir, nir_lower_regs_to_ssa); - nir_lower_idiv_options idiv_options = { - .allow_fp16 = true, - }; - NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options); + NIR_PASS(progress, nir, nir_lower_regs_to_ssa); + nir_lower_idiv_options idiv_options = { + .allow_fp16 = true, + }; + NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options); - nir_lower_tex_options lower_tex_options = { - .lower_txs_lod = true, - .lower_txp = ~0, - .lower_tg4_broadcom_swizzle = true, - /* TODO: we have native gradient.. */ - .lower_txd = true, - .lower_invalid_implicit_lod = true, - }; + nir_lower_tex_options lower_tex_options = { + .lower_txs_lod = true, + .lower_txp = ~0, + .lower_tg4_broadcom_swizzle = true, + /* TODO: we have native gradient.. */ + .lower_txd = true, + .lower_invalid_implicit_lod = true, + }; - NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options); + NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options); + /* TEX_GRAD fails to apply sampler descriptor settings on some + * implementations, requiring a lowering. However, blit shaders do not + * use the affected settings and should skip the workaround. + */ + if ((quirks & MIDGARD_BROKEN_LOD) && !is_blit) + NIR_PASS_V(nir, midgard_nir_lod_errata); - /* TEX_GRAD fails to apply sampler descriptor settings on some - * implementations, requiring a lowering. However, blit shaders do not - * use the affected settings and should skip the workaround. 
- */ - if ((quirks & MIDGARD_BROKEN_LOD) && !is_blit) - NIR_PASS_V(nir, midgard_nir_lod_errata); + /* Midgard image ops coordinates are 16-bit instead of 32-bit */ + NIR_PASS(progress, nir, midgard_nir_lower_image_bitsize); + NIR_PASS(progress, nir, midgard_nir_lower_helper_writes); + NIR_PASS(progress, nir, pan_lower_helper_invocation); + NIR_PASS(progress, nir, pan_lower_sample_pos); - /* Midgard image ops coordinates are 16-bit instead of 32-bit */ - NIR_PASS(progress, nir, midgard_nir_lower_image_bitsize); - NIR_PASS(progress, nir, midgard_nir_lower_helper_writes); - NIR_PASS(progress, nir, pan_lower_helper_invocation); - NIR_PASS(progress, nir, pan_lower_sample_pos); + if (nir->xfb_info != NULL && nir->info.has_transform_feedback_varyings) { + NIR_PASS_V(nir, nir_io_add_const_offset_to_base, + nir_var_shader_in | nir_var_shader_out); + NIR_PASS_V(nir, nir_io_add_intrinsic_xfb_info); + NIR_PASS_V(nir, pan_lower_xfb); + } - if (nir->xfb_info != NULL && nir->info.has_transform_feedback_varyings) { - NIR_PASS_V(nir, nir_io_add_const_offset_to_base, - nir_var_shader_in | nir_var_shader_out); - NIR_PASS_V(nir, nir_io_add_intrinsic_xfb_info); - NIR_PASS_V(nir, pan_lower_xfb); - } + NIR_PASS(progress, nir, midgard_nir_lower_algebraic_early); + NIR_PASS_V(nir, nir_lower_alu_to_scalar, mdg_should_scalarize, NULL); - NIR_PASS(progress, nir, midgard_nir_lower_algebraic_early); - NIR_PASS_V(nir, nir_lower_alu_to_scalar, mdg_should_scalarize, NULL); + do { + progress = false; - do { - progress = false; + NIR_PASS(progress, nir, nir_lower_var_copies); + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); - NIR_PASS(progress, nir, nir_lower_var_copies); - NIR_PASS(progress, nir, nir_lower_vars_to_ssa); + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_remove_phis); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_dead_cf); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_opt_constant_folding); - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_remove_phis); - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_opt_dead_cf); - NIR_PASS(progress, nir, nir_opt_cse); - NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); - NIR_PASS(progress, nir, nir_opt_algebraic); - NIR_PASS(progress, nir, nir_opt_constant_folding); + if (lower_flrp != 0) { + bool lower_flrp_progress = false; + NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, lower_flrp, + false /* always_precise */); + if (lower_flrp_progress) { + NIR_PASS(progress, nir, nir_opt_constant_folding); + progress = true; + } - if (lower_flrp != 0) { - bool lower_flrp_progress = false; - NIR_PASS(lower_flrp_progress, - nir, - nir_lower_flrp, - lower_flrp, - false /* always_precise */); - if (lower_flrp_progress) { - NIR_PASS(progress, nir, - nir_opt_constant_folding); - progress = true; - } + /* Nothing should rematerialize any flrps, so we only + * need to do this lowering once. + */ + lower_flrp = 0; + } - /* Nothing should rematerialize any flrps, so we only - * need to do this lowering once. 
- */ - lower_flrp = 0; - } + NIR_PASS(progress, nir, nir_opt_undef); + NIR_PASS(progress, nir, nir_lower_undef_to_zero); - NIR_PASS(progress, nir, nir_opt_undef); - NIR_PASS(progress, nir, nir_lower_undef_to_zero); + NIR_PASS(progress, nir, nir_opt_loop_unroll); - NIR_PASS(progress, nir, nir_opt_loop_unroll); + NIR_PASS(progress, nir, nir_opt_vectorize, midgard_vectorize_filter, + NULL); + } while (progress); - NIR_PASS(progress, nir, nir_opt_vectorize, - midgard_vectorize_filter, NULL); - } while (progress); + NIR_PASS_V(nir, nir_lower_alu_to_scalar, mdg_should_scalarize, NULL); - NIR_PASS_V(nir, nir_lower_alu_to_scalar, mdg_should_scalarize, NULL); + /* Run after opts so it can hit more */ + if (!is_blend) + NIR_PASS(progress, nir, nir_fuse_io_16); - /* Run after opts so it can hit more */ - if (!is_blend) - NIR_PASS(progress, nir, nir_fuse_io_16); + /* Must be run at the end to prevent creation of fsin/fcos ops */ + NIR_PASS(progress, nir, midgard_nir_scale_trig); - /* Must be run at the end to prevent creation of fsin/fcos ops */ - NIR_PASS(progress, nir, midgard_nir_scale_trig); + do { + progress = false; - do { - progress = false; + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_copy_prop); + } while (progress); - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_opt_algebraic); - NIR_PASS(progress, nir, nir_opt_constant_folding); - NIR_PASS(progress, nir, nir_copy_prop); - } while (progress); + NIR_PASS(progress, nir, nir_opt_algebraic_late); + NIR_PASS(progress, nir, nir_opt_algebraic_distribute_src_mods); - NIR_PASS(progress, nir, nir_opt_algebraic_late); - NIR_PASS(progress, nir, nir_opt_algebraic_distribute_src_mods); + /* We implement booleans as 32-bit 0/~0 */ + NIR_PASS(progress, nir, nir_lower_bool_to_int32); - /* We implement booleans as 32-bit 0/~0 */ - NIR_PASS(progress, nir, nir_lower_bool_to_int32); + /* Now that booleans are lowered, we can run out late opts */ + NIR_PASS(progress, nir, midgard_nir_lower_algebraic_late); + NIR_PASS(progress, nir, midgard_nir_cancel_inot); - /* Now that booleans are lowered, we can run out late opts */ - NIR_PASS(progress, nir, midgard_nir_lower_algebraic_late); - NIR_PASS(progress, nir, midgard_nir_cancel_inot); + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_dce); + /* Backend scheduler is purely local, so do some global optimizations + * to reduce register pressure. */ + nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo | + nir_move_load_input | nir_move_comparisons | + nir_move_copies | nir_move_load_ssbo; - /* Backend scheduler is purely local, so do some global optimizations - * to reduce register pressure. 
*/ - nir_move_options move_all = - nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | - nir_move_comparisons | nir_move_copies | nir_move_load_ssbo; + NIR_PASS_V(nir, nir_opt_sink, move_all); + NIR_PASS_V(nir, nir_opt_move, move_all); - NIR_PASS_V(nir, nir_opt_sink, move_all); - NIR_PASS_V(nir, nir_opt_move, move_all); + /* Take us out of SSA */ + NIR_PASS(progress, nir, nir_lower_locals_to_regs); + NIR_PASS(progress, nir, nir_convert_from_ssa, true); - /* Take us out of SSA */ - NIR_PASS(progress, nir, nir_lower_locals_to_regs); - NIR_PASS(progress, nir, nir_convert_from_ssa, true); + /* We are a vector architecture; write combine where possible */ + NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest); + NIR_PASS(progress, nir, nir_lower_vec_to_movs, NULL, NULL); - /* We are a vector architecture; write combine where possible */ - NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest); - NIR_PASS(progress, nir, nir_lower_vec_to_movs, NULL, NULL); - - NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_dce); } /* Do not actually emit a load; instead, cache the constant for inlining */ @@ -471,35 +469,37 @@ optimise_nir(nir_shader *nir, unsigned quirks, bool is_blend, bool is_blit) static void emit_load_const(compiler_context *ctx, nir_load_const_instr *instr) { - nir_ssa_def def = instr->def; + nir_ssa_def def = instr->def; - midgard_constants *consts = rzalloc(ctx, midgard_constants); + midgard_constants *consts = rzalloc(ctx, midgard_constants); - assert(instr->def.num_components * instr->def.bit_size <= sizeof(*consts) * 8); + assert(instr->def.num_components * instr->def.bit_size <= + sizeof(*consts) * 8); -#define RAW_CONST_COPY(bits) \ - nir_const_value_to_array(consts->u##bits, instr->value, \ - instr->def.num_components, u##bits) +#define RAW_CONST_COPY(bits) \ + nir_const_value_to_array(consts->u##bits, instr->value, \ + instr->def.num_components, u##bits) - switch (instr->def.bit_size) { - case 64: - RAW_CONST_COPY(64); - break; - case 32: - RAW_CONST_COPY(32); - break; - case 16: - RAW_CONST_COPY(16); - break; - case 8: - RAW_CONST_COPY(8); - break; - default: - unreachable("Invalid bit_size for load_const instruction\n"); - } + switch (instr->def.bit_size) { + case 64: + RAW_CONST_COPY(64); + break; + case 32: + RAW_CONST_COPY(32); + break; + case 16: + RAW_CONST_COPY(16); + break; + case 8: + RAW_CONST_COPY(8); + break; + default: + unreachable("Invalid bit_size for load_const instruction\n"); + } - /* Shifted for SSA, +1 for off-by-one */ - _mesa_hash_table_u64_insert(ctx->ssa_constants, (def.index << 1) + 1, consts); + /* Shifted for SSA, +1 for off-by-one */ + _mesa_hash_table_u64_insert(ctx->ssa_constants, (def.index << 1) + 1, + consts); } /* Normally constants are embedded implicitly, but for I/O and such we have to @@ -508,100 +508,102 @@ emit_load_const(compiler_context *ctx, nir_load_const_instr *instr) static void emit_explicit_constant(compiler_context *ctx, unsigned node, unsigned to) { - void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, node + 1); + void *constant_value = + _mesa_hash_table_u64_search(ctx->ssa_constants, node + 1); - if (constant_value) { - midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), to); - attach_constants(ctx, &ins, constant_value, node + 1); - emit_mir_instruction(ctx, ins); - } + if (constant_value) { + midgard_instruction ins = + v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), to); + attach_constants(ctx, &ins, constant_value, node + 1); + 
emit_mir_instruction(ctx, ins); + } } static bool nir_is_non_scalar_swizzle(nir_alu_src *src, unsigned nr_components) { - unsigned comp = src->swizzle[0]; + unsigned comp = src->swizzle[0]; - for (unsigned c = 1; c < nr_components; ++c) { - if (src->swizzle[c] != comp) - return true; - } + for (unsigned c = 1; c < nr_components; ++c) { + if (src->swizzle[c] != comp) + return true; + } - return false; + return false; } -#define ATOMIC_CASE_IMPL(ctx, instr, nir, op, is_shared) \ - case nir_intrinsic_##nir: \ - emit_atomic(ctx, instr, is_shared, midgard_op_##op, ~0); \ - break; +#define ATOMIC_CASE_IMPL(ctx, instr, nir, op, is_shared) \ + case nir_intrinsic_##nir: \ + emit_atomic(ctx, instr, is_shared, midgard_op_##op, ~0); \ + break; -#define ATOMIC_CASE(ctx, instr, nir, op) \ - ATOMIC_CASE_IMPL(ctx, instr, shared_atomic_##nir, atomic_##op, true); \ - ATOMIC_CASE_IMPL(ctx, instr, global_atomic_##nir, atomic_##op, false); +#define ATOMIC_CASE(ctx, instr, nir, op) \ + ATOMIC_CASE_IMPL(ctx, instr, shared_atomic_##nir, atomic_##op, true); \ + ATOMIC_CASE_IMPL(ctx, instr, global_atomic_##nir, atomic_##op, false); -#define IMAGE_ATOMIC_CASE(ctx, instr, nir, op) \ - case nir_intrinsic_image_atomic_##nir: { \ - midgard_instruction ins = emit_image_op(ctx, instr, true); \ - emit_atomic(ctx, instr, false, midgard_op_atomic_##op, ins.dest); \ - break; \ - } +#define IMAGE_ATOMIC_CASE(ctx, instr, nir, op) \ + case nir_intrinsic_image_atomic_##nir: { \ + midgard_instruction ins = emit_image_op(ctx, instr, true); \ + emit_atomic(ctx, instr, false, midgard_op_atomic_##op, ins.dest); \ + break; \ + } -#define ALU_CASE(nir, _op) \ - case nir_op_##nir: \ - op = midgard_alu_op_##_op; \ - assert(src_bitsize == dst_bitsize); \ - break; +#define ALU_CASE(nir, _op) \ + case nir_op_##nir: \ + op = midgard_alu_op_##_op; \ + assert(src_bitsize == dst_bitsize); \ + break; -#define ALU_CASE_RTZ(nir, _op) \ - case nir_op_##nir: \ - op = midgard_alu_op_##_op; \ - roundmode = MIDGARD_RTZ; \ - break; +#define ALU_CASE_RTZ(nir, _op) \ + case nir_op_##nir: \ + op = midgard_alu_op_##_op; \ + roundmode = MIDGARD_RTZ; \ + break; -#define ALU_CHECK_CMP() \ - assert(src_bitsize == 16 || src_bitsize == 32 || src_bitsize == 64); \ - assert(dst_bitsize == 16 || dst_bitsize == 32); \ +#define ALU_CHECK_CMP() \ + assert(src_bitsize == 16 || src_bitsize == 32 || src_bitsize == 64); \ + assert(dst_bitsize == 16 || dst_bitsize == 32); -#define ALU_CASE_BCAST(nir, _op, count) \ - case nir_op_##nir: \ - op = midgard_alu_op_##_op; \ - broadcast_swizzle = count; \ - ALU_CHECK_CMP(); \ - break; +#define ALU_CASE_BCAST(nir, _op, count) \ + case nir_op_##nir: \ + op = midgard_alu_op_##_op; \ + broadcast_swizzle = count; \ + ALU_CHECK_CMP(); \ + break; -#define ALU_CASE_CMP(nir, _op) \ - case nir_op_##nir: \ - op = midgard_alu_op_##_op; \ - ALU_CHECK_CMP(); \ - break; +#define ALU_CASE_CMP(nir, _op) \ + case nir_op_##nir: \ + op = midgard_alu_op_##_op; \ + ALU_CHECK_CMP(); \ + break; /* Compare mir_lower_invert */ static bool nir_accepts_inot(nir_op op, unsigned src) { - switch (op) { - case nir_op_ior: - case nir_op_iand: /* TODO: b2f16 */ - case nir_op_ixor: - return true; - case nir_op_b32csel: - /* Only the condition */ - return (src == 0); - default: - return false; - } + switch (op) { + case nir_op_ior: + case nir_op_iand: /* TODO: b2f16 */ + case nir_op_ixor: + return true; + case nir_op_b32csel: + /* Only the condition */ + return (src == 0); + default: + return false; + } } static bool mir_accept_dest_mod(compiler_context *ctx, nir_dest 
**dest, nir_op op) { - if (pan_has_dest_mod(dest, op)) { - assert((*dest)->is_ssa); - BITSET_SET(ctx->already_emitted, (*dest)->ssa.index); - return true; - } + if (pan_has_dest_mod(dest, op)) { + assert((*dest)->is_ssa); + BITSET_SET(ctx->already_emitted, (*dest)->ssa.index); + return true; + } - return false; + return false; } /* Look for floating point mods. We have the mods clamp_m1_1, clamp_0_1, @@ -618,77 +620,82 @@ mir_accept_dest_mod(compiler_context *ctx, nir_dest **dest, nir_op op) * clamp_m1_1 alone. */ static unsigned -mir_determine_float_outmod(compiler_context *ctx, nir_dest **dest, unsigned prior_outmod) +mir_determine_float_outmod(compiler_context *ctx, nir_dest **dest, + unsigned prior_outmod) { - bool clamp_0_inf = mir_accept_dest_mod(ctx, dest, nir_op_fclamp_pos_mali); - bool clamp_0_1 = mir_accept_dest_mod(ctx, dest, nir_op_fsat); - bool clamp_m1_1 = mir_accept_dest_mod(ctx, dest, nir_op_fsat_signed_mali); - bool prior = (prior_outmod != midgard_outmod_none); - int count = (int) prior + (int) clamp_0_inf + (int) clamp_0_1 + (int) clamp_m1_1; + bool clamp_0_inf = mir_accept_dest_mod(ctx, dest, nir_op_fclamp_pos_mali); + bool clamp_0_1 = mir_accept_dest_mod(ctx, dest, nir_op_fsat); + bool clamp_m1_1 = mir_accept_dest_mod(ctx, dest, nir_op_fsat_signed_mali); + bool prior = (prior_outmod != midgard_outmod_none); + int count = (int)prior + (int)clamp_0_inf + (int)clamp_0_1 + (int)clamp_m1_1; - return ((count > 1) || clamp_0_1) ? midgard_outmod_clamp_0_1 : - clamp_0_inf ? midgard_outmod_clamp_0_inf : - clamp_m1_1 ? midgard_outmod_clamp_m1_1 : - prior_outmod; + return ((count > 1) || clamp_0_1) ? midgard_outmod_clamp_0_1 + : clamp_0_inf ? midgard_outmod_clamp_0_inf + : clamp_m1_1 ? midgard_outmod_clamp_m1_1 + : prior_outmod; } static void -mir_copy_src(midgard_instruction *ins, nir_alu_instr *instr, unsigned i, unsigned to, bool *abs, bool *neg, bool *not, enum midgard_roundmode *roundmode, bool is_int, unsigned bcast_count) +mir_copy_src(midgard_instruction *ins, nir_alu_instr *instr, unsigned i, + unsigned to, bool *abs, bool *neg, bool * not, + enum midgard_roundmode *roundmode, bool is_int, + unsigned bcast_count) { - nir_alu_src src = instr->src[i]; + nir_alu_src src = instr->src[i]; - if (!is_int) { - if (pan_has_source_mod(&src, nir_op_fneg)) - *neg = !(*neg); + if (!is_int) { + if (pan_has_source_mod(&src, nir_op_fneg)) + *neg = !(*neg); - if (pan_has_source_mod(&src, nir_op_fabs)) - *abs = true; - } + if (pan_has_source_mod(&src, nir_op_fabs)) + *abs = true; + } - if (nir_accepts_inot(instr->op, i) && pan_has_source_mod(&src, nir_op_inot)) - *not = true; + if (nir_accepts_inot(instr->op, i) && pan_has_source_mod(&src, nir_op_inot)) + *not = true; - if (roundmode) { - if (pan_has_source_mod(&src, nir_op_fround_even)) - *roundmode = MIDGARD_RTE; + if (roundmode) { + if (pan_has_source_mod(&src, nir_op_fround_even)) + *roundmode = MIDGARD_RTE; - if (pan_has_source_mod(&src, nir_op_ftrunc)) - *roundmode = MIDGARD_RTZ; + if (pan_has_source_mod(&src, nir_op_ftrunc)) + *roundmode = MIDGARD_RTZ; - if (pan_has_source_mod(&src, nir_op_ffloor)) - *roundmode = MIDGARD_RTN; + if (pan_has_source_mod(&src, nir_op_ffloor)) + *roundmode = MIDGARD_RTN; - if (pan_has_source_mod(&src, nir_op_fceil)) - *roundmode = MIDGARD_RTP; - } + if (pan_has_source_mod(&src, nir_op_fceil)) + *roundmode = MIDGARD_RTP; + } - unsigned bits = nir_src_bit_size(src.src); + unsigned bits = nir_src_bit_size(src.src); - ins->src[to] = nir_src_index(NULL, &src.src); - ins->src_types[to] = 
nir_op_infos[instr->op].input_types[i] | bits; + ins->src[to] = nir_src_index(NULL, &src.src); + ins->src_types[to] = nir_op_infos[instr->op].input_types[i] | bits; - /* Figure out which component we should fill unused channels with. This - * doesn't matter too much in the non-broadcast case, but it makes - * should that scalar sources are packed with replicated swizzles, - * which works around issues seen with the combination of source - * expansion and destination shrinking. - */ - unsigned replicate_c = 0; - if (bcast_count) { - replicate_c = bcast_count - 1; - } else { - for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; ++c) { - if (nir_alu_instr_channel_used(instr, i, c)) - replicate_c = c; - } - } + /* Figure out which component we should fill unused channels with. This + * doesn't matter too much in the non-broadcast case, but it makes + * should that scalar sources are packed with replicated swizzles, + * which works around issues seen with the combination of source + * expansion and destination shrinking. + */ + unsigned replicate_c = 0; + if (bcast_count) { + replicate_c = bcast_count - 1; + } else { + for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; ++c) { + if (nir_alu_instr_channel_used(instr, i, c)) + replicate_c = c; + } + } - for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; ++c) { - ins->swizzle[to][c] = src.swizzle[ - ((!bcast_count || c < bcast_count) && - nir_alu_instr_channel_used(instr, i, c)) ? - c : replicate_c]; - } + for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; ++c) { + ins->swizzle[to][c] = + src.swizzle[((!bcast_count || c < bcast_count) && + nir_alu_instr_channel_used(instr, i, c)) + ? c + : replicate_c]; + } } /* Midgard features both fcsel and icsel, depending on whether you want int or @@ -699,480 +706,474 @@ mir_copy_src(midgard_instruction *ins, nir_alu_instr *instr, unsigned i, unsigne static bool mir_is_bcsel_float(nir_alu_instr *instr) { - nir_op intmods[] = { - nir_op_i2i8, nir_op_i2i16, - nir_op_i2i32, nir_op_i2i64 - }; + nir_op intmods[] = {nir_op_i2i8, nir_op_i2i16, nir_op_i2i32, nir_op_i2i64}; - nir_op floatmods[] = { - nir_op_fabs, nir_op_fneg, - nir_op_f2f16, nir_op_f2f32, - nir_op_f2f64 - }; + nir_op floatmods[] = {nir_op_fabs, nir_op_fneg, nir_op_f2f16, nir_op_f2f32, + nir_op_f2f64}; - nir_op floatdestmods[] = { - nir_op_fsat, nir_op_fsat_signed_mali, nir_op_fclamp_pos_mali, - nir_op_f2f16, nir_op_f2f32 - }; + nir_op floatdestmods[] = {nir_op_fsat, nir_op_fsat_signed_mali, + nir_op_fclamp_pos_mali, nir_op_f2f16, + nir_op_f2f32}; - signed score = 0; + signed score = 0; - for (unsigned i = 1; i < 3; ++i) { - nir_alu_src s = instr->src[i]; - for (unsigned q = 0; q < ARRAY_SIZE(intmods); ++q) { - if (pan_has_source_mod(&s, intmods[q])) - score--; - } - } + for (unsigned i = 1; i < 3; ++i) { + nir_alu_src s = instr->src[i]; + for (unsigned q = 0; q < ARRAY_SIZE(intmods); ++q) { + if (pan_has_source_mod(&s, intmods[q])) + score--; + } + } - for (unsigned i = 1; i < 3; ++i) { - nir_alu_src s = instr->src[i]; - for (unsigned q = 0; q < ARRAY_SIZE(floatmods); ++q) { - if (pan_has_source_mod(&s, floatmods[q])) - score++; - } - } + for (unsigned i = 1; i < 3; ++i) { + nir_alu_src s = instr->src[i]; + for (unsigned q = 0; q < ARRAY_SIZE(floatmods); ++q) { + if (pan_has_source_mod(&s, floatmods[q])) + score++; + } + } - for (unsigned q = 0; q < ARRAY_SIZE(floatdestmods); ++q) { - nir_dest *dest = &instr->dest.dest; - if (pan_has_dest_mod(&dest, floatdestmods[q])) - score++; - } + for (unsigned q = 0; q < ARRAY_SIZE(floatdestmods); ++q) { + nir_dest *dest 
= &instr->dest.dest; + if (pan_has_dest_mod(&dest, floatdestmods[q])) + score++; + } - return (score > 0); + return (score > 0); } static void emit_alu(compiler_context *ctx, nir_alu_instr *instr) { - nir_dest *dest = &instr->dest.dest; + nir_dest *dest = &instr->dest.dest; - if (dest->is_ssa && BITSET_TEST(ctx->already_emitted, dest->ssa.index)) - return; + if (dest->is_ssa && BITSET_TEST(ctx->already_emitted, dest->ssa.index)) + return; - /* Derivatives end up emitted on the texture pipe, not the ALUs. This - * is handled elsewhere */ + /* Derivatives end up emitted on the texture pipe, not the ALUs. This + * is handled elsewhere */ - if (instr->op == nir_op_fddx || instr->op == nir_op_fddy) { - midgard_emit_derivatives(ctx, instr); - return; - } + if (instr->op == nir_op_fddx || instr->op == nir_op_fddy) { + midgard_emit_derivatives(ctx, instr); + return; + } - bool is_ssa = dest->is_ssa; + bool is_ssa = dest->is_ssa; - unsigned nr_components = nir_dest_num_components(*dest); - unsigned nr_inputs = nir_op_infos[instr->op].num_inputs; - unsigned op = 0; + unsigned nr_components = nir_dest_num_components(*dest); + unsigned nr_inputs = nir_op_infos[instr->op].num_inputs; + unsigned op = 0; - /* Number of components valid to check for the instruction (the rest - * will be forced to the last), or 0 to use as-is. Relevant as - * ball-type instructions have a channel count in NIR but are all vec4 - * in Midgard */ + /* Number of components valid to check for the instruction (the rest + * will be forced to the last), or 0 to use as-is. Relevant as + * ball-type instructions have a channel count in NIR but are all vec4 + * in Midgard */ - unsigned broadcast_swizzle = 0; + unsigned broadcast_swizzle = 0; - /* Should we swap arguments? */ - bool flip_src12 = false; + /* Should we swap arguments? 
*/ + bool flip_src12 = false; - ASSERTED unsigned src_bitsize = nir_src_bit_size(instr->src[0].src); - ASSERTED unsigned dst_bitsize = nir_dest_bit_size(*dest); + ASSERTED unsigned src_bitsize = nir_src_bit_size(instr->src[0].src); + ASSERTED unsigned dst_bitsize = nir_dest_bit_size(*dest); - enum midgard_roundmode roundmode = MIDGARD_RTE; + enum midgard_roundmode roundmode = MIDGARD_RTE; - switch (instr->op) { - ALU_CASE(fadd, fadd); - ALU_CASE(fmul, fmul); - ALU_CASE(fmin, fmin); - ALU_CASE(fmax, fmax); - ALU_CASE(imin, imin); - ALU_CASE(imax, imax); - ALU_CASE(umin, umin); - ALU_CASE(umax, umax); - ALU_CASE(ffloor, ffloor); - ALU_CASE(fround_even, froundeven); - ALU_CASE(ftrunc, ftrunc); - ALU_CASE(fceil, fceil); - ALU_CASE(fdot3, fdot3); - ALU_CASE(fdot4, fdot4); - ALU_CASE(iadd, iadd); - ALU_CASE(isub, isub); - ALU_CASE(iadd_sat, iaddsat); - ALU_CASE(isub_sat, isubsat); - ALU_CASE(uadd_sat, uaddsat); - ALU_CASE(usub_sat, usubsat); - ALU_CASE(imul, imul); - ALU_CASE(imul_high, imul); - ALU_CASE(umul_high, imul); - ALU_CASE(uclz, iclz); + switch (instr->op) { + ALU_CASE(fadd, fadd); + ALU_CASE(fmul, fmul); + ALU_CASE(fmin, fmin); + ALU_CASE(fmax, fmax); + ALU_CASE(imin, imin); + ALU_CASE(imax, imax); + ALU_CASE(umin, umin); + ALU_CASE(umax, umax); + ALU_CASE(ffloor, ffloor); + ALU_CASE(fround_even, froundeven); + ALU_CASE(ftrunc, ftrunc); + ALU_CASE(fceil, fceil); + ALU_CASE(fdot3, fdot3); + ALU_CASE(fdot4, fdot4); + ALU_CASE(iadd, iadd); + ALU_CASE(isub, isub); + ALU_CASE(iadd_sat, iaddsat); + ALU_CASE(isub_sat, isubsat); + ALU_CASE(uadd_sat, uaddsat); + ALU_CASE(usub_sat, usubsat); + ALU_CASE(imul, imul); + ALU_CASE(imul_high, imul); + ALU_CASE(umul_high, imul); + ALU_CASE(uclz, iclz); - /* Zero shoved as second-arg */ - ALU_CASE(iabs, iabsdiff); + /* Zero shoved as second-arg */ + ALU_CASE(iabs, iabsdiff); - ALU_CASE(uabs_isub, iabsdiff); - ALU_CASE(uabs_usub, uabsdiff); + ALU_CASE(uabs_isub, iabsdiff); + ALU_CASE(uabs_usub, uabsdiff); - ALU_CASE(mov, imov); + ALU_CASE(mov, imov); - ALU_CASE_CMP(feq32, feq); - ALU_CASE_CMP(fneu32, fne); - ALU_CASE_CMP(flt32, flt); - ALU_CASE_CMP(ieq32, ieq); - ALU_CASE_CMP(ine32, ine); - ALU_CASE_CMP(ilt32, ilt); - ALU_CASE_CMP(ult32, ult); + ALU_CASE_CMP(feq32, feq); + ALU_CASE_CMP(fneu32, fne); + ALU_CASE_CMP(flt32, flt); + ALU_CASE_CMP(ieq32, ieq); + ALU_CASE_CMP(ine32, ine); + ALU_CASE_CMP(ilt32, ilt); + ALU_CASE_CMP(ult32, ult); - /* We don't have a native b2f32 instruction. Instead, like many - * GPUs, we exploit booleans as 0/~0 for false/true, and - * correspondingly AND - * by 1.0 to do the type conversion. For the moment, prime us - * to emit: - * - * iand [whatever], #0 - * - * At the end of emit_alu (as MIR), we'll fix-up the constant - */ + /* We don't have a native b2f32 instruction. Instead, like many + * GPUs, we exploit booleans as 0/~0 for false/true, and + * correspondingly AND + * by 1.0 to do the type conversion. For the moment, prime us + * to emit: + * + * iand [whatever], #0 + * + * At the end of emit_alu (as MIR), we'll fix-up the constant + */ - ALU_CASE_CMP(b2f32, iand); - ALU_CASE_CMP(b2f16, iand); - ALU_CASE_CMP(b2i32, iand); + ALU_CASE_CMP(b2f32, iand); + ALU_CASE_CMP(b2f16, iand); + ALU_CASE_CMP(b2i32, iand); - /* Likewise, we don't have a dedicated f2b32 instruction, but - * we can do a "not equal to 0.0" test. */ + /* Likewise, we don't have a dedicated f2b32 instruction, but + * we can do a "not equal to 0.0" test. 
*/ - ALU_CASE_CMP(f2b32, fne); + ALU_CASE_CMP(f2b32, fne); - ALU_CASE(frcp, frcp); - ALU_CASE(frsq, frsqrt); - ALU_CASE(fsqrt, fsqrt); - ALU_CASE(fexp2, fexp2); - ALU_CASE(flog2, flog2); + ALU_CASE(frcp, frcp); + ALU_CASE(frsq, frsqrt); + ALU_CASE(fsqrt, fsqrt); + ALU_CASE(fexp2, fexp2); + ALU_CASE(flog2, flog2); - ALU_CASE_RTZ(f2i64, f2i_rte); - ALU_CASE_RTZ(f2u64, f2u_rte); - ALU_CASE_RTZ(i2f64, i2f_rte); - ALU_CASE_RTZ(u2f64, u2f_rte); + ALU_CASE_RTZ(f2i64, f2i_rte); + ALU_CASE_RTZ(f2u64, f2u_rte); + ALU_CASE_RTZ(i2f64, i2f_rte); + ALU_CASE_RTZ(u2f64, u2f_rte); - ALU_CASE_RTZ(f2i32, f2i_rte); - ALU_CASE_RTZ(f2u32, f2u_rte); - ALU_CASE_RTZ(i2f32, i2f_rte); - ALU_CASE_RTZ(u2f32, u2f_rte); + ALU_CASE_RTZ(f2i32, f2i_rte); + ALU_CASE_RTZ(f2u32, f2u_rte); + ALU_CASE_RTZ(i2f32, i2f_rte); + ALU_CASE_RTZ(u2f32, u2f_rte); - ALU_CASE_RTZ(f2i8, f2i_rte); - ALU_CASE_RTZ(f2u8, f2u_rte); + ALU_CASE_RTZ(f2i8, f2i_rte); + ALU_CASE_RTZ(f2u8, f2u_rte); - ALU_CASE_RTZ(f2i16, f2i_rte); - ALU_CASE_RTZ(f2u16, f2u_rte); - ALU_CASE_RTZ(i2f16, i2f_rte); - ALU_CASE_RTZ(u2f16, u2f_rte); + ALU_CASE_RTZ(f2i16, f2i_rte); + ALU_CASE_RTZ(f2u16, f2u_rte); + ALU_CASE_RTZ(i2f16, i2f_rte); + ALU_CASE_RTZ(u2f16, u2f_rte); - ALU_CASE(fsin, fsinpi); - ALU_CASE(fcos, fcospi); + ALU_CASE(fsin, fsinpi); + ALU_CASE(fcos, fcospi); - /* We'll get 0 in the second arg, so: - * ~a = ~(a | 0) = nor(a, 0) */ - ALU_CASE(inot, inor); - ALU_CASE(iand, iand); - ALU_CASE(ior, ior); - ALU_CASE(ixor, ixor); - ALU_CASE(ishl, ishl); - ALU_CASE(ishr, iasr); - ALU_CASE(ushr, ilsr); + /* We'll get 0 in the second arg, so: + * ~a = ~(a | 0) = nor(a, 0) */ + ALU_CASE(inot, inor); + ALU_CASE(iand, iand); + ALU_CASE(ior, ior); + ALU_CASE(ixor, ixor); + ALU_CASE(ishl, ishl); + ALU_CASE(ishr, iasr); + ALU_CASE(ushr, ilsr); - ALU_CASE_BCAST(b32all_fequal2, fball_eq, 2); - ALU_CASE_BCAST(b32all_fequal3, fball_eq, 3); - ALU_CASE_CMP(b32all_fequal4, fball_eq); + ALU_CASE_BCAST(b32all_fequal2, fball_eq, 2); + ALU_CASE_BCAST(b32all_fequal3, fball_eq, 3); + ALU_CASE_CMP(b32all_fequal4, fball_eq); - ALU_CASE_BCAST(b32any_fnequal2, fbany_neq, 2); - ALU_CASE_BCAST(b32any_fnequal3, fbany_neq, 3); - ALU_CASE_CMP(b32any_fnequal4, fbany_neq); + ALU_CASE_BCAST(b32any_fnequal2, fbany_neq, 2); + ALU_CASE_BCAST(b32any_fnequal3, fbany_neq, 3); + ALU_CASE_CMP(b32any_fnequal4, fbany_neq); - ALU_CASE_BCAST(b32all_iequal2, iball_eq, 2); - ALU_CASE_BCAST(b32all_iequal3, iball_eq, 3); - ALU_CASE_CMP(b32all_iequal4, iball_eq); + ALU_CASE_BCAST(b32all_iequal2, iball_eq, 2); + ALU_CASE_BCAST(b32all_iequal3, iball_eq, 3); + ALU_CASE_CMP(b32all_iequal4, iball_eq); - ALU_CASE_BCAST(b32any_inequal2, ibany_neq, 2); - ALU_CASE_BCAST(b32any_inequal3, ibany_neq, 3); - ALU_CASE_CMP(b32any_inequal4, ibany_neq); + ALU_CASE_BCAST(b32any_inequal2, ibany_neq, 2); + ALU_CASE_BCAST(b32any_inequal3, ibany_neq, 3); + ALU_CASE_CMP(b32any_inequal4, ibany_neq); - /* Source mods will be shoved in later */ - ALU_CASE(fabs, fmov); - ALU_CASE(fneg, fmov); - ALU_CASE(fsat, fmov); - ALU_CASE(fsat_signed_mali, fmov); - ALU_CASE(fclamp_pos_mali, fmov); + /* Source mods will be shoved in later */ + ALU_CASE(fabs, fmov); + ALU_CASE(fneg, fmov); + ALU_CASE(fsat, fmov); + ALU_CASE(fsat_signed_mali, fmov); + ALU_CASE(fclamp_pos_mali, fmov); - /* For size conversion, we use a move. Ideally though we would squash - * these ops together; maybe that has to happen after in NIR as part of - * propagation...? An earlier algebraic pass ensured we step down by - * only / exactly one size. 
If stepping down, we use a dest override to - * reduce the size; if stepping up, we use a larger-sized move with a - * half source and a sign/zero-extension modifier */ + /* For size conversion, we use a move. Ideally though we would squash + * these ops together; maybe that has to happen after in NIR as part of + * propagation...? An earlier algebraic pass ensured we step down by + * only / exactly one size. If stepping down, we use a dest override to + * reduce the size; if stepping up, we use a larger-sized move with a + * half source and a sign/zero-extension modifier */ - case nir_op_i2i8: - case nir_op_i2i16: - case nir_op_i2i32: - case nir_op_i2i64: - case nir_op_u2u8: - case nir_op_u2u16: - case nir_op_u2u32: - case nir_op_u2u64: - case nir_op_f2f16: - case nir_op_f2f32: - case nir_op_f2f64: { - if (instr->op == nir_op_f2f16 || instr->op == nir_op_f2f32 || - instr->op == nir_op_f2f64) - op = midgard_alu_op_fmov; - else - op = midgard_alu_op_imov; + case nir_op_i2i8: + case nir_op_i2i16: + case nir_op_i2i32: + case nir_op_i2i64: + case nir_op_u2u8: + case nir_op_u2u16: + case nir_op_u2u32: + case nir_op_u2u64: + case nir_op_f2f16: + case nir_op_f2f32: + case nir_op_f2f64: { + if (instr->op == nir_op_f2f16 || instr->op == nir_op_f2f32 || + instr->op == nir_op_f2f64) + op = midgard_alu_op_fmov; + else + op = midgard_alu_op_imov; - break; - } + break; + } - /* For greater-or-equal, we lower to less-or-equal and flip the - * arguments */ + /* For greater-or-equal, we lower to less-or-equal and flip the + * arguments */ - case nir_op_fge: - case nir_op_fge32: - case nir_op_ige32: - case nir_op_uge32: { - op = - instr->op == nir_op_fge ? midgard_alu_op_fle : - instr->op == nir_op_fge32 ? midgard_alu_op_fle : - instr->op == nir_op_ige32 ? midgard_alu_op_ile : - instr->op == nir_op_uge32 ? midgard_alu_op_ule : - 0; + case nir_op_fge: + case nir_op_fge32: + case nir_op_ige32: + case nir_op_uge32: { + op = instr->op == nir_op_fge ? midgard_alu_op_fle + : instr->op == nir_op_fge32 ? midgard_alu_op_fle + : instr->op == nir_op_ige32 ? midgard_alu_op_ile + : instr->op == nir_op_uge32 ? midgard_alu_op_ule + : 0; - flip_src12 = true; - ALU_CHECK_CMP(); - break; - } + flip_src12 = true; + ALU_CHECK_CMP(); + break; + } - case nir_op_b32csel: { - bool mixed = nir_is_non_scalar_swizzle(&instr->src[0], nr_components); - bool is_float = mir_is_bcsel_float(instr); - op = is_float ? - (mixed ? midgard_alu_op_fcsel_v : midgard_alu_op_fcsel) : - (mixed ? midgard_alu_op_icsel_v : midgard_alu_op_icsel); + case nir_op_b32csel: { + bool mixed = nir_is_non_scalar_swizzle(&instr->src[0], nr_components); + bool is_float = mir_is_bcsel_float(instr); + op = is_float ? (mixed ? midgard_alu_op_fcsel_v : midgard_alu_op_fcsel) + : (mixed ? 
midgard_alu_op_icsel_v : midgard_alu_op_icsel); - break; - } + break; + } - case nir_op_unpack_32_2x16: - case nir_op_unpack_32_4x8: - case nir_op_pack_32_2x16: - case nir_op_pack_32_4x8: { - op = midgard_alu_op_imov; - break; - } + case nir_op_unpack_32_2x16: + case nir_op_unpack_32_4x8: + case nir_op_pack_32_2x16: + case nir_op_pack_32_4x8: { + op = midgard_alu_op_imov; + break; + } - default: - mesa_loge("Unhandled ALU op %s\n", nir_op_infos[instr->op].name); - assert(0); - return; - } + default: + mesa_loge("Unhandled ALU op %s\n", nir_op_infos[instr->op].name); + assert(0); + return; + } - /* Promote imov to fmov if it might help inline a constant */ - if (op == midgard_alu_op_imov && nir_src_is_const(instr->src[0].src) - && nir_src_bit_size(instr->src[0].src) == 32 - && nir_is_same_comp_swizzle(instr->src[0].swizzle, + /* Promote imov to fmov if it might help inline a constant */ + if (op == midgard_alu_op_imov && nir_src_is_const(instr->src[0].src) && + nir_src_bit_size(instr->src[0].src) == 32 && + nir_is_same_comp_swizzle(instr->src[0].swizzle, nir_src_num_components(instr->src[0].src))) { - op = midgard_alu_op_fmov; - } + op = midgard_alu_op_fmov; + } - /* Midgard can perform certain modifiers on output of an ALU op */ + /* Midgard can perform certain modifiers on output of an ALU op */ - unsigned outmod = 0; - bool is_int = midgard_is_integer_op(op); + unsigned outmod = 0; + bool is_int = midgard_is_integer_op(op); - if (instr->op == nir_op_umul_high || instr->op == nir_op_imul_high) { - outmod = midgard_outmod_keephi; - } else if (midgard_is_integer_out_op(op)) { - outmod = midgard_outmod_keeplo; - } else if (instr->op == nir_op_fsat) { - outmod = midgard_outmod_clamp_0_1; - } else if (instr->op == nir_op_fsat_signed_mali) { - outmod = midgard_outmod_clamp_m1_1; - } else if (instr->op == nir_op_fclamp_pos_mali) { - outmod = midgard_outmod_clamp_0_inf; - } + if (instr->op == nir_op_umul_high || instr->op == nir_op_imul_high) { + outmod = midgard_outmod_keephi; + } else if (midgard_is_integer_out_op(op)) { + outmod = midgard_outmod_keeplo; + } else if (instr->op == nir_op_fsat) { + outmod = midgard_outmod_clamp_0_1; + } else if (instr->op == nir_op_fsat_signed_mali) { + outmod = midgard_outmod_clamp_m1_1; + } else if (instr->op == nir_op_fclamp_pos_mali) { + outmod = midgard_outmod_clamp_0_inf; + } - /* Fetch unit, quirks, etc information */ - unsigned opcode_props = alu_opcode_props[op].props; - bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24; + /* Fetch unit, quirks, etc information */ + unsigned opcode_props = alu_opcode_props[op].props; + bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24; - if (!midgard_is_integer_out_op(op)) { - outmod = mir_determine_float_outmod(ctx, &dest, outmod); - } + if (!midgard_is_integer_out_op(op)) { + outmod = mir_determine_float_outmod(ctx, &dest, outmod); + } - midgard_instruction ins = { - .type = TAG_ALU_4, - .dest = nir_dest_index(dest), - .dest_type = nir_op_infos[instr->op].output_type - | nir_dest_bit_size(*dest), - .roundmode = roundmode, - }; + midgard_instruction ins = { + .type = TAG_ALU_4, + .dest = nir_dest_index(dest), + .dest_type = + nir_op_infos[instr->op].output_type | nir_dest_bit_size(*dest), + .roundmode = roundmode, + }; - enum midgard_roundmode *roundptr = (opcode_props & MIDGARD_ROUNDS) ? - &ins.roundmode : NULL; + enum midgard_roundmode *roundptr = + (opcode_props & MIDGARD_ROUNDS) ? 
&ins.roundmode : NULL; - for (unsigned i = nr_inputs; i < ARRAY_SIZE(ins.src); ++i) - ins.src[i] = ~0; + for (unsigned i = nr_inputs; i < ARRAY_SIZE(ins.src); ++i) + ins.src[i] = ~0; - if (quirk_flipped_r24) { - ins.src[0] = ~0; - mir_copy_src(&ins, instr, 0, 1, &ins.src_abs[1], &ins.src_neg[1], &ins.src_invert[1], roundptr, is_int, broadcast_swizzle); - } else { - for (unsigned i = 0; i < nr_inputs; ++i) { - unsigned to = i; + if (quirk_flipped_r24) { + ins.src[0] = ~0; + mir_copy_src(&ins, instr, 0, 1, &ins.src_abs[1], &ins.src_neg[1], + &ins.src_invert[1], roundptr, is_int, broadcast_swizzle); + } else { + for (unsigned i = 0; i < nr_inputs; ++i) { + unsigned to = i; - if (instr->op == nir_op_b32csel) { - /* The condition is the first argument; move - * the other arguments up one to be a binary - * instruction for Midgard with the condition - * last */ + if (instr->op == nir_op_b32csel) { + /* The condition is the first argument; move + * the other arguments up one to be a binary + * instruction for Midgard with the condition + * last */ - if (i == 0) - to = 2; - else if (flip_src12) - to = 2 - i; - else - to = i - 1; - } else if (flip_src12) { - to = 1 - to; - } + if (i == 0) + to = 2; + else if (flip_src12) + to = 2 - i; + else + to = i - 1; + } else if (flip_src12) { + to = 1 - to; + } - mir_copy_src(&ins, instr, i, to, &ins.src_abs[to], &ins.src_neg[to], &ins.src_invert[to], roundptr, is_int, broadcast_swizzle); + mir_copy_src(&ins, instr, i, to, &ins.src_abs[to], &ins.src_neg[to], + &ins.src_invert[to], roundptr, is_int, broadcast_swizzle); - /* (!c) ? a : b = c ? b : a */ - if (instr->op == nir_op_b32csel && ins.src_invert[2]) { - ins.src_invert[2] = false; - flip_src12 ^= true; - } - } - } + /* (!c) ? a : b = c ? b : a */ + if (instr->op == nir_op_b32csel && ins.src_invert[2]) { + ins.src_invert[2] = false; + flip_src12 ^= true; + } + } + } - if (instr->op == nir_op_fneg || instr->op == nir_op_fabs) { - /* Lowered to move */ - if (instr->op == nir_op_fneg) - ins.src_neg[1] ^= true; + if (instr->op == nir_op_fneg || instr->op == nir_op_fabs) { + /* Lowered to move */ + if (instr->op == nir_op_fneg) + ins.src_neg[1] ^= true; - if (instr->op == nir_op_fabs) - ins.src_abs[1] = true; - } + if (instr->op == nir_op_fabs) + ins.src_abs[1] = true; + } - ins.mask = mask_of(nr_components); + ins.mask = mask_of(nr_components); - /* Apply writemask if non-SSA, keeping in mind that we can't write to - * components that don't exist. Note modifier => SSA => !reg => no - * writemask, so we don't have to worry about writemasks here.*/ + /* Apply writemask if non-SSA, keeping in mind that we can't write to + * components that don't exist. Note modifier => SSA => !reg => no + * writemask, so we don't have to worry about writemasks here.*/ - if (!is_ssa) - ins.mask &= instr->dest.write_mask; + if (!is_ssa) + ins.mask &= instr->dest.write_mask; - ins.op = op; - ins.outmod = outmod; + ins.op = op; + ins.outmod = outmod; - /* Late fixup for emulated instructions */ + /* Late fixup for emulated instructions */ - if (instr->op == nir_op_b2f32 || instr->op == nir_op_b2i32) { - /* Presently, our second argument is an inline #0 constant. - * Switch over to an embedded 1.0 constant (that can't fit - * inline, since we're 32-bit, not 16-bit like the inline - * constants) */ + if (instr->op == nir_op_b2f32 || instr->op == nir_op_b2i32) { + /* Presently, our second argument is an inline #0 constant. 
+ * Switch over to an embedded 1.0 constant (that can't fit + * inline, since we're 32-bit, not 16-bit like the inline + * constants) */ - ins.has_inline_constant = false; - ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); - ins.src_types[1] = nir_type_float32; - ins.has_constants = true; + ins.has_inline_constant = false; + ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + ins.src_types[1] = nir_type_float32; + ins.has_constants = true; - if (instr->op == nir_op_b2f32) - ins.constants.f32[0] = 1.0f; - else - ins.constants.i32[0] = 1; + if (instr->op == nir_op_b2f32) + ins.constants.f32[0] = 1.0f; + else + ins.constants.i32[0] = 1; - for (unsigned c = 0; c < 16; ++c) - ins.swizzle[1][c] = 0; - } else if (instr->op == nir_op_b2f16) { - ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); - ins.src_types[1] = nir_type_float16; - ins.has_constants = true; - ins.constants.i16[0] = _mesa_float_to_half(1.0); + for (unsigned c = 0; c < 16; ++c) + ins.swizzle[1][c] = 0; + } else if (instr->op == nir_op_b2f16) { + ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + ins.src_types[1] = nir_type_float16; + ins.has_constants = true; + ins.constants.i16[0] = _mesa_float_to_half(1.0); - for (unsigned c = 0; c < 16; ++c) - ins.swizzle[1][c] = 0; - } else if (nr_inputs == 1 && !quirk_flipped_r24) { - /* Lots of instructions need a 0 plonked in */ - ins.has_inline_constant = false; - ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); - ins.src_types[1] = ins.src_types[0]; - ins.has_constants = true; - ins.constants.u32[0] = 0; + for (unsigned c = 0; c < 16; ++c) + ins.swizzle[1][c] = 0; + } else if (nr_inputs == 1 && !quirk_flipped_r24) { + /* Lots of instructions need a 0 plonked in */ + ins.has_inline_constant = false; + ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + ins.src_types[1] = ins.src_types[0]; + ins.has_constants = true; + ins.constants.u32[0] = 0; - for (unsigned c = 0; c < 16; ++c) - ins.swizzle[1][c] = 0; - } else if (instr->op == nir_op_pack_32_2x16) { - ins.dest_type = nir_type_uint16; - ins.mask = mask_of(nr_components * 2); - ins.is_pack = true; - } else if (instr->op == nir_op_pack_32_4x8) { - ins.dest_type = nir_type_uint8; - ins.mask = mask_of(nr_components * 4); - ins.is_pack = true; - } else if (instr->op == nir_op_unpack_32_2x16) { - ins.dest_type = nir_type_uint32; - ins.mask = mask_of(nr_components >> 1); - ins.is_pack = true; - } else if (instr->op == nir_op_unpack_32_4x8) { - ins.dest_type = nir_type_uint32; - ins.mask = mask_of(nr_components >> 2); - ins.is_pack = true; - } + for (unsigned c = 0; c < 16; ++c) + ins.swizzle[1][c] = 0; + } else if (instr->op == nir_op_pack_32_2x16) { + ins.dest_type = nir_type_uint16; + ins.mask = mask_of(nr_components * 2); + ins.is_pack = true; + } else if (instr->op == nir_op_pack_32_4x8) { + ins.dest_type = nir_type_uint8; + ins.mask = mask_of(nr_components * 4); + ins.is_pack = true; + } else if (instr->op == nir_op_unpack_32_2x16) { + ins.dest_type = nir_type_uint32; + ins.mask = mask_of(nr_components >> 1); + ins.is_pack = true; + } else if (instr->op == nir_op_unpack_32_4x8) { + ins.dest_type = nir_type_uint32; + ins.mask = mask_of(nr_components >> 2); + ins.is_pack = true; + } - if ((opcode_props & UNITS_ALL) == UNIT_VLUT) { - /* To avoid duplicating the lookup tables (probably), true LUT - * instructions can only operate as if they were scalars. Lower - * them here by changing the component. 
*/ + if ((opcode_props & UNITS_ALL) == UNIT_VLUT) { + /* To avoid duplicating the lookup tables (probably), true LUT + * instructions can only operate as if they were scalars. Lower + * them here by changing the component. */ - unsigned orig_mask = ins.mask; + unsigned orig_mask = ins.mask; - unsigned swizzle_back[MIR_VEC_COMPONENTS]; - memcpy(&swizzle_back, ins.swizzle[0], sizeof(swizzle_back)); + unsigned swizzle_back[MIR_VEC_COMPONENTS]; + memcpy(&swizzle_back, ins.swizzle[0], sizeof(swizzle_back)); - midgard_instruction ins_split[MIR_VEC_COMPONENTS]; - unsigned ins_count = 0; + midgard_instruction ins_split[MIR_VEC_COMPONENTS]; + unsigned ins_count = 0; - for (int i = 0; i < nr_components; ++i) { - /* Mask the associated component, dropping the - * instruction if needed */ + for (int i = 0; i < nr_components; ++i) { + /* Mask the associated component, dropping the + * instruction if needed */ - ins.mask = 1 << i; - ins.mask &= orig_mask; + ins.mask = 1 << i; + ins.mask &= orig_mask; - for (unsigned j = 0; j < ins_count; ++j) { - if (swizzle_back[i] == ins_split[j].swizzle[0][0]) { - ins_split[j].mask |= ins.mask; - ins.mask = 0; - break; - } - } + for (unsigned j = 0; j < ins_count; ++j) { + if (swizzle_back[i] == ins_split[j].swizzle[0][0]) { + ins_split[j].mask |= ins.mask; + ins.mask = 0; + break; + } + } - if (!ins.mask) - continue; + if (!ins.mask) + continue; - for (unsigned j = 0; j < MIR_VEC_COMPONENTS; ++j) - ins.swizzle[0][j] = swizzle_back[i]; /* Pull from the correct component */ + for (unsigned j = 0; j < MIR_VEC_COMPONENTS; ++j) + ins.swizzle[0][j] = + swizzle_back[i]; /* Pull from the correct component */ - ins_split[ins_count] = ins; + ins_split[ins_count] = ins; - ++ins_count; - } + ++ins_count; + } - for (unsigned i = 0; i < ins_count; ++i) { - emit_mir_instruction(ctx, ins_split[i]); - } - } else { - emit_mir_instruction(ctx, ins); - } + for (unsigned i = 0; i < ins_count; ++i) { + emit_mir_instruction(ctx, ins_split[i]); + } + } else { + emit_mir_instruction(ctx, ins); + } } #undef ALU_CASE @@ -1180,179 +1181,179 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) static void mir_set_intr_mask(nir_instr *instr, midgard_instruction *ins, bool is_read) { - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - unsigned nir_mask = 0; - unsigned dsize = 0; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + unsigned nir_mask = 0; + unsigned dsize = 0; - if (is_read) { - nir_mask = mask_of(nir_intrinsic_dest_components(intr)); + if (is_read) { + nir_mask = mask_of(nir_intrinsic_dest_components(intr)); - /* Extension is mandatory for 8/16-bit loads */ - dsize = nir_dest_bit_size(intr->dest) == 64 ? 64 : 32; - } else { - nir_mask = nir_intrinsic_write_mask(intr); - dsize = OP_IS_COMMON_STORE(ins->op) ? - nir_src_bit_size(intr->src[0]) : 32; - } + /* Extension is mandatory for 8/16-bit loads */ + dsize = nir_dest_bit_size(intr->dest) == 64 ? 64 : 32; + } else { + nir_mask = nir_intrinsic_write_mask(intr); + dsize = OP_IS_COMMON_STORE(ins->op) ? 
nir_src_bit_size(intr->src[0]) : 32; + } - /* Once we have the NIR mask, we need to normalize to work in 32-bit space */ - unsigned bytemask = pan_to_bytemask(dsize, nir_mask); - ins->dest_type = nir_type_uint | dsize; - mir_set_bytemask(ins, bytemask); + /* Once we have the NIR mask, we need to normalize to work in 32-bit space */ + unsigned bytemask = pan_to_bytemask(dsize, nir_mask); + ins->dest_type = nir_type_uint | dsize; + mir_set_bytemask(ins, bytemask); } /* Uniforms and UBOs use a shared code path, as uniforms are just (slightly * optimized) versions of UBO #0 */ static midgard_instruction * -emit_ubo_read( - compiler_context *ctx, - nir_instr *instr, - unsigned dest, - unsigned offset, - nir_src *indirect_offset, - unsigned indirect_shift, - unsigned index, - unsigned nr_comps) +emit_ubo_read(compiler_context *ctx, nir_instr *instr, unsigned dest, + unsigned offset, nir_src *indirect_offset, + unsigned indirect_shift, unsigned index, unsigned nr_comps) { - midgard_instruction ins; + midgard_instruction ins; - unsigned dest_size = (instr->type == nir_instr_type_intrinsic) ? - nir_dest_bit_size(nir_instr_as_intrinsic(instr)->dest) : 32; + unsigned dest_size = + (instr->type == nir_instr_type_intrinsic) + ? nir_dest_bit_size(nir_instr_as_intrinsic(instr)->dest) + : 32; - unsigned bitsize = dest_size * nr_comps; + unsigned bitsize = dest_size * nr_comps; - /* Pick the smallest intrinsic to avoid out-of-bounds reads */ - if (bitsize <= 8) - ins = m_ld_ubo_u8(dest, 0); - else if (bitsize <= 16) - ins = m_ld_ubo_u16(dest, 0); - else if (bitsize <= 32) - ins = m_ld_ubo_32(dest, 0); - else if (bitsize <= 64) - ins = m_ld_ubo_64(dest, 0); - else if (bitsize <= 128) - ins = m_ld_ubo_128(dest, 0); - else - unreachable("Invalid UBO read size"); + /* Pick the smallest intrinsic to avoid out-of-bounds reads */ + if (bitsize <= 8) + ins = m_ld_ubo_u8(dest, 0); + else if (bitsize <= 16) + ins = m_ld_ubo_u16(dest, 0); + else if (bitsize <= 32) + ins = m_ld_ubo_32(dest, 0); + else if (bitsize <= 64) + ins = m_ld_ubo_64(dest, 0); + else if (bitsize <= 128) + ins = m_ld_ubo_128(dest, 0); + else + unreachable("Invalid UBO read size"); - ins.constants.u32[0] = offset; + ins.constants.u32[0] = offset; - if (instr->type == nir_instr_type_intrinsic) - mir_set_intr_mask(instr, &ins, true); + if (instr->type == nir_instr_type_intrinsic) + mir_set_intr_mask(instr, &ins, true); - if (indirect_offset) { - ins.src[2] = nir_src_index(ctx, indirect_offset); - ins.src_types[2] = nir_type_uint32; - ins.load_store.index_shift = indirect_shift; + if (indirect_offset) { + ins.src[2] = nir_src_index(ctx, indirect_offset); + ins.src_types[2] = nir_type_uint32; + ins.load_store.index_shift = indirect_shift; - /* X component for the whole swizzle to prevent register - * pressure from ballooning from the extra components */ - for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[2]); ++i) - ins.swizzle[2][i] = 0; - } else { - ins.load_store.index_reg = REGISTER_LDST_ZERO; - } + /* X component for the whole swizzle to prevent register + * pressure from ballooning from the extra components */ + for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[2]); ++i) + ins.swizzle[2][i] = 0; + } else { + ins.load_store.index_reg = REGISTER_LDST_ZERO; + } - if (indirect_offset && indirect_offset->is_ssa && !indirect_shift) - mir_set_ubo_offset(&ins, indirect_offset, offset); + if (indirect_offset && indirect_offset->is_ssa && !indirect_shift) + mir_set_ubo_offset(&ins, indirect_offset, offset); - midgard_pack_ubo_index_imm(&ins.load_store, 
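emit_ubo_read above sizes the load by multiplying the destination bit size by the component count and picking the narrowest ld_ubo_* variant that still covers it, so small scalar reads cannot run past the end of the buffer. A compact sketch of that bucket selection, with a stand-in enum rather than the real opcode table:

/* Illustrative stand-ins for the m_ld_ubo_* builders used above */
enum ubo_load_kind { LD_UBO_U8, LD_UBO_U16, LD_UBO_32, LD_UBO_64, LD_UBO_128 };

static enum ubo_load_kind
pick_ubo_load(unsigned dest_bit_size, unsigned nr_comps)
{
   unsigned bitsize = dest_bit_size * nr_comps;

   if (bitsize <= 8)   return LD_UBO_U8;
   if (bitsize <= 16)  return LD_UBO_U16;
   if (bitsize <= 32)  return LD_UBO_32;
   if (bitsize <= 64)  return LD_UBO_64;
   if (bitsize <= 128) return LD_UBO_128;

   /* Larger reads are not expected; the real code unreachable()s here */
   return LD_UBO_128;
}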
index); + midgard_pack_ubo_index_imm(&ins.load_store, index); - return emit_mir_instruction(ctx, ins); + return emit_mir_instruction(ctx, ins); } /* Globals are like UBOs if you squint. And shared memory is like globals if * you squint even harder */ static void -emit_global( - compiler_context *ctx, - nir_instr *instr, - bool is_read, - unsigned srcdest, - nir_src *offset, - unsigned seg) +emit_global(compiler_context *ctx, nir_instr *instr, bool is_read, + unsigned srcdest, nir_src *offset, unsigned seg) { - midgard_instruction ins; + midgard_instruction ins; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (is_read) { - unsigned bitsize = nir_dest_bit_size(intr->dest) * - nir_dest_num_components(intr->dest); + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (is_read) { + unsigned bitsize = + nir_dest_bit_size(intr->dest) * nir_dest_num_components(intr->dest); - switch (bitsize) { - case 8: ins = m_ld_u8(srcdest, 0); break; - case 16: ins = m_ld_u16(srcdest, 0); break; - case 32: ins = m_ld_32(srcdest, 0); break; - case 64: ins = m_ld_64(srcdest, 0); break; - case 128: ins = m_ld_128(srcdest, 0); break; - default: unreachable("Invalid global read size"); - } + switch (bitsize) { + case 8: + ins = m_ld_u8(srcdest, 0); + break; + case 16: + ins = m_ld_u16(srcdest, 0); + break; + case 32: + ins = m_ld_32(srcdest, 0); + break; + case 64: + ins = m_ld_64(srcdest, 0); + break; + case 128: + ins = m_ld_128(srcdest, 0); + break; + default: + unreachable("Invalid global read size"); + } - mir_set_intr_mask(instr, &ins, is_read); + mir_set_intr_mask(instr, &ins, is_read); - /* For anything not aligned on 32bit, make sure we write full - * 32 bits registers. */ - if (bitsize & 31) { - unsigned comps_per_32b = 32 / nir_dest_bit_size(intr->dest); + /* For anything not aligned on 32bit, make sure we write full + * 32 bits registers. 
*/ + if (bitsize & 31) { + unsigned comps_per_32b = 32 / nir_dest_bit_size(intr->dest); - for (unsigned c = 0; c < 4 * comps_per_32b; c += comps_per_32b) { - if (!(ins.mask & BITFIELD_RANGE(c, comps_per_32b))) - continue; + for (unsigned c = 0; c < 4 * comps_per_32b; c += comps_per_32b) { + if (!(ins.mask & BITFIELD_RANGE(c, comps_per_32b))) + continue; - unsigned base = ~0; - for (unsigned i = 0; i < comps_per_32b; i++) { - if (ins.mask & BITFIELD_BIT(c + i)) { - base = ins.swizzle[0][c + i]; - break; - } - } + unsigned base = ~0; + for (unsigned i = 0; i < comps_per_32b; i++) { + if (ins.mask & BITFIELD_BIT(c + i)) { + base = ins.swizzle[0][c + i]; + break; + } + } - assert(base != ~0); + assert(base != ~0); - for (unsigned i = 0; i < comps_per_32b; i++) { - if (!(ins.mask & BITFIELD_BIT(c + i))) { - ins.swizzle[0][c + i] = base + i; - ins.mask |= BITFIELD_BIT(c + i); - } - assert(ins.swizzle[0][c + i] == base + i); - } - } + for (unsigned i = 0; i < comps_per_32b; i++) { + if (!(ins.mask & BITFIELD_BIT(c + i))) { + ins.swizzle[0][c + i] = base + i; + ins.mask |= BITFIELD_BIT(c + i); + } + assert(ins.swizzle[0][c + i] == base + i); + } + } + } + } else { + unsigned bitsize = + nir_src_bit_size(intr->src[0]) * nir_src_num_components(intr->src[0]); - } - } else { - unsigned bitsize = nir_src_bit_size(intr->src[0]) * - nir_src_num_components(intr->src[0]); + if (bitsize == 8) + ins = m_st_u8(srcdest, 0); + else if (bitsize == 16) + ins = m_st_u16(srcdest, 0); + else if (bitsize <= 32) + ins = m_st_32(srcdest, 0); + else if (bitsize <= 64) + ins = m_st_64(srcdest, 0); + else if (bitsize <= 128) + ins = m_st_128(srcdest, 0); + else + unreachable("Invalid global store size"); - if (bitsize == 8) - ins = m_st_u8(srcdest, 0); - else if (bitsize == 16) - ins = m_st_u16(srcdest, 0); - else if (bitsize <= 32) - ins = m_st_32(srcdest, 0); - else if (bitsize <= 64) - ins = m_st_64(srcdest, 0); - else if (bitsize <= 128) - ins = m_st_128(srcdest, 0); - else - unreachable("Invalid global store size"); + mir_set_intr_mask(instr, &ins, is_read); + } - mir_set_intr_mask(instr, &ins, is_read); - } + mir_set_offset(ctx, &ins, offset, seg); - mir_set_offset(ctx, &ins, offset, seg); + /* Set a valid swizzle for masked out components */ + assert(ins.mask); + unsigned first_component = __builtin_ffs(ins.mask) - 1; - /* Set a valid swizzle for masked out components */ - assert(ins.mask); - unsigned first_component = __builtin_ffs(ins.mask) - 1; + for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[0]); ++i) { + if (!(ins.mask & (1 << i))) + ins.swizzle[0][i] = first_component; + } - for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[0]); ++i) { - if (!(ins.mask & (1 << i))) - ins.swizzle[0][i] = first_component; - } - - emit_mir_instruction(ctx, ins); + emit_mir_instruction(ctx, ins); } /* If is_shared is off, the only other possible value are globals, since @@ -1360,1346 +1361,1362 @@ emit_global( * `image_direct_address` should be ~0 when instr is not an image_atomic * and the destination register of a lea_image op when it is an image_atomic. */ static void -emit_atomic( - compiler_context *ctx, - nir_intrinsic_instr *instr, - bool is_shared, - midgard_load_store_op op, - unsigned image_direct_address) +emit_atomic(compiler_context *ctx, nir_intrinsic_instr *instr, bool is_shared, + midgard_load_store_op op, unsigned image_direct_address) { - nir_alu_type type = - (op == midgard_op_atomic_imin || op == midgard_op_atomic_imax) ? 
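For 8- and 16-bit global reads, the code above widens the write mask so every touched 32-bit register is written in full, filling the swizzles of the missing components contiguously from the first enabled one in each chunk. A self-contained sketch of that fix-up, assuming a flat mask/swizzle pair instead of the midgard_instruction fields:

#include <assert.h>

static void
widen_to_32bit_chunks(unsigned *mask, unsigned swizzle[16],
                      unsigned comps_per_32b)
{
   for (unsigned c = 0; c < 4 * comps_per_32b; c += comps_per_32b) {
      unsigned chunk = ((1u << comps_per_32b) - 1) << c;
      if (!(*mask & chunk))
         continue;

      /* First enabled component of the chunk supplies the base swizzle */
      unsigned base = ~0u;
      for (unsigned i = 0; i < comps_per_32b; i++) {
         if (*mask & (1u << (c + i))) {
            base = swizzle[c + i];
            break;
         }
      }
      assert(base != ~0u);

      /* Enable the rest of the chunk with consecutive source components */
      for (unsigned i = 0; i < comps_per_32b; i++) {
         if (!(*mask & (1u << (c + i)))) {
            swizzle[c + i] = base + i;
            *mask |= 1u << (c + i);
         }
      }
   }
}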
- nir_type_int : nir_type_uint; + nir_alu_type type = + (op == midgard_op_atomic_imin || op == midgard_op_atomic_imax) + ? nir_type_int + : nir_type_uint; - bool is_image = image_direct_address != ~0; + bool is_image = image_direct_address != ~0; - unsigned dest = nir_dest_index(&instr->dest); - unsigned val_src = is_image ? 3 : 1; - unsigned val = nir_src_index(ctx, &instr->src[val_src]); - unsigned bitsize = nir_src_bit_size(instr->src[val_src]); - emit_explicit_constant(ctx, val, val); + unsigned dest = nir_dest_index(&instr->dest); + unsigned val_src = is_image ? 3 : 1; + unsigned val = nir_src_index(ctx, &instr->src[val_src]); + unsigned bitsize = nir_src_bit_size(instr->src[val_src]); + emit_explicit_constant(ctx, val, val); - midgard_instruction ins = { - .type = TAG_LOAD_STORE_4, - .mask = 0xF, - .dest = dest, - .src = { ~0, ~0, ~0, val, }, - .src_types = { 0, 0, 0, type | bitsize, }, - .op = op - }; + midgard_instruction ins = {.type = TAG_LOAD_STORE_4, + .mask = 0xF, + .dest = dest, + .src = + { + ~0, + ~0, + ~0, + val, + }, + .src_types = + { + 0, + 0, + 0, + type | bitsize, + }, + .op = op}; - nir_src *src_offset = nir_get_io_offset_src(instr); + nir_src *src_offset = nir_get_io_offset_src(instr); - if (op == midgard_op_atomic_cmpxchg) { - unsigned xchg_val_src = is_image ? 4 : 2; - unsigned xchg_val = nir_src_index(ctx, &instr->src[xchg_val_src]); - emit_explicit_constant(ctx, xchg_val, xchg_val); + if (op == midgard_op_atomic_cmpxchg) { + unsigned xchg_val_src = is_image ? 4 : 2; + unsigned xchg_val = nir_src_index(ctx, &instr->src[xchg_val_src]); + emit_explicit_constant(ctx, xchg_val, xchg_val); - ins.src[2] = val; - ins.src_types[2] = type | bitsize; - ins.src[3] = xchg_val; + ins.src[2] = val; + ins.src_types[2] = type | bitsize; + ins.src[3] = xchg_val; - if (is_shared) { - ins.load_store.arg_reg = REGISTER_LDST_LOCAL_STORAGE_PTR; - ins.load_store.arg_comp = COMPONENT_Z; - ins.load_store.bitsize_toggle = true; - } else { - for(unsigned i = 0; i < 2; ++i) - ins.swizzle[1][i] = i; + if (is_shared) { + ins.load_store.arg_reg = REGISTER_LDST_LOCAL_STORAGE_PTR; + ins.load_store.arg_comp = COMPONENT_Z; + ins.load_store.bitsize_toggle = true; + } else { + for (unsigned i = 0; i < 2; ++i) + ins.swizzle[1][i] = i; - ins.src[1] = is_image ? image_direct_address : - nir_src_index(ctx, src_offset); - ins.src_types[1] = nir_type_uint64; - } - } else if (is_image) { - for(unsigned i = 0; i < 2; ++i) - ins.swizzle[2][i] = i; + ins.src[1] = + is_image ? image_direct_address : nir_src_index(ctx, src_offset); + ins.src_types[1] = nir_type_uint64; + } + } else if (is_image) { + for (unsigned i = 0; i < 2; ++i) + ins.swizzle[2][i] = i; - ins.src[2] = image_direct_address; - ins.src_types[2] = nir_type_uint64; + ins.src[2] = image_direct_address; + ins.src_types[2] = nir_type_uint64; - ins.load_store.arg_reg = REGISTER_LDST_ZERO; - ins.load_store.bitsize_toggle = true; - ins.load_store.index_format = midgard_index_address_u64; - } else - mir_set_offset(ctx, &ins, src_offset, is_shared ? LDST_SHARED : LDST_GLOBAL); + ins.load_store.arg_reg = REGISTER_LDST_ZERO; + ins.load_store.bitsize_toggle = true; + ins.load_store.index_format = midgard_index_address_u64; + } else + mir_set_offset(ctx, &ins, src_offset, + is_shared ? 
LDST_SHARED : LDST_GLOBAL); - mir_set_intr_mask(&instr->instr, &ins, true); + mir_set_intr_mask(&instr->instr, &ins, true); - emit_mir_instruction(ctx, ins); + emit_mir_instruction(ctx, ins); } static void -emit_varying_read( - compiler_context *ctx, - unsigned dest, unsigned offset, - unsigned nr_comp, unsigned component, - nir_src *indirect_offset, nir_alu_type type, bool flat) +emit_varying_read(compiler_context *ctx, unsigned dest, unsigned offset, + unsigned nr_comp, unsigned component, + nir_src *indirect_offset, nir_alu_type type, bool flat) { - midgard_instruction ins = m_ld_vary_32(dest, PACK_LDST_ATTRIB_OFS(offset)); - ins.mask = mask_of(nr_comp); - ins.dest_type = type; + midgard_instruction ins = m_ld_vary_32(dest, PACK_LDST_ATTRIB_OFS(offset)); + ins.mask = mask_of(nr_comp); + ins.dest_type = type; - if (type == nir_type_float16) { - /* Ensure we are aligned so we can pack it later */ - ins.mask = mask_of(ALIGN_POT(nr_comp, 2)); - } + if (type == nir_type_float16) { + /* Ensure we are aligned so we can pack it later */ + ins.mask = mask_of(ALIGN_POT(nr_comp, 2)); + } - for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[0]); ++i) - ins.swizzle[0][i] = MIN2(i + component, COMPONENT_W); + for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[0]); ++i) + ins.swizzle[0][i] = MIN2(i + component, COMPONENT_W); - midgard_varying_params p = { - .flat_shading = flat, - .perspective_correction = 1, - .interpolate_sample = true, - }; - midgard_pack_varying_params(&ins.load_store, p); + midgard_varying_params p = { + .flat_shading = flat, + .perspective_correction = 1, + .interpolate_sample = true, + }; + midgard_pack_varying_params(&ins.load_store, p); - if (indirect_offset) { - ins.src[2] = nir_src_index(ctx, indirect_offset); - ins.src_types[2] = nir_type_uint32; - } else - ins.load_store.index_reg = REGISTER_LDST_ZERO; + if (indirect_offset) { + ins.src[2] = nir_src_index(ctx, indirect_offset); + ins.src_types[2] = nir_type_uint32; + } else + ins.load_store.index_reg = REGISTER_LDST_ZERO; - ins.load_store.arg_reg = REGISTER_LDST_ZERO; - ins.load_store.index_format = midgard_index_address_u32; + ins.load_store.arg_reg = REGISTER_LDST_ZERO; + ins.load_store.index_format = midgard_index_address_u32; - /* For flat shading, we always use .u32 and require 32-bit mode. For - * smooth shading, we use the appropriate floating-point type. - * - * This could be optimized, but it makes it easy to check correctness. - */ - if (flat) { - assert(nir_alu_type_get_type_size(type) == 32); - ins.op = midgard_op_ld_vary_32u; - } else { - assert(nir_alu_type_get_base_type(type) == nir_type_float); + /* For flat shading, we always use .u32 and require 32-bit mode. For + * smooth shading, we use the appropriate floating-point type. + * + * This could be optimized, but it makes it easy to check correctness. + */ + if (flat) { + assert(nir_alu_type_get_type_size(type) == 32); + ins.op = midgard_op_ld_vary_32u; + } else { + assert(nir_alu_type_get_base_type(type) == nir_type_float); - ins.op = (nir_alu_type_get_type_size(type) == 32) ? - midgard_op_ld_vary_32 : - midgard_op_ld_vary_16; - } + ins.op = (nir_alu_type_get_type_size(type) == 32) ? midgard_op_ld_vary_32 + : midgard_op_ld_vary_16; + } - emit_mir_instruction(ctx, ins); + emit_mir_instruction(ctx, ins); } - -/* If `is_atomic` is true, we emit a `lea_image` since midgard doesn't not have special - * image_atomic opcodes. The caller can then use that address to emit a normal atomic opcode. 
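emit_varying_read above always loads flat inputs as 32-bit integers and selects the 16- or 32-bit float load for smooth inputs from the NIR type size. A small illustrative helper capturing that choice (the enum stands in for the midgard_op_ld_vary_* opcodes):

#include <stdbool.h>

enum vary_op { LD_VARY_32U, LD_VARY_32, LD_VARY_16 };

static enum vary_op
pick_vary_op(bool is_flat, unsigned type_size_bits)
{
   /* Flat shading requires 32-bit mode and the unsigned-integer variant */
   if (is_flat)
      return LD_VARY_32U;

   /* Smooth inputs are float; pick the width matching the NIR type */
   return (type_size_bits == 32) ? LD_VARY_32 : LD_VARY_16;
}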
*/ +/* If `is_atomic` is true, we emit a `lea_image` since midgard doesn't not have + * special image_atomic opcodes. The caller can then use that address to emit a + * normal atomic opcode. */ static midgard_instruction emit_image_op(compiler_context *ctx, nir_intrinsic_instr *instr, bool is_atomic) { - enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); - unsigned nr_attr = ctx->stage == MESA_SHADER_VERTEX ? - util_bitcount64(ctx->nir->info.inputs_read) : 0; - unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim); - bool is_array = nir_intrinsic_image_array(instr); - bool is_store = instr->intrinsic == nir_intrinsic_image_store; + enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); + unsigned nr_attr = ctx->stage == MESA_SHADER_VERTEX + ? util_bitcount64(ctx->nir->info.inputs_read) + : 0; + unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim); + bool is_array = nir_intrinsic_image_array(instr); + bool is_store = instr->intrinsic == nir_intrinsic_image_store; - /* TODO: MSAA */ - assert(dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported"); + /* TODO: MSAA */ + assert(dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported"); - unsigned coord_reg = nir_src_index(ctx, &instr->src[1]); - emit_explicit_constant(ctx, coord_reg, coord_reg); + unsigned coord_reg = nir_src_index(ctx, &instr->src[1]); + emit_explicit_constant(ctx, coord_reg, coord_reg); - nir_src *index = &instr->src[0]; - bool is_direct = nir_src_is_const(*index); + nir_src *index = &instr->src[0]; + bool is_direct = nir_src_is_const(*index); - /* For image opcodes, address is used as an index into the attribute descriptor */ - unsigned address = nr_attr; - if (is_direct) - address += nir_src_as_uint(*index); + /* For image opcodes, address is used as an index into the attribute + * descriptor */ + unsigned address = nr_attr; + if (is_direct) + address += nir_src_as_uint(*index); - midgard_instruction ins; - if (is_store) { /* emit st_image_* */ - unsigned val = nir_src_index(ctx, &instr->src[3]); - emit_explicit_constant(ctx, val, val); + midgard_instruction ins; + if (is_store) { /* emit st_image_* */ + unsigned val = nir_src_index(ctx, &instr->src[3]); + emit_explicit_constant(ctx, val, val); - nir_alu_type type = nir_intrinsic_src_type(instr); - ins = st_image(type, val, PACK_LDST_ATTRIB_OFS(address)); - nir_alu_type base_type = nir_alu_type_get_base_type(type); - ins.src_types[0] = base_type | nir_src_bit_size(instr->src[3]); - } else if (is_atomic) { /* emit lea_image */ - unsigned dest = make_compiler_temp_reg(ctx); - ins = m_lea_image(dest, PACK_LDST_ATTRIB_OFS(address)); - ins.mask = mask_of(2); /* 64-bit memory address */ - } else { /* emit ld_image_* */ - nir_alu_type type = nir_intrinsic_dest_type(instr); - ins = ld_image(type, nir_dest_index(&instr->dest), PACK_LDST_ATTRIB_OFS(address)); - ins.mask = mask_of(nir_intrinsic_dest_components(instr)); - ins.dest_type = type; - } + nir_alu_type type = nir_intrinsic_src_type(instr); + ins = st_image(type, val, PACK_LDST_ATTRIB_OFS(address)); + nir_alu_type base_type = nir_alu_type_get_base_type(type); + ins.src_types[0] = base_type | nir_src_bit_size(instr->src[3]); + } else if (is_atomic) { /* emit lea_image */ + unsigned dest = make_compiler_temp_reg(ctx); + ins = m_lea_image(dest, PACK_LDST_ATTRIB_OFS(address)); + ins.mask = mask_of(2); /* 64-bit memory address */ + } else { /* emit ld_image_* */ + nir_alu_type type = nir_intrinsic_dest_type(instr); + ins = ld_image(type, nir_dest_index(&instr->dest), + 
PACK_LDST_ATTRIB_OFS(address)); + ins.mask = mask_of(nir_intrinsic_dest_components(instr)); + ins.dest_type = type; + } - /* Coord reg */ - ins.src[1] = coord_reg; - ins.src_types[1] = nir_type_uint16; - if (nr_dim == 3 || is_array) { - ins.load_store.bitsize_toggle = true; - } + /* Coord reg */ + ins.src[1] = coord_reg; + ins.src_types[1] = nir_type_uint16; + if (nr_dim == 3 || is_array) { + ins.load_store.bitsize_toggle = true; + } - /* Image index reg */ - if (!is_direct) { - ins.src[2] = nir_src_index(ctx, index); - ins.src_types[2] = nir_type_uint32; - } else - ins.load_store.index_reg = REGISTER_LDST_ZERO; + /* Image index reg */ + if (!is_direct) { + ins.src[2] = nir_src_index(ctx, index); + ins.src_types[2] = nir_type_uint32; + } else + ins.load_store.index_reg = REGISTER_LDST_ZERO; - emit_mir_instruction(ctx, ins); + emit_mir_instruction(ctx, ins); - return ins; + return ins; } static void -emit_attr_read( - compiler_context *ctx, - unsigned dest, unsigned offset, - unsigned nr_comp, nir_alu_type t) +emit_attr_read(compiler_context *ctx, unsigned dest, unsigned offset, + unsigned nr_comp, nir_alu_type t) { - midgard_instruction ins = m_ld_attr_32(dest, PACK_LDST_ATTRIB_OFS(offset)); - ins.load_store.arg_reg = REGISTER_LDST_ZERO; - ins.load_store.index_reg = REGISTER_LDST_ZERO; - ins.mask = mask_of(nr_comp); + midgard_instruction ins = m_ld_attr_32(dest, PACK_LDST_ATTRIB_OFS(offset)); + ins.load_store.arg_reg = REGISTER_LDST_ZERO; + ins.load_store.index_reg = REGISTER_LDST_ZERO; + ins.mask = mask_of(nr_comp); - /* Use the type appropriate load */ - switch (t) { - case nir_type_uint: - case nir_type_bool: - ins.op = midgard_op_ld_attr_32u; - break; - case nir_type_int: - ins.op = midgard_op_ld_attr_32i; - break; - case nir_type_float: - ins.op = midgard_op_ld_attr_32; - break; - default: - unreachable("Attempted to load unknown type"); - break; - } + /* Use the type appropriate load */ + switch (t) { + case nir_type_uint: + case nir_type_bool: + ins.op = midgard_op_ld_attr_32u; + break; + case nir_type_int: + ins.op = midgard_op_ld_attr_32i; + break; + case nir_type_float: + ins.op = midgard_op_ld_attr_32; + break; + default: + unreachable("Attempted to load unknown type"); + break; + } - emit_mir_instruction(ctx, ins); + emit_mir_instruction(ctx, ins); } static void emit_sysval_read(compiler_context *ctx, nir_instr *instr, - unsigned nr_components, unsigned offset) + unsigned nr_components, unsigned offset) { - nir_dest nir_dest; + nir_dest nir_dest; - /* Figure out which uniform this is */ - unsigned sysval_ubo = ctx->inputs->fixed_sysval_ubo >= 0 ? - ctx->inputs->fixed_sysval_ubo : - ctx->nir->info.num_ubos; - int sysval = panfrost_sysval_for_instr(instr, &nir_dest); - unsigned dest = nir_dest_index(&nir_dest); - unsigned uniform = - pan_lookup_sysval(ctx->sysval_to_id, &ctx->info->sysvals, sysval); + /* Figure out which uniform this is */ + unsigned sysval_ubo = ctx->inputs->fixed_sysval_ubo >= 0 + ? 
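Image opcodes above address the attribute descriptor table, so the image index is biased by the number of vertex attributes already occupying that table, and a constant NIR index is folded straight into the address while an indirect index is passed through a register instead. A trivial sketch of that computation (names are placeholders):

#include <stdbool.h>

static unsigned
image_attrib_address(unsigned nr_vertex_attribs, bool index_is_const,
                     unsigned const_index)
{
   /* Images follow the vertex attributes in the descriptor table */
   unsigned address = nr_vertex_attribs;

   /* A constant index is baked into the address; an indirect one stays in
    * a register and the base address is left at nr_vertex_attribs */
   if (index_is_const)
      address += const_index;

   return address;
}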
ctx->inputs->fixed_sysval_ubo + : ctx->nir->info.num_ubos; + int sysval = panfrost_sysval_for_instr(instr, &nir_dest); + unsigned dest = nir_dest_index(&nir_dest); + unsigned uniform = + pan_lookup_sysval(ctx->sysval_to_id, &ctx->info->sysvals, sysval); - /* Emit the read itself -- this is never indirect */ - midgard_instruction *ins = - emit_ubo_read(ctx, instr, dest, (uniform * 16) + offset, NULL, 0, - sysval_ubo, nr_components); + /* Emit the read itself -- this is never indirect */ + midgard_instruction *ins = + emit_ubo_read(ctx, instr, dest, (uniform * 16) + offset, NULL, 0, + sysval_ubo, nr_components); - ins->mask = mask_of(nr_components); + ins->mask = mask_of(nr_components); } static unsigned compute_builtin_arg(nir_intrinsic_op op) { - switch (op) { - case nir_intrinsic_load_workgroup_id: - return REGISTER_LDST_GROUP_ID; - case nir_intrinsic_load_local_invocation_id: - return REGISTER_LDST_LOCAL_THREAD_ID; - case nir_intrinsic_load_global_invocation_id: - case nir_intrinsic_load_global_invocation_id_zero_base: - return REGISTER_LDST_GLOBAL_THREAD_ID; - default: - unreachable("Invalid compute paramater loaded"); - } + switch (op) { + case nir_intrinsic_load_workgroup_id: + return REGISTER_LDST_GROUP_ID; + case nir_intrinsic_load_local_invocation_id: + return REGISTER_LDST_LOCAL_THREAD_ID; + case nir_intrinsic_load_global_invocation_id: + case nir_intrinsic_load_global_invocation_id_zero_base: + return REGISTER_LDST_GLOBAL_THREAD_ID; + default: + unreachable("Invalid compute paramater loaded"); + } } static void -emit_fragment_store(compiler_context *ctx, unsigned src, unsigned src_z, unsigned src_s, - enum midgard_rt_id rt, unsigned sample_iter) +emit_fragment_store(compiler_context *ctx, unsigned src, unsigned src_z, + unsigned src_s, enum midgard_rt_id rt, unsigned sample_iter) { - assert(rt < ARRAY_SIZE(ctx->writeout_branch)); - assert(sample_iter < ARRAY_SIZE(ctx->writeout_branch[0])); + assert(rt < ARRAY_SIZE(ctx->writeout_branch)); + assert(sample_iter < ARRAY_SIZE(ctx->writeout_branch[0])); - midgard_instruction *br = ctx->writeout_branch[rt][sample_iter]; + midgard_instruction *br = ctx->writeout_branch[rt][sample_iter]; - assert(!br); + assert(!br); - emit_explicit_constant(ctx, src, src); + emit_explicit_constant(ctx, src, src); - struct midgard_instruction ins = - v_branch(false, false); + struct midgard_instruction ins = v_branch(false, false); - bool depth_only = (rt == MIDGARD_ZS_RT); + bool depth_only = (rt == MIDGARD_ZS_RT); - ins.writeout = depth_only ? 0 : PAN_WRITEOUT_C; + ins.writeout = depth_only ? 
0 : PAN_WRITEOUT_C; - /* Add dependencies */ - ins.src[0] = src; - ins.src_types[0] = nir_type_uint32; + /* Add dependencies */ + ins.src[0] = src; + ins.src_types[0] = nir_type_uint32; - if (depth_only) - ins.constants.u32[0] = 0xFF; - else - ins.constants.u32[0] = ((rt - MIDGARD_COLOR_RT0) << 8) | sample_iter; + if (depth_only) + ins.constants.u32[0] = 0xFF; + else + ins.constants.u32[0] = ((rt - MIDGARD_COLOR_RT0) << 8) | sample_iter; - for (int i = 0; i < 4; ++i) - ins.swizzle[0][i] = i; + for (int i = 0; i < 4; ++i) + ins.swizzle[0][i] = i; - if (~src_z) { - emit_explicit_constant(ctx, src_z, src_z); - ins.src[2] = src_z; - ins.src_types[2] = nir_type_uint32; - ins.writeout |= PAN_WRITEOUT_Z; - } - if (~src_s) { - emit_explicit_constant(ctx, src_s, src_s); - ins.src[3] = src_s; - ins.src_types[3] = nir_type_uint32; - ins.writeout |= PAN_WRITEOUT_S; - } + if (~src_z) { + emit_explicit_constant(ctx, src_z, src_z); + ins.src[2] = src_z; + ins.src_types[2] = nir_type_uint32; + ins.writeout |= PAN_WRITEOUT_Z; + } + if (~src_s) { + emit_explicit_constant(ctx, src_s, src_s); + ins.src[3] = src_s; + ins.src_types[3] = nir_type_uint32; + ins.writeout |= PAN_WRITEOUT_S; + } - /* Emit the branch */ - br = emit_mir_instruction(ctx, ins); - schedule_barrier(ctx); - ctx->writeout_branch[rt][sample_iter] = br; + /* Emit the branch */ + br = emit_mir_instruction(ctx, ins); + schedule_barrier(ctx); + ctx->writeout_branch[rt][sample_iter] = br; - /* Push our current location = current block count - 1 = where we'll - * jump to. Maybe a bit too clever for my own good */ + /* Push our current location = current block count - 1 = where we'll + * jump to. Maybe a bit too clever for my own good */ - br->branch.target_block = ctx->block_count - 1; + br->branch.target_block = ctx->block_count - 1; } static void emit_compute_builtin(compiler_context *ctx, nir_intrinsic_instr *instr) { - unsigned reg = nir_dest_index(&instr->dest); - midgard_instruction ins = m_ldst_mov(reg, 0); - ins.mask = mask_of(3); - ins.swizzle[0][3] = COMPONENT_X; /* xyzx */ - ins.load_store.arg_reg = compute_builtin_arg(instr->intrinsic); - emit_mir_instruction(ctx, ins); + unsigned reg = nir_dest_index(&instr->dest); + midgard_instruction ins = m_ldst_mov(reg, 0); + ins.mask = mask_of(3); + ins.swizzle[0][3] = COMPONENT_X; /* xyzx */ + ins.load_store.arg_reg = compute_builtin_arg(instr->intrinsic); + emit_mir_instruction(ctx, ins); } static unsigned vertex_builtin_arg(nir_intrinsic_op op) { - switch (op) { - case nir_intrinsic_load_vertex_id_zero_base: - return PAN_VERTEX_ID; - case nir_intrinsic_load_instance_id: - return PAN_INSTANCE_ID; - default: - unreachable("Invalid vertex builtin"); - } + switch (op) { + case nir_intrinsic_load_vertex_id_zero_base: + return PAN_VERTEX_ID; + case nir_intrinsic_load_instance_id: + return PAN_INSTANCE_ID; + default: + unreachable("Invalid vertex builtin"); + } } static void emit_vertex_builtin(compiler_context *ctx, nir_intrinsic_instr *instr) { - unsigned reg = nir_dest_index(&instr->dest); - emit_attr_read(ctx, reg, vertex_builtin_arg(instr->intrinsic), 1, nir_type_int); + unsigned reg = nir_dest_index(&instr->dest); + emit_attr_read(ctx, reg, vertex_builtin_arg(instr->intrinsic), 1, + nir_type_int); } static void emit_special(compiler_context *ctx, nir_intrinsic_instr *instr, unsigned idx) { - unsigned reg = nir_dest_index(&instr->dest); + unsigned reg = nir_dest_index(&instr->dest); - midgard_instruction ld = m_ld_tilebuffer_raw(reg, 0); - ld.op = midgard_op_ld_special_32u; - 
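The writeout branch built above encodes its target in the embedded constant: 0xFF for a depth/stencil-only writeout, otherwise the render-target index (relative to MIDGARD_COLOR_RT0) in bits 8 and up with the sample iteration in the low byte. A sketch of that packing, taken from the code above rather than from hardware documentation:

#include <stdbool.h>
#include <stdint.h>

static uint32_t
writeout_constant(bool depth_only, unsigned rt_index, unsigned sample_iter)
{
   /* Depth/stencil-only writeout uses the fixed 0xFF marker */
   if (depth_only)
      return 0xFF;

   /* rt_index here is already rt - MIDGARD_COLOR_RT0 */
   return ((uint32_t)rt_index << 8) | sample_iter;
}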
ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(idx); - ld.load_store.index_reg = REGISTER_LDST_ZERO; + midgard_instruction ld = m_ld_tilebuffer_raw(reg, 0); + ld.op = midgard_op_ld_special_32u; + ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(idx); + ld.load_store.index_reg = REGISTER_LDST_ZERO; - for (int i = 0; i < 4; ++i) - ld.swizzle[0][i] = COMPONENT_X; + for (int i = 0; i < 4; ++i) + ld.swizzle[0][i] = COMPONENT_X; - emit_mir_instruction(ctx, ld); + emit_mir_instruction(ctx, ld); } static void emit_control_barrier(compiler_context *ctx) { - midgard_instruction ins = { - .type = TAG_TEXTURE_4, - .dest = ~0, - .src = { ~0, ~0, ~0, ~0 }, - .op = midgard_tex_op_barrier, - }; + midgard_instruction ins = { + .type = TAG_TEXTURE_4, + .dest = ~0, + .src = {~0, ~0, ~0, ~0}, + .op = midgard_tex_op_barrier, + }; - emit_mir_instruction(ctx, ins); + emit_mir_instruction(ctx, ins); } static unsigned mir_get_branch_cond(nir_src *src, bool *invert) { - /* Wrap it. No swizzle since it's a scalar */ + /* Wrap it. No swizzle since it's a scalar */ - nir_alu_src alu = { - .src = *src - }; + nir_alu_src alu = {.src = *src}; - *invert = pan_has_source_mod(&alu, nir_op_inot); - return nir_src_index(NULL, &alu.src); + *invert = pan_has_source_mod(&alu, nir_op_inot); + return nir_src_index(NULL, &alu.src); } static uint8_t output_load_rt_addr(compiler_context *ctx, nir_intrinsic_instr *instr) { - if (ctx->inputs->is_blend) - return MIDGARD_COLOR_RT0 + ctx->inputs->blend.rt; + if (ctx->inputs->is_blend) + return MIDGARD_COLOR_RT0 + ctx->inputs->blend.rt; - unsigned loc = nir_intrinsic_io_semantics(instr).location; + unsigned loc = nir_intrinsic_io_semantics(instr).location; - if (loc >= FRAG_RESULT_DATA0) - return loc - FRAG_RESULT_DATA0; + if (loc >= FRAG_RESULT_DATA0) + return loc - FRAG_RESULT_DATA0; - if (loc == FRAG_RESULT_DEPTH) - return 0x1F; - if (loc == FRAG_RESULT_STENCIL) - return 0x1E; + if (loc == FRAG_RESULT_DEPTH) + return 0x1F; + if (loc == FRAG_RESULT_STENCIL) + return 0x1E; - unreachable("Invalid RT to load from"); + unreachable("Invalid RT to load from"); } static void emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) { - unsigned offset = 0, reg; - - switch (instr->intrinsic) { - case nir_intrinsic_discard_if: - case nir_intrinsic_discard: { - bool conditional = instr->intrinsic == nir_intrinsic_discard_if; - struct midgard_instruction discard = v_branch(conditional, false); - discard.branch.target_type = TARGET_DISCARD; - - if (conditional) { - discard.src[0] = mir_get_branch_cond(&instr->src[0], - &discard.branch.invert_conditional); - discard.src_types[0] = nir_type_uint32; - } - - emit_mir_instruction(ctx, discard); - schedule_barrier(ctx); - - break; - } - - case nir_intrinsic_image_load: - case nir_intrinsic_image_store: - emit_image_op(ctx, instr, false); - break; - - case nir_intrinsic_image_size: { - unsigned nr_comp = nir_intrinsic_dest_components(instr); - emit_sysval_read(ctx, &instr->instr, nr_comp, 0); - break; - } - - case nir_intrinsic_load_ubo: - case nir_intrinsic_load_global: - case nir_intrinsic_load_global_constant: - case nir_intrinsic_load_shared: - case nir_intrinsic_load_scratch: - case nir_intrinsic_load_input: - case nir_intrinsic_load_interpolated_input: { - bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo; - bool is_global = instr->intrinsic == nir_intrinsic_load_global || - instr->intrinsic == nir_intrinsic_load_global_constant; - bool is_shared = instr->intrinsic == nir_intrinsic_load_shared; - bool is_scratch = 
instr->intrinsic == nir_intrinsic_load_scratch; - bool is_flat = instr->intrinsic == nir_intrinsic_load_input; - bool is_interp = instr->intrinsic == nir_intrinsic_load_interpolated_input; - - /* Get the base type of the intrinsic */ - /* TODO: Infer type? Does it matter? */ - nir_alu_type t = - (is_interp) ? nir_type_float : - (is_flat) ? nir_intrinsic_dest_type(instr) : - nir_type_uint; - - t = nir_alu_type_get_base_type(t); - - if (!(is_ubo || is_global || is_scratch)) { - offset = nir_intrinsic_base(instr); - } - - unsigned nr_comp = nir_intrinsic_dest_components(instr); - - nir_src *src_offset = nir_get_io_offset_src(instr); - - bool direct = nir_src_is_const(*src_offset); - nir_src *indirect_offset = direct ? NULL : src_offset; - - if (direct) - offset += nir_src_as_uint(*src_offset); - - /* We may need to apply a fractional offset */ - int component = (is_flat || is_interp) ? - nir_intrinsic_component(instr) : 0; - reg = nir_dest_index(&instr->dest); - - if (is_ubo) { - nir_src index = instr->src[0]; - - /* TODO: Is indirect block number possible? */ - assert(nir_src_is_const(index)); - - uint32_t uindex = nir_src_as_uint(index); - emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, uindex, nr_comp); - } else if (is_global || is_shared || is_scratch) { - unsigned seg = is_global ? LDST_GLOBAL : (is_shared ? LDST_SHARED : LDST_SCRATCH); - emit_global(ctx, &instr->instr, true, reg, src_offset, seg); - } else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend) { - emit_varying_read(ctx, reg, offset, nr_comp, component, indirect_offset, t | nir_dest_bit_size(instr->dest), is_flat); - } else if (ctx->inputs->is_blend) { - /* ctx->blend_input will be precoloured to r0/r2, where - * the input is preloaded */ - - unsigned *input = offset ? &ctx->blend_src1 : &ctx->blend_input; - - if (*input == ~0) - *input = reg; - else - emit_mir_instruction(ctx, v_mov(*input, reg)); - } else if (ctx->stage == MESA_SHADER_VERTEX) { - emit_attr_read(ctx, reg, offset, nr_comp, t); - } else { - DBG("Unknown load\n"); - assert(0); - } - - break; - } - - /* Handled together with load_interpolated_input */ - case nir_intrinsic_load_barycentric_pixel: - case nir_intrinsic_load_barycentric_centroid: - case nir_intrinsic_load_barycentric_sample: - break; - - /* Reads 128-bit value raw off the tilebuffer during blending, tasty */ - - case nir_intrinsic_load_raw_output_pan: { - reg = nir_dest_index(&instr->dest); - - /* T720 and below use different blend opcodes with slightly - * different semantics than T760 and up */ - - midgard_instruction ld = m_ld_tilebuffer_raw(reg, 0); - - unsigned target = output_load_rt_addr(ctx, instr); - ld.load_store.index_comp = target & 0x3; - ld.load_store.index_reg = target >> 2; - - if (nir_src_is_const(instr->src[0])) { - unsigned sample = nir_src_as_uint(instr->src[0]); - ld.load_store.arg_comp = sample & 0x3; - ld.load_store.arg_reg = sample >> 2; - } else { - /* Enable sample index via register. 
*/ - ld.load_store.signed_offset |= 1; - ld.src[1] = nir_src_index(ctx, &instr->src[0]); - ld.src_types[1] = nir_type_int32; - } - - if (ctx->quirks & MIDGARD_OLD_BLEND) { - ld.op = midgard_op_ld_special_32u; - ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(16); - ld.load_store.index_reg = REGISTER_LDST_ZERO; - } - - emit_mir_instruction(ctx, ld); - break; - } - - case nir_intrinsic_load_output: { - reg = nir_dest_index(&instr->dest); - - unsigned bits = nir_dest_bit_size(instr->dest); - - midgard_instruction ld; - if (bits == 16) - ld = m_ld_tilebuffer_16f(reg, 0); - else - ld = m_ld_tilebuffer_32f(reg, 0); - - unsigned index = output_load_rt_addr(ctx, instr); - ld.load_store.index_comp = index & 0x3; - ld.load_store.index_reg = index >> 2; - - for (unsigned c = 4; c < 16; ++c) - ld.swizzle[0][c] = 0; - - if (ctx->quirks & MIDGARD_OLD_BLEND) { - if (bits == 16) - ld.op = midgard_op_ld_special_16f; - else - ld.op = midgard_op_ld_special_32f; - ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(1); - ld.load_store.index_reg = REGISTER_LDST_ZERO; - } - - emit_mir_instruction(ctx, ld); - break; - } - - case nir_intrinsic_store_output: - case nir_intrinsic_store_combined_output_pan: - assert(nir_src_is_const(instr->src[1]) && "no indirect outputs"); - - reg = nir_src_index(ctx, &instr->src[0]); - - if (ctx->stage == MESA_SHADER_FRAGMENT) { - bool combined = instr->intrinsic == - nir_intrinsic_store_combined_output_pan; - - enum midgard_rt_id rt; - - unsigned reg_z = ~0, reg_s = ~0, reg_2 = ~0; - unsigned writeout = PAN_WRITEOUT_C; - if (combined) { - writeout = nir_intrinsic_component(instr); - if (writeout & PAN_WRITEOUT_Z) - reg_z = nir_src_index(ctx, &instr->src[2]); - if (writeout & PAN_WRITEOUT_S) - reg_s = nir_src_index(ctx, &instr->src[3]); - if (writeout & PAN_WRITEOUT_2) - reg_2 = nir_src_index(ctx, &instr->src[4]); - } - - if (writeout & PAN_WRITEOUT_C) { - nir_io_semantics sem = nir_intrinsic_io_semantics(instr); - - rt = MIDGARD_COLOR_RT0 + - (sem.location - FRAG_RESULT_DATA0); - } else { - rt = MIDGARD_ZS_RT; - } - - /* Dual-source blend writeout is done by leaving the - * value in r2 for the blend shader to use. */ - if (~reg_2) { - if (instr->src[4].is_ssa) { - emit_explicit_constant(ctx, reg_2, reg_2); - - unsigned out = make_compiler_temp(ctx); - - midgard_instruction ins = v_mov(reg_2, out); - emit_mir_instruction(ctx, ins); - - ctx->blend_src1 = out; - } else { - ctx->blend_src1 = reg_2; - } - } - - emit_fragment_store(ctx, reg, reg_z, reg_s, rt, 0); - } else if (ctx->stage == MESA_SHADER_VERTEX) { - assert(instr->intrinsic == nir_intrinsic_store_output); - - /* We should have been vectorized, though we don't - * currently check that st_vary is emitted only once - * per slot (this is relevant, since there's not a mask - * parameter available on the store [set to 0 by the - * blob]). We do respect the component by adjusting the - * swizzle. If this is a constant source, we'll need to - * emit that explicitly. */ - - emit_explicit_constant(ctx, reg, reg); - - offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[1]); - - unsigned dst_component = nir_intrinsic_component(instr); - unsigned nr_comp = nir_src_num_components(instr->src[0]); - - /* ABI: Format controlled by the attribute descriptor. - * This simplifies flat shading, although it prevents - * certain (unimplemented) 16-bit optimizations. - * - * In particular, it lets the driver handle internal - * TGSI shaders that set flat in the VS but smooth in - * the FS. This matches our handling on Bifrost. 
- */ - bool auto32 = true; - assert(nir_alu_type_get_type_size(nir_intrinsic_src_type(instr)) == 32); - - /* ABI: varyings in the secondary attribute table */ - bool secondary_table = true; - - midgard_instruction st = m_st_vary_32(reg, PACK_LDST_ATTRIB_OFS(offset)); - st.load_store.arg_reg = REGISTER_LDST_ZERO; - st.load_store.index_reg = REGISTER_LDST_ZERO; - - /* Attribute instruction uses these 2-bits for the - * a32 and table bits, pack this specially. - */ - st.load_store.index_format = (auto32 ? (1 << 0) : 0) | - (secondary_table ? (1 << 1) : 0); - - /* nir_intrinsic_component(store_intr) encodes the - * destination component start. Source component offset - * adjustment is taken care of in - * install_registers_instr(), when offset_swizzle() is - * called. - */ - unsigned src_component = COMPONENT_X; - - assert(nr_comp > 0); - for (unsigned i = 0; i < ARRAY_SIZE(st.swizzle); ++i) { - st.swizzle[0][i] = src_component; - if (i >= dst_component && i < dst_component + nr_comp - 1) - src_component++; - } - - emit_mir_instruction(ctx, st); - } else { - DBG("Unknown store\n"); - assert(0); - } - - break; - - /* Special case of store_output for lowered blend shaders */ - case nir_intrinsic_store_raw_output_pan: - assert (ctx->stage == MESA_SHADER_FRAGMENT); - reg = nir_src_index(ctx, &instr->src[0]); - for (unsigned s = 0; s < ctx->blend_sample_iterations; s++) - emit_fragment_store(ctx, reg, ~0, ~0, - ctx->inputs->blend.rt + MIDGARD_COLOR_RT0, - s); - break; - - case nir_intrinsic_store_global: - case nir_intrinsic_store_shared: - case nir_intrinsic_store_scratch: - reg = nir_src_index(ctx, &instr->src[0]); - emit_explicit_constant(ctx, reg, reg); - - unsigned seg; - if (instr->intrinsic == nir_intrinsic_store_global) - seg = LDST_GLOBAL; - else if (instr->intrinsic == nir_intrinsic_store_shared) - seg = LDST_SHARED; - else - seg = LDST_SCRATCH; - - emit_global(ctx, &instr->instr, false, reg, &instr->src[1], seg); - break; - - case nir_intrinsic_load_ssbo_address: - case nir_intrinsic_load_xfb_address: - emit_sysval_read(ctx, &instr->instr, 2, 0); - break; - - case nir_intrinsic_load_first_vertex: - case nir_intrinsic_load_work_dim: - case nir_intrinsic_load_num_vertices: - emit_sysval_read(ctx, &instr->instr, 1, 0); - break; - - case nir_intrinsic_load_base_vertex: - emit_sysval_read(ctx, &instr->instr, 1, 4); - break; - - case nir_intrinsic_load_base_instance: - case nir_intrinsic_get_ssbo_size: - emit_sysval_read(ctx, &instr->instr, 1, 8); - break; - - case nir_intrinsic_load_sample_positions_pan: - emit_sysval_read(ctx, &instr->instr, 2, 0); - break; - - case nir_intrinsic_load_viewport_scale: - case nir_intrinsic_load_viewport_offset: - case nir_intrinsic_load_num_workgroups: - case nir_intrinsic_load_sampler_lod_parameters_pan: - case nir_intrinsic_load_workgroup_size: - emit_sysval_read(ctx, &instr->instr, 3, 0); - break; - - case nir_intrinsic_load_blend_const_color_rgba: - emit_sysval_read(ctx, &instr->instr, 4, 0); - break; - - case nir_intrinsic_load_workgroup_id: - case nir_intrinsic_load_local_invocation_id: - case nir_intrinsic_load_global_invocation_id: - case nir_intrinsic_load_global_invocation_id_zero_base: - emit_compute_builtin(ctx, instr); - break; - - case nir_intrinsic_load_vertex_id_zero_base: - case nir_intrinsic_load_instance_id: - emit_vertex_builtin(ctx, instr); - break; - - case nir_intrinsic_load_sample_mask_in: - emit_special(ctx, instr, 96); - break; - - case nir_intrinsic_load_sample_id: - emit_special(ctx, instr, 97); - break; - - /* Midgard doesn't seem 
to want special handling, though we do need to - * take care when scheduling to avoid incorrect reordering. - */ - case nir_intrinsic_memory_barrier: - case nir_intrinsic_memory_barrier_buffer: - case nir_intrinsic_memory_barrier_image: - case nir_intrinsic_memory_barrier_shared: - case nir_intrinsic_group_memory_barrier: - schedule_barrier(ctx); - break; - - case nir_intrinsic_control_barrier: - schedule_barrier(ctx); - emit_control_barrier(ctx); - schedule_barrier(ctx); - break; - - ATOMIC_CASE(ctx, instr, add, add); - ATOMIC_CASE(ctx, instr, and, and); - ATOMIC_CASE(ctx, instr, comp_swap, cmpxchg); - ATOMIC_CASE(ctx, instr, exchange, xchg); - ATOMIC_CASE(ctx, instr, imax, imax); - ATOMIC_CASE(ctx, instr, imin, imin); - ATOMIC_CASE(ctx, instr, or, or); - ATOMIC_CASE(ctx, instr, umax, umax); - ATOMIC_CASE(ctx, instr, umin, umin); - ATOMIC_CASE(ctx, instr, xor, xor); - - IMAGE_ATOMIC_CASE(ctx, instr, add, add); - IMAGE_ATOMIC_CASE(ctx, instr, and, and); - IMAGE_ATOMIC_CASE(ctx, instr, comp_swap, cmpxchg); - IMAGE_ATOMIC_CASE(ctx, instr, exchange, xchg); - IMAGE_ATOMIC_CASE(ctx, instr, imax, imax); - IMAGE_ATOMIC_CASE(ctx, instr, imin, imin); - IMAGE_ATOMIC_CASE(ctx, instr, or, or); - IMAGE_ATOMIC_CASE(ctx, instr, umax, umax); - IMAGE_ATOMIC_CASE(ctx, instr, umin, umin); - IMAGE_ATOMIC_CASE(ctx, instr, xor, xor); - - default: - fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name); - assert(0); - break; - } + unsigned offset = 0, reg; + + switch (instr->intrinsic) { + case nir_intrinsic_discard_if: + case nir_intrinsic_discard: { + bool conditional = instr->intrinsic == nir_intrinsic_discard_if; + struct midgard_instruction discard = v_branch(conditional, false); + discard.branch.target_type = TARGET_DISCARD; + + if (conditional) { + discard.src[0] = mir_get_branch_cond( + &instr->src[0], &discard.branch.invert_conditional); + discard.src_types[0] = nir_type_uint32; + } + + emit_mir_instruction(ctx, discard); + schedule_barrier(ctx); + + break; + } + + case nir_intrinsic_image_load: + case nir_intrinsic_image_store: + emit_image_op(ctx, instr, false); + break; + + case nir_intrinsic_image_size: { + unsigned nr_comp = nir_intrinsic_dest_components(instr); + emit_sysval_read(ctx, &instr->instr, nr_comp, 0); + break; + } + + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_global: + case nir_intrinsic_load_global_constant: + case nir_intrinsic_load_shared: + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_input: + case nir_intrinsic_load_interpolated_input: { + bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo; + bool is_global = instr->intrinsic == nir_intrinsic_load_global || + instr->intrinsic == nir_intrinsic_load_global_constant; + bool is_shared = instr->intrinsic == nir_intrinsic_load_shared; + bool is_scratch = instr->intrinsic == nir_intrinsic_load_scratch; + bool is_flat = instr->intrinsic == nir_intrinsic_load_input; + bool is_interp = + instr->intrinsic == nir_intrinsic_load_interpolated_input; + + /* Get the base type of the intrinsic */ + /* TODO: Infer type? Does it matter? */ + nir_alu_type t = (is_interp) ? nir_type_float + : (is_flat) ? nir_intrinsic_dest_type(instr) + : nir_type_uint; + + t = nir_alu_type_get_base_type(t); + + if (!(is_ubo || is_global || is_scratch)) { + offset = nir_intrinsic_base(instr); + } + + unsigned nr_comp = nir_intrinsic_dest_components(instr); + + nir_src *src_offset = nir_get_io_offset_src(instr); + + bool direct = nir_src_is_const(*src_offset); + nir_src *indirect_offset = direct ? 
NULL : src_offset; + + if (direct) + offset += nir_src_as_uint(*src_offset); + + /* We may need to apply a fractional offset */ + int component = + (is_flat || is_interp) ? nir_intrinsic_component(instr) : 0; + reg = nir_dest_index(&instr->dest); + + if (is_ubo) { + nir_src index = instr->src[0]; + + /* TODO: Is indirect block number possible? */ + assert(nir_src_is_const(index)); + + uint32_t uindex = nir_src_as_uint(index); + emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, + uindex, nr_comp); + } else if (is_global || is_shared || is_scratch) { + unsigned seg = + is_global ? LDST_GLOBAL : (is_shared ? LDST_SHARED : LDST_SCRATCH); + emit_global(ctx, &instr->instr, true, reg, src_offset, seg); + } else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend) { + emit_varying_read(ctx, reg, offset, nr_comp, component, + indirect_offset, t | nir_dest_bit_size(instr->dest), + is_flat); + } else if (ctx->inputs->is_blend) { + /* ctx->blend_input will be precoloured to r0/r2, where + * the input is preloaded */ + + unsigned *input = offset ? &ctx->blend_src1 : &ctx->blend_input; + + if (*input == ~0) + *input = reg; + else + emit_mir_instruction(ctx, v_mov(*input, reg)); + } else if (ctx->stage == MESA_SHADER_VERTEX) { + emit_attr_read(ctx, reg, offset, nr_comp, t); + } else { + DBG("Unknown load\n"); + assert(0); + } + + break; + } + + /* Handled together with load_interpolated_input */ + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_sample: + break; + + /* Reads 128-bit value raw off the tilebuffer during blending, tasty */ + + case nir_intrinsic_load_raw_output_pan: { + reg = nir_dest_index(&instr->dest); + + /* T720 and below use different blend opcodes with slightly + * different semantics than T760 and up */ + + midgard_instruction ld = m_ld_tilebuffer_raw(reg, 0); + + unsigned target = output_load_rt_addr(ctx, instr); + ld.load_store.index_comp = target & 0x3; + ld.load_store.index_reg = target >> 2; + + if (nir_src_is_const(instr->src[0])) { + unsigned sample = nir_src_as_uint(instr->src[0]); + ld.load_store.arg_comp = sample & 0x3; + ld.load_store.arg_reg = sample >> 2; + } else { + /* Enable sample index via register. 
*/ + ld.load_store.signed_offset |= 1; + ld.src[1] = nir_src_index(ctx, &instr->src[0]); + ld.src_types[1] = nir_type_int32; + } + + if (ctx->quirks & MIDGARD_OLD_BLEND) { + ld.op = midgard_op_ld_special_32u; + ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(16); + ld.load_store.index_reg = REGISTER_LDST_ZERO; + } + + emit_mir_instruction(ctx, ld); + break; + } + + case nir_intrinsic_load_output: { + reg = nir_dest_index(&instr->dest); + + unsigned bits = nir_dest_bit_size(instr->dest); + + midgard_instruction ld; + if (bits == 16) + ld = m_ld_tilebuffer_16f(reg, 0); + else + ld = m_ld_tilebuffer_32f(reg, 0); + + unsigned index = output_load_rt_addr(ctx, instr); + ld.load_store.index_comp = index & 0x3; + ld.load_store.index_reg = index >> 2; + + for (unsigned c = 4; c < 16; ++c) + ld.swizzle[0][c] = 0; + + if (ctx->quirks & MIDGARD_OLD_BLEND) { + if (bits == 16) + ld.op = midgard_op_ld_special_16f; + else + ld.op = midgard_op_ld_special_32f; + ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(1); + ld.load_store.index_reg = REGISTER_LDST_ZERO; + } + + emit_mir_instruction(ctx, ld); + break; + } + + case nir_intrinsic_store_output: + case nir_intrinsic_store_combined_output_pan: + assert(nir_src_is_const(instr->src[1]) && "no indirect outputs"); + + reg = nir_src_index(ctx, &instr->src[0]); + + if (ctx->stage == MESA_SHADER_FRAGMENT) { + bool combined = + instr->intrinsic == nir_intrinsic_store_combined_output_pan; + + enum midgard_rt_id rt; + + unsigned reg_z = ~0, reg_s = ~0, reg_2 = ~0; + unsigned writeout = PAN_WRITEOUT_C; + if (combined) { + writeout = nir_intrinsic_component(instr); + if (writeout & PAN_WRITEOUT_Z) + reg_z = nir_src_index(ctx, &instr->src[2]); + if (writeout & PAN_WRITEOUT_S) + reg_s = nir_src_index(ctx, &instr->src[3]); + if (writeout & PAN_WRITEOUT_2) + reg_2 = nir_src_index(ctx, &instr->src[4]); + } + + if (writeout & PAN_WRITEOUT_C) { + nir_io_semantics sem = nir_intrinsic_io_semantics(instr); + + rt = MIDGARD_COLOR_RT0 + (sem.location - FRAG_RESULT_DATA0); + } else { + rt = MIDGARD_ZS_RT; + } + + /* Dual-source blend writeout is done by leaving the + * value in r2 for the blend shader to use. */ + if (~reg_2) { + if (instr->src[4].is_ssa) { + emit_explicit_constant(ctx, reg_2, reg_2); + + unsigned out = make_compiler_temp(ctx); + + midgard_instruction ins = v_mov(reg_2, out); + emit_mir_instruction(ctx, ins); + + ctx->blend_src1 = out; + } else { + ctx->blend_src1 = reg_2; + } + } + + emit_fragment_store(ctx, reg, reg_z, reg_s, rt, 0); + } else if (ctx->stage == MESA_SHADER_VERTEX) { + assert(instr->intrinsic == nir_intrinsic_store_output); + + /* We should have been vectorized, though we don't + * currently check that st_vary is emitted only once + * per slot (this is relevant, since there's not a mask + * parameter available on the store [set to 0 by the + * blob]). We do respect the component by adjusting the + * swizzle. If this is a constant source, we'll need to + * emit that explicitly. */ + + emit_explicit_constant(ctx, reg, reg); + + offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[1]); + + unsigned dst_component = nir_intrinsic_component(instr); + unsigned nr_comp = nir_src_num_components(instr->src[0]); + + /* ABI: Format controlled by the attribute descriptor. + * This simplifies flat shading, although it prevents + * certain (unimplemented) 16-bit optimizations. + * + * In particular, it lets the driver handle internal + * TGSI shaders that set flat in the VS but smooth in + * the FS. This matches our handling on Bifrost. 
+ */ + bool auto32 = true; + assert(nir_alu_type_get_type_size(nir_intrinsic_src_type(instr)) == + 32); + + /* ABI: varyings in the secondary attribute table */ + bool secondary_table = true; + + midgard_instruction st = + m_st_vary_32(reg, PACK_LDST_ATTRIB_OFS(offset)); + st.load_store.arg_reg = REGISTER_LDST_ZERO; + st.load_store.index_reg = REGISTER_LDST_ZERO; + + /* Attribute instruction uses these 2-bits for the + * a32 and table bits, pack this specially. + */ + st.load_store.index_format = + (auto32 ? (1 << 0) : 0) | (secondary_table ? (1 << 1) : 0); + + /* nir_intrinsic_component(store_intr) encodes the + * destination component start. Source component offset + * adjustment is taken care of in + * install_registers_instr(), when offset_swizzle() is + * called. + */ + unsigned src_component = COMPONENT_X; + + assert(nr_comp > 0); + for (unsigned i = 0; i < ARRAY_SIZE(st.swizzle); ++i) { + st.swizzle[0][i] = src_component; + if (i >= dst_component && i < dst_component + nr_comp - 1) + src_component++; + } + + emit_mir_instruction(ctx, st); + } else { + DBG("Unknown store\n"); + assert(0); + } + + break; + + /* Special case of store_output for lowered blend shaders */ + case nir_intrinsic_store_raw_output_pan: + assert(ctx->stage == MESA_SHADER_FRAGMENT); + reg = nir_src_index(ctx, &instr->src[0]); + for (unsigned s = 0; s < ctx->blend_sample_iterations; s++) + emit_fragment_store(ctx, reg, ~0, ~0, + ctx->inputs->blend.rt + MIDGARD_COLOR_RT0, s); + break; + + case nir_intrinsic_store_global: + case nir_intrinsic_store_shared: + case nir_intrinsic_store_scratch: + reg = nir_src_index(ctx, &instr->src[0]); + emit_explicit_constant(ctx, reg, reg); + + unsigned seg; + if (instr->intrinsic == nir_intrinsic_store_global) + seg = LDST_GLOBAL; + else if (instr->intrinsic == nir_intrinsic_store_shared) + seg = LDST_SHARED; + else + seg = LDST_SCRATCH; + + emit_global(ctx, &instr->instr, false, reg, &instr->src[1], seg); + break; + + case nir_intrinsic_load_ssbo_address: + case nir_intrinsic_load_xfb_address: + emit_sysval_read(ctx, &instr->instr, 2, 0); + break; + + case nir_intrinsic_load_first_vertex: + case nir_intrinsic_load_work_dim: + case nir_intrinsic_load_num_vertices: + emit_sysval_read(ctx, &instr->instr, 1, 0); + break; + + case nir_intrinsic_load_base_vertex: + emit_sysval_read(ctx, &instr->instr, 1, 4); + break; + + case nir_intrinsic_load_base_instance: + case nir_intrinsic_get_ssbo_size: + emit_sysval_read(ctx, &instr->instr, 1, 8); + break; + + case nir_intrinsic_load_sample_positions_pan: + emit_sysval_read(ctx, &instr->instr, 2, 0); + break; + + case nir_intrinsic_load_viewport_scale: + case nir_intrinsic_load_viewport_offset: + case nir_intrinsic_load_num_workgroups: + case nir_intrinsic_load_sampler_lod_parameters_pan: + case nir_intrinsic_load_workgroup_size: + emit_sysval_read(ctx, &instr->instr, 3, 0); + break; + + case nir_intrinsic_load_blend_const_color_rgba: + emit_sysval_read(ctx, &instr->instr, 4, 0); + break; + + case nir_intrinsic_load_workgroup_id: + case nir_intrinsic_load_local_invocation_id: + case nir_intrinsic_load_global_invocation_id: + case nir_intrinsic_load_global_invocation_id_zero_base: + emit_compute_builtin(ctx, instr); + break; + + case nir_intrinsic_load_vertex_id_zero_base: + case nir_intrinsic_load_instance_id: + emit_vertex_builtin(ctx, instr); + break; + + case nir_intrinsic_load_sample_mask_in: + emit_special(ctx, instr, 96); + break; + + case nir_intrinsic_load_sample_id: + emit_special(ctx, instr, 97); + break; + + /* Midgard doesn't seem 
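The vertex store path above respects nir_intrinsic_component by advancing the source component only across the destination window, and separately packs the a32 and secondary-table ABI bits into index_format. A standalone sketch of the swizzle construction (the array width of 16 is assumed for illustration):

static void
build_st_vary_swizzle(unsigned swizzle[16], unsigned dst_component,
                      unsigned nr_comp)
{
   unsigned src_component = 0; /* COMPONENT_X */

   /* Components before the destination window keep reading X and are not
    * in the mask; inside the window the source advances X, Y, ...  e.g. a
    * 2-component store starting at .z yields the swizzle xxxy. */
   for (unsigned i = 0; i < 16; ++i) {
      swizzle[i] = src_component;
      if (i >= dst_component && i < dst_component + nr_comp - 1)
         src_component++;
   }
}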
to want special handling, though we do need to + * take care when scheduling to avoid incorrect reordering. + */ + case nir_intrinsic_memory_barrier: + case nir_intrinsic_memory_barrier_buffer: + case nir_intrinsic_memory_barrier_image: + case nir_intrinsic_memory_barrier_shared: + case nir_intrinsic_group_memory_barrier: + schedule_barrier(ctx); + break; + + case nir_intrinsic_control_barrier: + schedule_barrier(ctx); + emit_control_barrier(ctx); + schedule_barrier(ctx); + break; + + ATOMIC_CASE(ctx, instr, add, add); + ATOMIC_CASE(ctx, instr, and, and); + ATOMIC_CASE(ctx, instr, comp_swap, cmpxchg); + ATOMIC_CASE(ctx, instr, exchange, xchg); + ATOMIC_CASE(ctx, instr, imax, imax); + ATOMIC_CASE(ctx, instr, imin, imin); + ATOMIC_CASE(ctx, instr, or, or); + ATOMIC_CASE(ctx, instr, umax, umax); + ATOMIC_CASE(ctx, instr, umin, umin); + ATOMIC_CASE(ctx, instr, xor, xor); + + IMAGE_ATOMIC_CASE(ctx, instr, add, add); + IMAGE_ATOMIC_CASE(ctx, instr, and, and); + IMAGE_ATOMIC_CASE(ctx, instr, comp_swap, cmpxchg); + IMAGE_ATOMIC_CASE(ctx, instr, exchange, xchg); + IMAGE_ATOMIC_CASE(ctx, instr, imax, imax); + IMAGE_ATOMIC_CASE(ctx, instr, imin, imin); + IMAGE_ATOMIC_CASE(ctx, instr, or, or); + IMAGE_ATOMIC_CASE(ctx, instr, umax, umax); + IMAGE_ATOMIC_CASE(ctx, instr, umin, umin); + IMAGE_ATOMIC_CASE(ctx, instr, xor, xor); + + default: + fprintf(stderr, "Unhandled intrinsic %s\n", + nir_intrinsic_infos[instr->intrinsic].name); + assert(0); + break; + } } /* Returns dimension with 0 special casing cubemaps */ static unsigned midgard_tex_format(enum glsl_sampler_dim dim) { - switch (dim) { - case GLSL_SAMPLER_DIM_1D: - case GLSL_SAMPLER_DIM_BUF: - return 1; + switch (dim) { + case GLSL_SAMPLER_DIM_1D: + case GLSL_SAMPLER_DIM_BUF: + return 1; - case GLSL_SAMPLER_DIM_2D: - case GLSL_SAMPLER_DIM_MS: - case GLSL_SAMPLER_DIM_EXTERNAL: - case GLSL_SAMPLER_DIM_RECT: - return 2; + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_MS: + case GLSL_SAMPLER_DIM_EXTERNAL: + case GLSL_SAMPLER_DIM_RECT: + return 2; - case GLSL_SAMPLER_DIM_3D: - return 3; + case GLSL_SAMPLER_DIM_3D: + return 3; - case GLSL_SAMPLER_DIM_CUBE: - return 0; + case GLSL_SAMPLER_DIM_CUBE: + return 0; - default: - DBG("Unknown sampler dim type\n"); - assert(0); - return 0; - } + default: + DBG("Unknown sampler dim type\n"); + assert(0); + return 0; + } } /* Tries to attach an explicit LOD or bias as a constant. 
Returns whether this * was successful */ static bool -pan_attach_constant_bias( - compiler_context *ctx, - nir_src lod, - midgard_texture_word *word) +pan_attach_constant_bias(compiler_context *ctx, nir_src lod, + midgard_texture_word *word) { - /* To attach as constant, it has to *be* constant */ + /* To attach as constant, it has to *be* constant */ - if (!nir_src_is_const(lod)) - return false; + if (!nir_src_is_const(lod)) + return false; - float f = nir_src_as_float(lod); + float f = nir_src_as_float(lod); - /* Break into fixed-point */ - signed lod_int = f; - float lod_frac = f - lod_int; + /* Break into fixed-point */ + signed lod_int = f; + float lod_frac = f - lod_int; - /* Carry over negative fractions */ - if (lod_frac < 0.0) { - lod_int--; - lod_frac += 1.0; - } + /* Carry over negative fractions */ + if (lod_frac < 0.0) { + lod_int--; + lod_frac += 1.0; + } - /* Encode */ - word->bias = float_to_ubyte(lod_frac); - word->bias_int = lod_int; + /* Encode */ + word->bias = float_to_ubyte(lod_frac); + word->bias_int = lod_int; - return true; + return true; } static enum mali_texture_mode mdg_texture_mode(nir_tex_instr *instr) { - if (instr->op == nir_texop_tg4 && instr->is_shadow) - return TEXTURE_GATHER_SHADOW; - else if (instr->op == nir_texop_tg4) - return TEXTURE_GATHER_X + instr->component; - else if (instr->is_shadow) - return TEXTURE_SHADOW; - else - return TEXTURE_NORMAL; + if (instr->op == nir_texop_tg4 && instr->is_shadow) + return TEXTURE_GATHER_SHADOW; + else if (instr->op == nir_texop_tg4) + return TEXTURE_GATHER_X + instr->component; + else if (instr->is_shadow) + return TEXTURE_SHADOW; + else + return TEXTURE_NORMAL; } static void set_tex_coord(compiler_context *ctx, nir_tex_instr *instr, midgard_instruction *ins) { - int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord); + int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord); - assert(coord_idx >= 0); + assert(coord_idx >= 0); - int comparator_idx = nir_tex_instr_src_index(instr, nir_tex_src_comparator); - int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index); - assert(comparator_idx < 0 || ms_idx < 0); - int ms_or_comparator_idx = ms_idx >= 0 ? ms_idx : comparator_idx; + int comparator_idx = nir_tex_instr_src_index(instr, nir_tex_src_comparator); + int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index); + assert(comparator_idx < 0 || ms_idx < 0); + int ms_or_comparator_idx = ms_idx >= 0 ? ms_idx : comparator_idx; - unsigned coords = nir_src_index(ctx, &instr->src[coord_idx].src); + unsigned coords = nir_src_index(ctx, &instr->src[coord_idx].src); - emit_explicit_constant(ctx, coords, coords); + emit_explicit_constant(ctx, coords, coords); - ins->src_types[1] = nir_tex_instr_src_type(instr, coord_idx) | - nir_src_bit_size(instr->src[coord_idx].src); + ins->src_types[1] = nir_tex_instr_src_type(instr, coord_idx) | + nir_src_bit_size(instr->src[coord_idx].src); - unsigned nr_comps = instr->coord_components; - unsigned written_mask = 0, write_mask = 0; + unsigned nr_comps = instr->coord_components; + unsigned written_mask = 0, write_mask = 0; - /* Initialize all components to coord.x which is expected to always be - * present. Swizzle is updated below based on the texture dimension - * and extra attributes that are packed in the coordinate argument. - */ - for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) - ins->swizzle[1][c] = COMPONENT_X; + /* Initialize all components to coord.x which is expected to always be + * present. 
Swizzle is updated below based on the texture dimension + * and extra attributes that are packed in the coordinate argument. + */ + for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) + ins->swizzle[1][c] = COMPONENT_X; - /* Shadow ref value is part of the coordinates if there's no comparator - * source, in that case it's always placed in the last component. - * Midgard wants the ref value in coord.z. - */ - if (instr->is_shadow && comparator_idx < 0) { - ins->swizzle[1][COMPONENT_Z] = --nr_comps; - write_mask |= 1 << COMPONENT_Z; - } + /* Shadow ref value is part of the coordinates if there's no comparator + * source, in that case it's always placed in the last component. + * Midgard wants the ref value in coord.z. + */ + if (instr->is_shadow && comparator_idx < 0) { + ins->swizzle[1][COMPONENT_Z] = --nr_comps; + write_mask |= 1 << COMPONENT_Z; + } - /* The array index is the last component if there's no shadow ref value - * or second last if there's one. We already decremented the number of - * components to account for the shadow ref value above. - * Midgard wants the array index in coord.w. - */ - if (instr->is_array) { - ins->swizzle[1][COMPONENT_W] = --nr_comps; - write_mask |= 1 << COMPONENT_W; - } + /* The array index is the last component if there's no shadow ref value + * or second last if there's one. We already decremented the number of + * components to account for the shadow ref value above. + * Midgard wants the array index in coord.w. + */ + if (instr->is_array) { + ins->swizzle[1][COMPONENT_W] = --nr_comps; + write_mask |= 1 << COMPONENT_W; + } - if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { - /* texelFetch is undefined on samplerCube */ - assert(ins->op != midgard_tex_op_fetch); + if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { + /* texelFetch is undefined on samplerCube */ + assert(ins->op != midgard_tex_op_fetch); - ins->src[1] = make_compiler_temp_reg(ctx); + ins->src[1] = make_compiler_temp_reg(ctx); - /* For cubemaps, we use a special ld/st op to select the face - * and copy the xy into the texture register - */ - midgard_instruction ld = m_ld_cubemap_coords(ins->src[1], 0); - ld.src[1] = coords; - ld.src_types[1] = ins->src_types[1]; - ld.mask = 0x3; /* xy */ - ld.load_store.bitsize_toggle = true; - ld.swizzle[1][3] = COMPONENT_X; - emit_mir_instruction(ctx, ld); + /* For cubemaps, we use a special ld/st op to select the face + * and copy the xy into the texture register + */ + midgard_instruction ld = m_ld_cubemap_coords(ins->src[1], 0); + ld.src[1] = coords; + ld.src_types[1] = ins->src_types[1]; + ld.mask = 0x3; /* xy */ + ld.load_store.bitsize_toggle = true; + ld.swizzle[1][3] = COMPONENT_X; + emit_mir_instruction(ctx, ld); - /* We packed cube coordiates (X,Y,Z) into (X,Y), update the - * written mask accordingly and decrement the number of - * components - */ - nr_comps--; - written_mask |= 3; - } + /* We packed cube coordiates (X,Y,Z) into (X,Y), update the + * written mask accordingly and decrement the number of + * components + */ + nr_comps--; + written_mask |= 3; + } - /* Now flag tex coord components that have not been written yet */ - write_mask |= mask_of(nr_comps) & ~written_mask; - for (unsigned c = 0; c < nr_comps; c++) - ins->swizzle[1][c] = c; + /* Now flag tex coord components that have not been written yet */ + write_mask |= mask_of(nr_comps) & ~written_mask; + for (unsigned c = 0; c < nr_comps; c++) + ins->swizzle[1][c] = c; - /* Sample index and shadow ref are expected in coord.z */ - if (ms_or_comparator_idx >= 0) { - 
assert(!((write_mask | written_mask) & (1 << COMPONENT_Z))); + /* Sample index and shadow ref are expected in coord.z */ + if (ms_or_comparator_idx >= 0) { + assert(!((write_mask | written_mask) & (1 << COMPONENT_Z))); - unsigned sample_or_ref = - nir_src_index(ctx, &instr->src[ms_or_comparator_idx].src); + unsigned sample_or_ref = + nir_src_index(ctx, &instr->src[ms_or_comparator_idx].src); - emit_explicit_constant(ctx, sample_or_ref, sample_or_ref); + emit_explicit_constant(ctx, sample_or_ref, sample_or_ref); - if (ins->src[1] == ~0) - ins->src[1] = make_compiler_temp_reg(ctx); + if (ins->src[1] == ~0) + ins->src[1] = make_compiler_temp_reg(ctx); - midgard_instruction mov = v_mov(sample_or_ref, ins->src[1]); + midgard_instruction mov = v_mov(sample_or_ref, ins->src[1]); - for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) - mov.swizzle[1][c] = COMPONENT_X; + for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) + mov.swizzle[1][c] = COMPONENT_X; - mov.mask = 1 << COMPONENT_Z; - written_mask |= 1 << COMPONENT_Z; - ins->swizzle[1][COMPONENT_Z] = COMPONENT_Z; - emit_mir_instruction(ctx, mov); - } + mov.mask = 1 << COMPONENT_Z; + written_mask |= 1 << COMPONENT_Z; + ins->swizzle[1][COMPONENT_Z] = COMPONENT_Z; + emit_mir_instruction(ctx, mov); + } - /* Texelfetch coordinates uses all four elements (xyz/index) regardless - * of texture dimensionality, which means it's necessary to zero the - * unused components to keep everything happy. - */ - if (ins->op == midgard_tex_op_fetch && - (written_mask | write_mask) != 0xF) { - if (ins->src[1] == ~0) - ins->src[1] = make_compiler_temp_reg(ctx); + /* Texelfetch coordinates uses all four elements (xyz/index) regardless + * of texture dimensionality, which means it's necessary to zero the + * unused components to keep everything happy. 
+ */ + if (ins->op == midgard_tex_op_fetch && (written_mask | write_mask) != 0xF) { + if (ins->src[1] == ~0) + ins->src[1] = make_compiler_temp_reg(ctx); - /* mov index.zw, #0, or generalized */ - midgard_instruction mov = - v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), ins->src[1]); - mov.has_constants = true; - mov.mask = (written_mask | write_mask) ^ 0xF; - emit_mir_instruction(ctx, mov); - for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) { - if (mov.mask & (1 << c)) - ins->swizzle[1][c] = c; - } - } + /* mov index.zw, #0, or generalized */ + midgard_instruction mov = + v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), ins->src[1]); + mov.has_constants = true; + mov.mask = (written_mask | write_mask) ^ 0xF; + emit_mir_instruction(ctx, mov); + for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) { + if (mov.mask & (1 << c)) + ins->swizzle[1][c] = c; + } + } - if (ins->src[1] == ~0) { - /* No temporary reg created, use the src coords directly */ - ins->src[1] = coords; - } else if (write_mask) { - /* Move the remaining coordinates to the temporary reg */ - midgard_instruction mov = v_mov(coords, ins->src[1]); + if (ins->src[1] == ~0) { + /* No temporary reg created, use the src coords directly */ + ins->src[1] = coords; + } else if (write_mask) { + /* Move the remaining coordinates to the temporary reg */ + midgard_instruction mov = v_mov(coords, ins->src[1]); - for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) { - if ((1 << c) & write_mask) { - mov.swizzle[1][c] = ins->swizzle[1][c]; - ins->swizzle[1][c] = c; - } else { - mov.swizzle[1][c] = COMPONENT_X; - } - } + for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) { + if ((1 << c) & write_mask) { + mov.swizzle[1][c] = ins->swizzle[1][c]; + ins->swizzle[1][c] = c; + } else { + mov.swizzle[1][c] = COMPONENT_X; + } + } - mov.mask = write_mask; - emit_mir_instruction(ctx, mov); - } + mov.mask = write_mask; + emit_mir_instruction(ctx, mov); + } } static void emit_texop_native(compiler_context *ctx, nir_tex_instr *instr, unsigned midgard_texop) { - nir_dest *dest = &instr->dest; + nir_dest *dest = &instr->dest; - int texture_index = instr->texture_index; - int sampler_index = instr->sampler_index; + int texture_index = instr->texture_index; + int sampler_index = instr->sampler_index; - nir_alu_type dest_base = nir_alu_type_get_base_type(instr->dest_type); + nir_alu_type dest_base = nir_alu_type_get_base_type(instr->dest_type); - /* texture instructions support float outmods */ - unsigned outmod = midgard_outmod_none; - if (dest_base == nir_type_float) { - outmod = mir_determine_float_outmod(ctx, &dest, 0); - } + /* texture instructions support float outmods */ + unsigned outmod = midgard_outmod_none; + if (dest_base == nir_type_float) { + outmod = mir_determine_float_outmod(ctx, &dest, 0); + } - midgard_instruction ins = { - .type = TAG_TEXTURE_4, - .mask = 0xF, - .dest = nir_dest_index(dest), - .src = { ~0, ~0, ~0, ~0 }, - .dest_type = instr->dest_type, - .swizzle = SWIZZLE_IDENTITY_4, - .outmod = outmod, - .op = midgard_texop, - .texture = { - .format = midgard_tex_format(instr->sampler_dim), - .texture_handle = texture_index, - .sampler_handle = sampler_index, - .mode = mdg_texture_mode(instr), - } - }; + midgard_instruction ins = { + .type = TAG_TEXTURE_4, + .mask = 0xF, + .dest = nir_dest_index(dest), + .src = {~0, ~0, ~0, ~0}, + .dest_type = instr->dest_type, + .swizzle = SWIZZLE_IDENTITY_4, + .outmod = outmod, + .op = midgard_texop, + .texture = { + .format = midgard_tex_format(instr->sampler_dim), + .texture_handle = texture_index, + 
.sampler_handle = sampler_index, + .mode = mdg_texture_mode(instr), + }}; - if (instr->is_shadow && !instr->is_new_style_shadow && instr->op != nir_texop_tg4) - for (int i = 0; i < 4; ++i) - ins.swizzle[0][i] = COMPONENT_X; + if (instr->is_shadow && !instr->is_new_style_shadow && + instr->op != nir_texop_tg4) + for (int i = 0; i < 4; ++i) + ins.swizzle[0][i] = COMPONENT_X; - for (unsigned i = 0; i < instr->num_srcs; ++i) { - int index = nir_src_index(ctx, &instr->src[i].src); - unsigned sz = nir_src_bit_size(instr->src[i].src); - nir_alu_type T = nir_tex_instr_src_type(instr, i) | sz; + for (unsigned i = 0; i < instr->num_srcs; ++i) { + int index = nir_src_index(ctx, &instr->src[i].src); + unsigned sz = nir_src_bit_size(instr->src[i].src); + nir_alu_type T = nir_tex_instr_src_type(instr, i) | sz; - switch (instr->src[i].src_type) { - case nir_tex_src_coord: - set_tex_coord(ctx, instr, &ins); - break; + switch (instr->src[i].src_type) { + case nir_tex_src_coord: + set_tex_coord(ctx, instr, &ins); + break; - case nir_tex_src_bias: - case nir_tex_src_lod: { - /* Try as a constant if we can */ + case nir_tex_src_bias: + case nir_tex_src_lod: { + /* Try as a constant if we can */ - bool is_txf = midgard_texop == midgard_tex_op_fetch; - if (!is_txf && pan_attach_constant_bias(ctx, instr->src[i].src, &ins.texture)) - break; + bool is_txf = midgard_texop == midgard_tex_op_fetch; + if (!is_txf && + pan_attach_constant_bias(ctx, instr->src[i].src, &ins.texture)) + break; - ins.texture.lod_register = true; - ins.src[2] = index; - ins.src_types[2] = T; + ins.texture.lod_register = true; + ins.src[2] = index; + ins.src_types[2] = T; - for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) - ins.swizzle[2][c] = COMPONENT_X; + for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) + ins.swizzle[2][c] = COMPONENT_X; - emit_explicit_constant(ctx, index, index); + emit_explicit_constant(ctx, index, index); - break; - }; + break; + }; - case nir_tex_src_offset: { - ins.texture.offset_register = true; - ins.src[3] = index; - ins.src_types[3] = T; + case nir_tex_src_offset: { + ins.texture.offset_register = true; + ins.src[3] = index; + ins.src_types[3] = T; - for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) - ins.swizzle[3][c] = (c > COMPONENT_Z) ? 0 : c; + for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) + ins.swizzle[3][c] = (c > COMPONENT_Z) ? 
0 : c; - emit_explicit_constant(ctx, index, index); - break; - }; + emit_explicit_constant(ctx, index, index); + break; + }; - case nir_tex_src_comparator: - case nir_tex_src_ms_index: - /* Nothing to do, handled in set_tex_coord() */ - break; + case nir_tex_src_comparator: + case nir_tex_src_ms_index: + /* Nothing to do, handled in set_tex_coord() */ + break; - default: { - fprintf(stderr, "Unknown texture source type: %d\n", instr->src[i].src_type); - assert(0); - } - } - } + default: { + fprintf(stderr, "Unknown texture source type: %d\n", + instr->src[i].src_type); + assert(0); + } + } + } - emit_mir_instruction(ctx, ins); + emit_mir_instruction(ctx, ins); } static void emit_tex(compiler_context *ctx, nir_tex_instr *instr) { - switch (instr->op) { - case nir_texop_tex: - case nir_texop_txb: - emit_texop_native(ctx, instr, midgard_tex_op_normal); - break; - case nir_texop_txl: - case nir_texop_tg4: - emit_texop_native(ctx, instr, midgard_tex_op_gradient); - break; - case nir_texop_txf: - case nir_texop_txf_ms: - emit_texop_native(ctx, instr, midgard_tex_op_fetch); - break; - case nir_texop_txs: - emit_sysval_read(ctx, &instr->instr, 4, 0); - break; - default: { - fprintf(stderr, "Unhandled texture op: %d\n", instr->op); - assert(0); - } - } + switch (instr->op) { + case nir_texop_tex: + case nir_texop_txb: + emit_texop_native(ctx, instr, midgard_tex_op_normal); + break; + case nir_texop_txl: + case nir_texop_tg4: + emit_texop_native(ctx, instr, midgard_tex_op_gradient); + break; + case nir_texop_txf: + case nir_texop_txf_ms: + emit_texop_native(ctx, instr, midgard_tex_op_fetch); + break; + case nir_texop_txs: + emit_sysval_read(ctx, &instr->instr, 4, 0); + break; + default: { + fprintf(stderr, "Unhandled texture op: %d\n", instr->op); + assert(0); + } + } } static void emit_jump(compiler_context *ctx, nir_jump_instr *instr) { - switch (instr->type) { - case nir_jump_break: { - /* Emit a branch out of the loop */ - struct midgard_instruction br = v_branch(false, false); - br.branch.target_type = TARGET_BREAK; - br.branch.target_break = ctx->current_loop_depth; - emit_mir_instruction(ctx, br); - break; - } + switch (instr->type) { + case nir_jump_break: { + /* Emit a branch out of the loop */ + struct midgard_instruction br = v_branch(false, false); + br.branch.target_type = TARGET_BREAK; + br.branch.target_break = ctx->current_loop_depth; + emit_mir_instruction(ctx, br); + break; + } - default: - unreachable("Unhandled jump"); - } + default: + unreachable("Unhandled jump"); + } } static void emit_instr(compiler_context *ctx, struct nir_instr *instr) { - switch (instr->type) { - case nir_instr_type_load_const: - emit_load_const(ctx, nir_instr_as_load_const(instr)); - break; + switch (instr->type) { + case nir_instr_type_load_const: + emit_load_const(ctx, nir_instr_as_load_const(instr)); + break; - case nir_instr_type_intrinsic: - emit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); - break; + case nir_instr_type_intrinsic: + emit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); + break; - case nir_instr_type_alu: - emit_alu(ctx, nir_instr_as_alu(instr)); - break; + case nir_instr_type_alu: + emit_alu(ctx, nir_instr_as_alu(instr)); + break; - case nir_instr_type_tex: - emit_tex(ctx, nir_instr_as_tex(instr)); - break; + case nir_instr_type_tex: + emit_tex(ctx, nir_instr_as_tex(instr)); + break; - case nir_instr_type_jump: - emit_jump(ctx, nir_instr_as_jump(instr)); - break; + case nir_instr_type_jump: + emit_jump(ctx, nir_instr_as_jump(instr)); + break; - case nir_instr_type_ssa_undef: - /* 
Spurious */ - break; + case nir_instr_type_ssa_undef: + /* Spurious */ + break; - default: - DBG("Unhandled instruction type\n"); - break; - } + default: + DBG("Unhandled instruction type\n"); + break; + } } - /* ALU instructions can inline or embed constants, which decreases register * pressure and saves space. */ -#define CONDITIONAL_ATTACH(idx) { \ - void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->src[idx] + 1); \ -\ - if (entry) { \ - attach_constants(ctx, alu, entry, alu->src[idx] + 1); \ - alu->src[idx] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); \ - } \ -} +#define CONDITIONAL_ATTACH(idx) \ + { \ + void *entry = \ + _mesa_hash_table_u64_search(ctx->ssa_constants, alu->src[idx] + 1); \ + \ + if (entry) { \ + attach_constants(ctx, alu, entry, alu->src[idx] + 1); \ + alu->src[idx] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); \ + } \ + } static void inline_alu_constants(compiler_context *ctx, midgard_block *block) { - mir_foreach_instr_in_block(block, alu) { - /* Other instructions cannot inline constants */ - if (alu->type != TAG_ALU_4) continue; - if (alu->compact_branch) continue; + mir_foreach_instr_in_block(block, alu) { + /* Other instructions cannot inline constants */ + if (alu->type != TAG_ALU_4) + continue; + if (alu->compact_branch) + continue; - /* If there is already a constant here, we can do nothing */ - if (alu->has_constants) continue; + /* If there is already a constant here, we can do nothing */ + if (alu->has_constants) + continue; - CONDITIONAL_ATTACH(0); + CONDITIONAL_ATTACH(0); - if (!alu->has_constants) { - CONDITIONAL_ATTACH(1) - } else if (!alu->inline_constant) { - /* Corner case: _two_ vec4 constants, for instance with a - * csel. For this case, we can only use a constant - * register for one, we'll have to emit a move for the - * other. */ + if (!alu->has_constants) { + CONDITIONAL_ATTACH(1) + } else if (!alu->inline_constant) { + /* Corner case: _two_ vec4 constants, for instance with a + * csel. For this case, we can only use a constant + * register for one, we'll have to emit a move for the + * other. 
*/ - void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->src[1] + 1); - unsigned scratch = make_compiler_temp(ctx); + void *entry = + _mesa_hash_table_u64_search(ctx->ssa_constants, alu->src[1] + 1); + unsigned scratch = make_compiler_temp(ctx); - if (entry) { - midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), scratch); - attach_constants(ctx, &ins, entry, alu->src[1] + 1); + if (entry) { + midgard_instruction ins = + v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), scratch); + attach_constants(ctx, &ins, entry, alu->src[1] + 1); - /* Set the source */ - alu->src[1] = scratch; + /* Set the source */ + alu->src[1] = scratch; - /* Inject us -before- the last instruction which set r31 */ - mir_insert_instruction_before(ctx, mir_prev_op(alu), ins); - } - } - } + /* Inject us -before- the last instruction which set r31 */ + mir_insert_instruction_before(ctx, mir_prev_op(alu), ins); + } + } + } } unsigned max_bitsize_for_alu(midgard_instruction *ins) { - unsigned max_bitsize = 0; - for (int i = 0; i < MIR_SRC_COUNT; i++) { - if (ins->src[i] == ~0) continue; - unsigned src_bitsize = nir_alu_type_get_type_size(ins->src_types[i]); - max_bitsize = MAX2(src_bitsize, max_bitsize); - } - unsigned dst_bitsize = nir_alu_type_get_type_size(ins->dest_type); - max_bitsize = MAX2(dst_bitsize, max_bitsize); + unsigned max_bitsize = 0; + for (int i = 0; i < MIR_SRC_COUNT; i++) { + if (ins->src[i] == ~0) + continue; + unsigned src_bitsize = nir_alu_type_get_type_size(ins->src_types[i]); + max_bitsize = MAX2(src_bitsize, max_bitsize); + } + unsigned dst_bitsize = nir_alu_type_get_type_size(ins->dest_type); + max_bitsize = MAX2(dst_bitsize, max_bitsize); - /* We emulate 8-bit as 16-bit for simplicity of packing */ - max_bitsize = MAX2(max_bitsize, 16); + /* We emulate 8-bit as 16-bit for simplicity of packing */ + max_bitsize = MAX2(max_bitsize, 16); - /* We don't have fp16 LUTs, so we'll want to emit code like: - * - * vlut.fsinr hr0, hr0 - * - * where both input and output are 16-bit but the operation is carried - * out in 32-bit - */ + /* We don't have fp16 LUTs, so we'll want to emit code like: + * + * vlut.fsinr hr0, hr0 + * + * where both input and output are 16-bit but the operation is carried + * out in 32-bit + */ - switch (ins->op) { - case midgard_alu_op_fsqrt: - case midgard_alu_op_frcp: - case midgard_alu_op_frsqrt: - case midgard_alu_op_fsinpi: - case midgard_alu_op_fcospi: - case midgard_alu_op_fexp2: - case midgard_alu_op_flog2: - max_bitsize = MAX2(max_bitsize, 32); - break; + switch (ins->op) { + case midgard_alu_op_fsqrt: + case midgard_alu_op_frcp: + case midgard_alu_op_frsqrt: + case midgard_alu_op_fsinpi: + case midgard_alu_op_fcospi: + case midgard_alu_op_fexp2: + case midgard_alu_op_flog2: + max_bitsize = MAX2(max_bitsize, 32); + break; - default: - break; - } + default: + break; + } - /* High implies computing at a higher bitsize, e.g umul_high of 32-bit - * requires computing at 64-bit */ - if (midgard_is_integer_out_op(ins->op) && ins->outmod == midgard_outmod_keephi) { - max_bitsize *= 2; - assert(max_bitsize <= 64); - } + /* High implies computing at a higher bitsize, e.g umul_high of 32-bit + * requires computing at 64-bit */ + if (midgard_is_integer_out_op(ins->op) && + ins->outmod == midgard_outmod_keephi) { + max_bitsize *= 2; + assert(max_bitsize <= 64); + } - return max_bitsize; + return max_bitsize; } midgard_reg_mode reg_mode_for_bitsize(unsigned bitsize) { - switch (bitsize) { - /* use 16 pipe for 8 since we don't support vec16 yet */ - case 
8: - case 16: - return midgard_reg_mode_16; - case 32: - return midgard_reg_mode_32; - case 64: - return midgard_reg_mode_64; - default: - unreachable("invalid bit size"); - } + switch (bitsize) { + /* use 16 pipe for 8 since we don't support vec16 yet */ + case 8: + case 16: + return midgard_reg_mode_16; + case 32: + return midgard_reg_mode_32; + case 64: + return midgard_reg_mode_64; + default: + unreachable("invalid bit size"); + } } /* Midgard supports two types of constants, embedded constants (128-bit) and @@ -2710,102 +2727,103 @@ reg_mode_for_bitsize(unsigned bitsize) static void embedded_to_inline_constant(compiler_context *ctx, midgard_block *block) { - mir_foreach_instr_in_block(block, ins) { - if (!ins->has_constants) continue; - if (ins->has_inline_constant) continue; + mir_foreach_instr_in_block(block, ins) { + if (!ins->has_constants) + continue; + if (ins->has_inline_constant) + continue; - unsigned max_bitsize = max_bitsize_for_alu(ins); + unsigned max_bitsize = max_bitsize_for_alu(ins); - /* We can inline 32-bit (sometimes) or 16-bit (usually) */ - bool is_16 = max_bitsize == 16; - bool is_32 = max_bitsize == 32; + /* We can inline 32-bit (sometimes) or 16-bit (usually) */ + bool is_16 = max_bitsize == 16; + bool is_32 = max_bitsize == 32; - if (!(is_16 || is_32)) - continue; + if (!(is_16 || is_32)) + continue; - /* src1 cannot be an inline constant due to encoding - * restrictions. So, if possible we try to flip the arguments - * in that case */ + /* src1 cannot be an inline constant due to encoding + * restrictions. So, if possible we try to flip the arguments + * in that case */ - int op = ins->op; + int op = ins->op; - if (ins->src[0] == SSA_FIXED_REGISTER(REGISTER_CONSTANT) && - alu_opcode_props[op].props & OP_COMMUTES) { - mir_flip(ins); - } + if (ins->src[0] == SSA_FIXED_REGISTER(REGISTER_CONSTANT) && + alu_opcode_props[op].props & OP_COMMUTES) { + mir_flip(ins); + } - if (ins->src[1] == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) { - /* Component is from the swizzle. Take a nonzero component */ - assert(ins->mask); - unsigned first_comp = ffs(ins->mask) - 1; - unsigned component = ins->swizzle[1][first_comp]; + if (ins->src[1] == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) { + /* Component is from the swizzle. Take a nonzero component */ + assert(ins->mask); + unsigned first_comp = ffs(ins->mask) - 1; + unsigned component = ins->swizzle[1][first_comp]; - /* Scale constant appropriately, if we can legally */ - int16_t scaled_constant = 0; + /* Scale constant appropriately, if we can legally */ + int16_t scaled_constant = 0; - if (is_16) { - scaled_constant = ins->constants.u16[component]; - } else if (midgard_is_integer_op(op)) { - scaled_constant = ins->constants.u32[component]; + if (is_16) { + scaled_constant = ins->constants.u16[component]; + } else if (midgard_is_integer_op(op)) { + scaled_constant = ins->constants.u32[component]; - /* Constant overflow after resize */ - if (scaled_constant != ins->constants.u32[component]) - continue; - } else { - float original = ins->constants.f32[component]; - scaled_constant = _mesa_float_to_half(original); + /* Constant overflow after resize */ + if (scaled_constant != ins->constants.u32[component]) + continue; + } else { + float original = ins->constants.f32[component]; + scaled_constant = _mesa_float_to_half(original); - /* Check for loss of precision. If this is - * mediump, we don't care, but for a highp - * shader, we need to pay attention. NIR - * doesn't yet tell us which mode we're in! 
- * Practically this prevents most constants - * from being inlined, sadly. */ + /* Check for loss of precision. If this is + * mediump, we don't care, but for a highp + * shader, we need to pay attention. NIR + * doesn't yet tell us which mode we're in! + * Practically this prevents most constants + * from being inlined, sadly. */ - float fp32 = _mesa_half_to_float(scaled_constant); + float fp32 = _mesa_half_to_float(scaled_constant); - if (fp32 != original) - continue; - } + if (fp32 != original) + continue; + } - /* Should've been const folded */ - if (ins->src_abs[1] || ins->src_neg[1]) - continue; + /* Should've been const folded */ + if (ins->src_abs[1] || ins->src_neg[1]) + continue; - /* Make sure that the constant is not itself a vector - * by checking if all accessed values are the same. */ + /* Make sure that the constant is not itself a vector + * by checking if all accessed values are the same. */ - const midgard_constants *cons = &ins->constants; - uint32_t value = is_16 ? cons->u16[component] : cons->u32[component]; + const midgard_constants *cons = &ins->constants; + uint32_t value = is_16 ? cons->u16[component] : cons->u32[component]; - bool is_vector = false; - unsigned mask = effective_writemask(ins->op, ins->mask); + bool is_vector = false; + unsigned mask = effective_writemask(ins->op, ins->mask); - for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) { - /* We only care if this component is actually used */ - if (!(mask & (1 << c))) - continue; + for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) { + /* We only care if this component is actually used */ + if (!(mask & (1 << c))) + continue; - uint32_t test = is_16 ? - cons->u16[ins->swizzle[1][c]] : - cons->u32[ins->swizzle[1][c]]; + uint32_t test = is_16 ? cons->u16[ins->swizzle[1][c]] + : cons->u32[ins->swizzle[1][c]]; - if (test != value) { - is_vector = true; - break; - } - } + if (test != value) { + is_vector = true; + break; + } + } - if (is_vector) - continue; + if (is_vector) + continue; - /* Get rid of the embedded constant */ - ins->has_constants = false; - ins->src[1] = ~0; - ins->has_inline_constant = true; - ins->inline_constant = scaled_constant; - } - } + /* Get rid of the embedded constant */ + ins->has_constants = false; + ins->src[1] = ~0; + ins->has_inline_constant = true; + ins->inline_constant = scaled_constant; + } + } } /* Dead code elimination for branches at the end of a block - only one branch @@ -2814,16 +2832,17 @@ embedded_to_inline_constant(compiler_context *ctx, midgard_block *block) static void midgard_cull_dead_branch(compiler_context *ctx, midgard_block *block) { - bool branched = false; + bool branched = false; - mir_foreach_instr_in_block_safe(block, ins) { - if (!midgard_is_branch_unit(ins->unit)) continue; + mir_foreach_instr_in_block_safe(block, ins) { + if (!midgard_is_branch_unit(ins->unit)) + continue; - if (branched) - mir_remove_instruction(ins); + if (branched) + mir_remove_instruction(ins); - branched = true; - } + branched = true; + } } /* We want to force the invert on AND/OR to the second slot to legalize into @@ -2836,227 +2855,235 @@ midgard_cull_dead_branch(compiler_context *ctx, midgard_block *block) static void midgard_legalize_invert(compiler_context *ctx, midgard_block *block) { - mir_foreach_instr_in_block(block, ins) { - if (ins->type != TAG_ALU_4) continue; + mir_foreach_instr_in_block(block, ins) { + if (ins->type != TAG_ALU_4) + continue; - if (ins->op != midgard_alu_op_iand && - ins->op != midgard_alu_op_ior) continue; + if (ins->op != midgard_alu_op_iand && 
ins->op != midgard_alu_op_ior) + continue; - if (ins->src_invert[1] || !ins->src_invert[0]) continue; + if (ins->src_invert[1] || !ins->src_invert[0]) + continue; - if (ins->has_inline_constant) { - /* ~(#~a) = ~(~#a) = a, so valid, and forces both - * inverts on */ - ins->inline_constant = ~ins->inline_constant; - ins->src_invert[1] = true; - } else { - /* Flip to the right invert order. Note - * has_inline_constant false by assumption on the - * branch, so flipping makes sense. */ - mir_flip(ins); - } - } + if (ins->has_inline_constant) { + /* ~(#~a) = ~(~#a) = a, so valid, and forces both + * inverts on */ + ins->inline_constant = ~ins->inline_constant; + ins->src_invert[1] = true; + } else { + /* Flip to the right invert order. Note + * has_inline_constant false by assumption on the + * branch, so flipping makes sense. */ + mir_flip(ins); + } + } } static unsigned emit_fragment_epilogue(compiler_context *ctx, unsigned rt, unsigned sample_iter) { - /* Loop to ourselves */ - midgard_instruction *br = ctx->writeout_branch[rt][sample_iter]; - struct midgard_instruction ins = v_branch(false, false); - ins.writeout = br->writeout; - ins.branch.target_block = ctx->block_count - 1; - ins.constants.u32[0] = br->constants.u32[0]; - memcpy(&ins.src_types, &br->src_types, sizeof(ins.src_types)); - emit_mir_instruction(ctx, ins); + /* Loop to ourselves */ + midgard_instruction *br = ctx->writeout_branch[rt][sample_iter]; + struct midgard_instruction ins = v_branch(false, false); + ins.writeout = br->writeout; + ins.branch.target_block = ctx->block_count - 1; + ins.constants.u32[0] = br->constants.u32[0]; + memcpy(&ins.src_types, &br->src_types, sizeof(ins.src_types)); + emit_mir_instruction(ctx, ins); - ctx->current_block->epilogue = true; - schedule_barrier(ctx); - return ins.branch.target_block; + ctx->current_block->epilogue = true; + schedule_barrier(ctx); + return ins.branch.target_block; } static midgard_block * emit_block_init(compiler_context *ctx) { - midgard_block *this_block = ctx->after_block; - ctx->after_block = NULL; + midgard_block *this_block = ctx->after_block; + ctx->after_block = NULL; - if (!this_block) - this_block = create_empty_block(ctx); + if (!this_block) + this_block = create_empty_block(ctx); - list_addtail(&this_block->base.link, &ctx->blocks); + list_addtail(&this_block->base.link, &ctx->blocks); - this_block->scheduled = false; - ++ctx->block_count; + this_block->scheduled = false; + ++ctx->block_count; - /* Set up current block */ - list_inithead(&this_block->base.instructions); - ctx->current_block = this_block; + /* Set up current block */ + list_inithead(&this_block->base.instructions); + ctx->current_block = this_block; - return this_block; + return this_block; } static midgard_block * emit_block(compiler_context *ctx, nir_block *block) { - midgard_block *this_block = emit_block_init(ctx); + midgard_block *this_block = emit_block_init(ctx); - nir_foreach_instr(instr, block) { - emit_instr(ctx, instr); - ++ctx->instruction_count; - } + nir_foreach_instr(instr, block) { + emit_instr(ctx, instr); + ++ctx->instruction_count; + } - return this_block; + return this_block; } -static midgard_block *emit_cf_list(struct compiler_context *ctx, struct exec_list *list); +static midgard_block *emit_cf_list(struct compiler_context *ctx, + struct exec_list *list); static void emit_if(struct compiler_context *ctx, nir_if *nif) { - midgard_block *before_block = ctx->current_block; + midgard_block *before_block = ctx->current_block; - /* Speculatively emit the branch, but we can't 
fill it in until later */ - bool inv = false; - EMIT(branch, true, true); - midgard_instruction *then_branch = mir_last_in_block(ctx->current_block); - then_branch->src[0] = mir_get_branch_cond(&nif->condition, &inv); - then_branch->src_types[0] = nir_type_uint32; - then_branch->branch.invert_conditional = !inv; + /* Speculatively emit the branch, but we can't fill it in until later */ + bool inv = false; + EMIT(branch, true, true); + midgard_instruction *then_branch = mir_last_in_block(ctx->current_block); + then_branch->src[0] = mir_get_branch_cond(&nif->condition, &inv); + then_branch->src_types[0] = nir_type_uint32; + then_branch->branch.invert_conditional = !inv; - /* Emit the two subblocks. */ - midgard_block *then_block = emit_cf_list(ctx, &nif->then_list); - midgard_block *end_then_block = ctx->current_block; + /* Emit the two subblocks. */ + midgard_block *then_block = emit_cf_list(ctx, &nif->then_list); + midgard_block *end_then_block = ctx->current_block; - /* Emit a jump from the end of the then block to the end of the else */ - EMIT(branch, false, false); - midgard_instruction *then_exit = mir_last_in_block(ctx->current_block); + /* Emit a jump from the end of the then block to the end of the else */ + EMIT(branch, false, false); + midgard_instruction *then_exit = mir_last_in_block(ctx->current_block); - /* Emit second block, and check if it's empty */ + /* Emit second block, and check if it's empty */ - int else_idx = ctx->block_count; - int count_in = ctx->instruction_count; - midgard_block *else_block = emit_cf_list(ctx, &nif->else_list); - midgard_block *end_else_block = ctx->current_block; - int after_else_idx = ctx->block_count; + int else_idx = ctx->block_count; + int count_in = ctx->instruction_count; + midgard_block *else_block = emit_cf_list(ctx, &nif->else_list); + midgard_block *end_else_block = ctx->current_block; + int after_else_idx = ctx->block_count; - /* Now that we have the subblocks emitted, fix up the branches */ + /* Now that we have the subblocks emitted, fix up the branches */ - assert(then_block); - assert(else_block); + assert(then_block); + assert(else_block); - if (ctx->instruction_count == count_in) { - /* The else block is empty, so don't emit an exit jump */ - mir_remove_instruction(then_exit); - then_branch->branch.target_block = after_else_idx; - } else { - then_branch->branch.target_block = else_idx; - then_exit->branch.target_block = after_else_idx; - } + if (ctx->instruction_count == count_in) { + /* The else block is empty, so don't emit an exit jump */ + mir_remove_instruction(then_exit); + then_branch->branch.target_block = after_else_idx; + } else { + then_branch->branch.target_block = else_idx; + then_exit->branch.target_block = after_else_idx; + } - /* Wire up the successors */ + /* Wire up the successors */ - ctx->after_block = create_empty_block(ctx); + ctx->after_block = create_empty_block(ctx); - pan_block_add_successor(&before_block->base, &then_block->base); - pan_block_add_successor(&before_block->base, &else_block->base); + pan_block_add_successor(&before_block->base, &then_block->base); + pan_block_add_successor(&before_block->base, &else_block->base); - pan_block_add_successor(&end_then_block->base, &ctx->after_block->base); - pan_block_add_successor(&end_else_block->base, &ctx->after_block->base); + pan_block_add_successor(&end_then_block->base, &ctx->after_block->base); + pan_block_add_successor(&end_else_block->base, &ctx->after_block->base); } static void emit_loop(struct compiler_context *ctx, nir_loop *nloop) { - /* 
Remember where we are */ - midgard_block *start_block = ctx->current_block; + /* Remember where we are */ + midgard_block *start_block = ctx->current_block; - /* Allocate a loop number, growing the current inner loop depth */ - int loop_idx = ++ctx->current_loop_depth; + /* Allocate a loop number, growing the current inner loop depth */ + int loop_idx = ++ctx->current_loop_depth; - /* Get index from before the body so we can loop back later */ - int start_idx = ctx->block_count; + /* Get index from before the body so we can loop back later */ + int start_idx = ctx->block_count; - /* Emit the body itself */ - midgard_block *loop_block = emit_cf_list(ctx, &nloop->body); + /* Emit the body itself */ + midgard_block *loop_block = emit_cf_list(ctx, &nloop->body); - /* Branch back to loop back */ - struct midgard_instruction br_back = v_branch(false, false); - br_back.branch.target_block = start_idx; - emit_mir_instruction(ctx, br_back); + /* Branch back to loop back */ + struct midgard_instruction br_back = v_branch(false, false); + br_back.branch.target_block = start_idx; + emit_mir_instruction(ctx, br_back); - /* Mark down that branch in the graph. */ - pan_block_add_successor(&start_block->base, &loop_block->base); - pan_block_add_successor(&ctx->current_block->base, &loop_block->base); + /* Mark down that branch in the graph. */ + pan_block_add_successor(&start_block->base, &loop_block->base); + pan_block_add_successor(&ctx->current_block->base, &loop_block->base); - /* Find the index of the block about to follow us (note: we don't add - * one; blocks are 0-indexed so we get a fencepost problem) */ - int break_block_idx = ctx->block_count; + /* Find the index of the block about to follow us (note: we don't add + * one; blocks are 0-indexed so we get a fencepost problem) */ + int break_block_idx = ctx->block_count; - /* Fix up the break statements we emitted to point to the right place, - * now that we can allocate a block number for them */ - ctx->after_block = create_empty_block(ctx); + /* Fix up the break statements we emitted to point to the right place, + * now that we can allocate a block number for them */ + ctx->after_block = create_empty_block(ctx); - mir_foreach_block_from(ctx, start_block, _block) { - mir_foreach_instr_in_block(((midgard_block *) _block), ins) { - if (ins->type != TAG_ALU_4) continue; - if (!ins->compact_branch) continue; + mir_foreach_block_from(ctx, start_block, _block) { + mir_foreach_instr_in_block(((midgard_block *)_block), ins) { + if (ins->type != TAG_ALU_4) + continue; + if (!ins->compact_branch) + continue; - /* We found a branch -- check the type to see if we need to do anything */ - if (ins->branch.target_type != TARGET_BREAK) continue; + /* We found a branch -- check the type to see if we need to do anything + */ + if (ins->branch.target_type != TARGET_BREAK) + continue; - /* It's a break! Check if it's our break */ - if (ins->branch.target_break != loop_idx) continue; + /* It's a break! Check if it's our break */ + if (ins->branch.target_break != loop_idx) + continue; - /* Okay, cool, we're breaking out of this loop. - * Rewrite from a break to a goto */ + /* Okay, cool, we're breaking out of this loop. 
+ * Rewrite from a break to a goto */ - ins->branch.target_type = TARGET_GOTO; - ins->branch.target_block = break_block_idx; + ins->branch.target_type = TARGET_GOTO; + ins->branch.target_block = break_block_idx; - pan_block_add_successor(_block, &ctx->after_block->base); - } - } + pan_block_add_successor(_block, &ctx->after_block->base); + } + } - /* Now that we've finished emitting the loop, free up the depth again - * so we play nice with recursion amid nested loops */ - --ctx->current_loop_depth; + /* Now that we've finished emitting the loop, free up the depth again + * so we play nice with recursion amid nested loops */ + --ctx->current_loop_depth; - /* Dump loop stats */ - ++ctx->loop_count; + /* Dump loop stats */ + ++ctx->loop_count; } static midgard_block * emit_cf_list(struct compiler_context *ctx, struct exec_list *list) { - midgard_block *start_block = NULL; + midgard_block *start_block = NULL; - foreach_list_typed(nir_cf_node, node, node, list) { - switch (node->type) { - case nir_cf_node_block: { - midgard_block *block = emit_block(ctx, nir_cf_node_as_block(node)); + foreach_list_typed(nir_cf_node, node, node, list) { + switch (node->type) { + case nir_cf_node_block: { + midgard_block *block = emit_block(ctx, nir_cf_node_as_block(node)); - if (!start_block) - start_block = block; + if (!start_block) + start_block = block; - break; - } + break; + } - case nir_cf_node_if: - emit_if(ctx, nir_cf_node_as_if(node)); - break; + case nir_cf_node_if: + emit_if(ctx, nir_cf_node_as_if(node)); + break; - case nir_cf_node_loop: - emit_loop(ctx, nir_cf_node_as_loop(node)); - break; + case nir_cf_node_loop: + emit_loop(ctx, nir_cf_node_as_loop(node)); + break; - case nir_cf_node_function: - assert(0); - break; - } - } + case nir_cf_node_function: + assert(0); + break; + } + } - return start_block; + return start_block; } /* Due to lookahead, we need to report the first tag executed in the command @@ -3066,22 +3093,22 @@ emit_cf_list(struct compiler_context *ctx, struct exec_list *list) unsigned midgard_get_first_tag_from_block(compiler_context *ctx, unsigned block_idx) { - midgard_block *initial_block = mir_get_block(ctx, block_idx); + midgard_block *initial_block = mir_get_block(ctx, block_idx); - mir_foreach_block_from(ctx, initial_block, _v) { - midgard_block *v = (midgard_block *) _v; - if (v->quadword_count) { - midgard_bundle *initial_bundle = - util_dynarray_element(&v->bundles, midgard_bundle, 0); + mir_foreach_block_from(ctx, initial_block, _v) { + midgard_block *v = (midgard_block *)_v; + if (v->quadword_count) { + midgard_bundle *initial_bundle = + util_dynarray_element(&v->bundles, midgard_bundle, 0); - return initial_bundle->tag; - } - } + return initial_bundle->tag; + } + } - /* Default to a tag 1 which will break from the shader, in case we jump - * to the exit block (i.e. `return` in a compute shader) */ + /* Default to a tag 1 which will break from the shader, in case we jump + * to the exit block (i.e. 
`return` in a compute shader) */ - return 1; + return 1; } /* For each fragment writeout instruction, generate a writeout loop to @@ -3090,41 +3117,42 @@ midgard_get_first_tag_from_block(compiler_context *ctx, unsigned block_idx) static void mir_add_writeout_loops(compiler_context *ctx) { - for (unsigned rt = 0; rt < ARRAY_SIZE(ctx->writeout_branch); ++rt) { - for (unsigned s = 0; s < MIDGARD_MAX_SAMPLE_ITER; ++s) { - midgard_instruction *br = ctx->writeout_branch[rt][s]; - if (!br) continue; + for (unsigned rt = 0; rt < ARRAY_SIZE(ctx->writeout_branch); ++rt) { + for (unsigned s = 0; s < MIDGARD_MAX_SAMPLE_ITER; ++s) { + midgard_instruction *br = ctx->writeout_branch[rt][s]; + if (!br) + continue; - unsigned popped = br->branch.target_block; - pan_block_add_successor(&(mir_get_block(ctx, popped - 1)->base), - &ctx->current_block->base); - br->branch.target_block = emit_fragment_epilogue(ctx, rt, s); - br->branch.target_type = TARGET_GOTO; + unsigned popped = br->branch.target_block; + pan_block_add_successor(&(mir_get_block(ctx, popped - 1)->base), + &ctx->current_block->base); + br->branch.target_block = emit_fragment_epilogue(ctx, rt, s); + br->branch.target_type = TARGET_GOTO; - /* If we have more RTs, we'll need to restore back after our - * loop terminates */ - midgard_instruction *next_br = NULL; + /* If we have more RTs, we'll need to restore back after our + * loop terminates */ + midgard_instruction *next_br = NULL; - if ((s + 1) < MIDGARD_MAX_SAMPLE_ITER) - next_br = ctx->writeout_branch[rt][s + 1]; + if ((s + 1) < MIDGARD_MAX_SAMPLE_ITER) + next_br = ctx->writeout_branch[rt][s + 1]; - if (!next_br && (rt + 1) < ARRAY_SIZE(ctx->writeout_branch)) - next_br = ctx->writeout_branch[rt + 1][0]; + if (!next_br && (rt + 1) < ARRAY_SIZE(ctx->writeout_branch)) + next_br = ctx->writeout_branch[rt + 1][0]; - if (next_br) { - midgard_instruction uncond = v_branch(false, false); - uncond.branch.target_block = popped; - uncond.branch.target_type = TARGET_GOTO; - emit_mir_instruction(ctx, uncond); - pan_block_add_successor(&ctx->current_block->base, - &(mir_get_block(ctx, popped)->base)); - schedule_barrier(ctx); - } else { - /* We're last, so we can terminate here */ - br->last_writeout = true; - } - } - } + if (next_br) { + midgard_instruction uncond = v_branch(false, false); + uncond.branch.target_block = popped; + uncond.branch.target_type = TARGET_GOTO; + emit_mir_instruction(ctx, uncond); + pan_block_add_successor(&ctx->current_block->base, + &(mir_get_block(ctx, popped)->base)); + schedule_barrier(ctx); + } else { + /* We're last, so we can terminate here */ + br->last_writeout = true; + } + } + } } void @@ -3133,281 +3161,279 @@ midgard_compile_shader_nir(nir_shader *nir, struct util_dynarray *binary, struct pan_shader_info *info) { - midgard_debug = debug_get_option_midgard_debug(); + midgard_debug = debug_get_option_midgard_debug(); - /* TODO: Bound against what? */ - compiler_context *ctx = rzalloc(NULL, compiler_context); - ctx->sysval_to_id = panfrost_init_sysvals(&info->sysvals, - inputs->fixed_sysval_layout, - ctx); + /* TODO: Bound against what? 
*/ + compiler_context *ctx = rzalloc(NULL, compiler_context); + ctx->sysval_to_id = + panfrost_init_sysvals(&info->sysvals, inputs->fixed_sysval_layout, ctx); - ctx->inputs = inputs; - ctx->nir = nir; - ctx->info = info; - ctx->stage = nir->info.stage; + ctx->inputs = inputs; + ctx->nir = nir; + ctx->info = info; + ctx->stage = nir->info.stage; - if (inputs->is_blend) { - unsigned nr_samples = MAX2(inputs->blend.nr_samples, 1); - const struct util_format_description *desc = - util_format_description(inputs->rt_formats[inputs->blend.rt]); + if (inputs->is_blend) { + unsigned nr_samples = MAX2(inputs->blend.nr_samples, 1); + const struct util_format_description *desc = + util_format_description(inputs->rt_formats[inputs->blend.rt]); - /* We have to split writeout in 128 bit chunks */ - ctx->blend_sample_iterations = - DIV_ROUND_UP(desc->block.bits * nr_samples, 128); - } - ctx->blend_input = ~0; - ctx->blend_src1 = ~0; - ctx->quirks = midgard_get_quirks(inputs->gpu_id); + /* We have to split writeout in 128 bit chunks */ + ctx->blend_sample_iterations = + DIV_ROUND_UP(desc->block.bits * nr_samples, 128); + } + ctx->blend_input = ~0; + ctx->blend_src1 = ~0; + ctx->quirks = midgard_get_quirks(inputs->gpu_id); - /* Initialize at a global (not block) level hash tables */ + /* Initialize at a global (not block) level hash tables */ - ctx->ssa_constants = _mesa_hash_table_u64_create(ctx); + ctx->ssa_constants = _mesa_hash_table_u64_create(ctx); - /* Lower gl_Position pre-optimisation, but after lowering vars to ssa - * (so we don't accidentally duplicate the epilogue since mesa/st has - * messed with our I/O quite a bit already) */ + /* Lower gl_Position pre-optimisation, but after lowering vars to ssa + * (so we don't accidentally duplicate the epilogue since mesa/st has + * messed with our I/O quite a bit already) */ - NIR_PASS_V(nir, nir_lower_vars_to_ssa); + NIR_PASS_V(nir, nir_lower_vars_to_ssa); - if (ctx->stage == MESA_SHADER_VERTEX) { - NIR_PASS_V(nir, nir_lower_viewport_transform); - NIR_PASS_V(nir, nir_lower_point_size, 1.0, 0.0); - } + if (ctx->stage == MESA_SHADER_VERTEX) { + NIR_PASS_V(nir, nir_lower_viewport_transform); + NIR_PASS_V(nir, nir_lower_point_size, 1.0, 0.0); + } - NIR_PASS_V(nir, nir_lower_var_copies); - NIR_PASS_V(nir, nir_lower_vars_to_ssa); - NIR_PASS_V(nir, nir_split_var_copies); - NIR_PASS_V(nir, nir_lower_var_copies); - NIR_PASS_V(nir, nir_lower_global_vars_to_local); - NIR_PASS_V(nir, nir_lower_var_copies); - NIR_PASS_V(nir, nir_lower_vars_to_ssa); + NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS_V(nir, nir_lower_vars_to_ssa); + NIR_PASS_V(nir, nir_split_var_copies); + NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS_V(nir, nir_lower_global_vars_to_local); + NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS_V(nir, nir_lower_vars_to_ssa); - NIR_PASS_V(nir, pan_lower_framebuffer, - inputs->rt_formats, inputs->raw_fmt_mask, - inputs->is_blend, ctx->quirks & MIDGARD_BROKEN_BLEND_LOADS); + NIR_PASS_V(nir, pan_lower_framebuffer, inputs->rt_formats, + inputs->raw_fmt_mask, inputs->is_blend, + ctx->quirks & MIDGARD_BROKEN_BLEND_LOADS); - NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, - glsl_type_size, 0); + NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + glsl_type_size, 0); - if (ctx->stage == MESA_SHADER_VERTEX) { - /* nir_lower[_explicit]_io is lazy and emits mul+add chains even - * for offsets it could figure out are constant. Do some - * constant folding before pan_nir_lower_store_component below. 
- */ - NIR_PASS_V(nir, nir_opt_constant_folding); - NIR_PASS_V(nir, pan_nir_lower_store_component); - } + if (ctx->stage == MESA_SHADER_VERTEX) { + /* nir_lower[_explicit]_io is lazy and emits mul+add chains even + * for offsets it could figure out are constant. Do some + * constant folding before pan_nir_lower_store_component below. + */ + NIR_PASS_V(nir, nir_opt_constant_folding); + NIR_PASS_V(nir, pan_nir_lower_store_component); + } - NIR_PASS_V(nir, nir_lower_ssbo); - NIR_PASS_V(nir, pan_nir_lower_zs_store); + NIR_PASS_V(nir, nir_lower_ssbo); + NIR_PASS_V(nir, pan_nir_lower_zs_store); - NIR_PASS_V(nir, pan_nir_lower_64bit_intrin); + NIR_PASS_V(nir, pan_nir_lower_64bit_intrin); - NIR_PASS_V(nir, midgard_nir_lower_global_load); + NIR_PASS_V(nir, midgard_nir_lower_global_load); - /* Collect varyings after lowering I/O */ - pan_nir_collect_varyings(nir, info); + /* Collect varyings after lowering I/O */ + pan_nir_collect_varyings(nir, info); - /* Optimisation passes */ + /* Optimisation passes */ - optimise_nir(nir, ctx->quirks, inputs->is_blend, inputs->is_blit); + optimise_nir(nir, ctx->quirks, inputs->is_blend, inputs->is_blit); - bool skip_internal = nir->info.internal; - skip_internal &= !(midgard_debug & MIDGARD_DBG_INTERNAL); + bool skip_internal = nir->info.internal; + skip_internal &= !(midgard_debug & MIDGARD_DBG_INTERNAL); + + if (midgard_debug & MIDGARD_DBG_SHADERS && !skip_internal) + nir_print_shader(nir, stdout); + + info->tls_size = nir->scratch_size; - if (midgard_debug & MIDGARD_DBG_SHADERS && !skip_internal) - nir_print_shader(nir, stdout); + nir_foreach_function(func, nir) { + if (!func->impl) + continue; - info->tls_size = nir->scratch_size; + list_inithead(&ctx->blocks); + ctx->block_count = 0; + ctx->func = func; + ctx->already_emitted = + calloc(BITSET_WORDS(func->impl->ssa_alloc), sizeof(BITSET_WORD)); + + if (nir->info.outputs_read && !inputs->is_blend) { + emit_block_init(ctx); - nir_foreach_function(func, nir) { - if (!func->impl) - continue; + struct midgard_instruction wait = v_branch(false, false); + wait.branch.target_type = TARGET_TILEBUF_WAIT; - list_inithead(&ctx->blocks); - ctx->block_count = 0; - ctx->func = func; - ctx->already_emitted = calloc(BITSET_WORDS(func->impl->ssa_alloc), sizeof(BITSET_WORD)); + emit_mir_instruction(ctx, wait); + + ++ctx->instruction_count; + } - if (nir->info.outputs_read && !inputs->is_blend) { - emit_block_init(ctx); + emit_cf_list(ctx, &func->impl->body); + free(ctx->already_emitted); + break; /* TODO: Multi-function shaders */ + } - struct midgard_instruction wait = v_branch(false, false); - wait.branch.target_type = TARGET_TILEBUF_WAIT; + /* Per-block lowering before opts */ - emit_mir_instruction(ctx, wait); + mir_foreach_block(ctx, _block) { + midgard_block *block = (midgard_block *)_block; + inline_alu_constants(ctx, block); + embedded_to_inline_constant(ctx, block); + } + /* MIR-level optimizations */ - ++ctx->instruction_count; - } + bool progress = false; - emit_cf_list(ctx, &func->impl->body); - free(ctx->already_emitted); - break; /* TODO: Multi-function shaders */ - } + do { + progress = false; + progress |= midgard_opt_dead_code_eliminate(ctx); - /* Per-block lowering before opts */ + mir_foreach_block(ctx, _block) { + midgard_block *block = (midgard_block *)_block; + progress |= midgard_opt_copy_prop(ctx, block); + progress |= midgard_opt_combine_projection(ctx, block); + progress |= midgard_opt_varying_projection(ctx, block); + } + } while (progress); - mir_foreach_block(ctx, _block) { - midgard_block *block 
= (midgard_block *) _block; - inline_alu_constants(ctx, block); - embedded_to_inline_constant(ctx, block); - } - /* MIR-level optimizations */ + mir_foreach_block(ctx, _block) { + midgard_block *block = (midgard_block *)_block; + midgard_lower_derivatives(ctx, block); + midgard_legalize_invert(ctx, block); + midgard_cull_dead_branch(ctx, block); + } - bool progress = false; + if (ctx->stage == MESA_SHADER_FRAGMENT) + mir_add_writeout_loops(ctx); - do { - progress = false; - progress |= midgard_opt_dead_code_eliminate(ctx); + /* Analyze now that the code is known but before scheduling creates + * pipeline registers which are harder to track */ + mir_analyze_helper_requirements(ctx); - mir_foreach_block(ctx, _block) { - midgard_block *block = (midgard_block *) _block; - progress |= midgard_opt_copy_prop(ctx, block); - progress |= midgard_opt_combine_projection(ctx, block); - progress |= midgard_opt_varying_projection(ctx, block); - } - } while (progress); + if (midgard_debug & MIDGARD_DBG_SHADERS && !skip_internal) + mir_print_shader(ctx); - mir_foreach_block(ctx, _block) { - midgard_block *block = (midgard_block *) _block; - midgard_lower_derivatives(ctx, block); - midgard_legalize_invert(ctx, block); - midgard_cull_dead_branch(ctx, block); - } + /* Schedule! */ + midgard_schedule_program(ctx); + mir_ra(ctx); - if (ctx->stage == MESA_SHADER_FRAGMENT) - mir_add_writeout_loops(ctx); + if (midgard_debug & MIDGARD_DBG_SHADERS && !skip_internal) + mir_print_shader(ctx); - /* Analyze now that the code is known but before scheduling creates - * pipeline registers which are harder to track */ - mir_analyze_helper_requirements(ctx); + /* Analyze after scheduling since this is order-dependent */ + mir_analyze_helper_terminate(ctx); - if (midgard_debug & MIDGARD_DBG_SHADERS && !skip_internal) - mir_print_shader(ctx); + /* Emit flat binary from the instruction arrays. Iterate each block in + * sequence. Save instruction boundaries such that lookahead tags can + * be assigned easily */ - /* Schedule! */ - midgard_schedule_program(ctx); - mir_ra(ctx); + /* Cache _all_ bundles in source order for lookahead across failed branches */ - if (midgard_debug & MIDGARD_DBG_SHADERS && !skip_internal) - mir_print_shader(ctx); + int bundle_count = 0; + mir_foreach_block(ctx, _block) { + midgard_block *block = (midgard_block *)_block; + bundle_count += block->bundles.size / sizeof(midgard_bundle); + } + midgard_bundle **source_order_bundles = + malloc(sizeof(midgard_bundle *) * bundle_count); + int bundle_idx = 0; + mir_foreach_block(ctx, _block) { + midgard_block *block = (midgard_block *)_block; + util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) { + source_order_bundles[bundle_idx++] = bundle; + } + } - /* Analyze after scheduling since this is order-dependent */ - mir_analyze_helper_terminate(ctx); + int current_bundle = 0; - /* Emit flat binary from the instruction arrays. Iterate each block in - * sequence. Save instruction boundaries such that lookahead tags can - * be assigned easily */ + /* Midgard prefetches instruction types, so during emission we + * need to lookahead. Unless this is the last instruction, in + * which we return 1. 
*/ - /* Cache _all_ bundles in source order for lookahead across failed branches */ + mir_foreach_block(ctx, _block) { + midgard_block *block = (midgard_block *)_block; + mir_foreach_bundle_in_block(block, bundle) { + int lookahead = 1; - int bundle_count = 0; - mir_foreach_block(ctx, _block) { - midgard_block *block = (midgard_block *) _block; - bundle_count += block->bundles.size / sizeof(midgard_bundle); - } - midgard_bundle **source_order_bundles = malloc(sizeof(midgard_bundle *) * bundle_count); - int bundle_idx = 0; - mir_foreach_block(ctx, _block) { - midgard_block *block = (midgard_block *) _block; - util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) { - source_order_bundles[bundle_idx++] = bundle; - } - } + if (!bundle->last_writeout && (current_bundle + 1 < bundle_count)) + lookahead = source_order_bundles[current_bundle + 1]->tag; - int current_bundle = 0; + emit_binary_bundle(ctx, block, bundle, binary, lookahead); + ++current_bundle; + } - /* Midgard prefetches instruction types, so during emission we - * need to lookahead. Unless this is the last instruction, in - * which we return 1. */ + /* TODO: Free deeper */ + // util_dynarray_fini(&block->instructions); + } - mir_foreach_block(ctx, _block) { - midgard_block *block = (midgard_block *) _block; - mir_foreach_bundle_in_block(block, bundle) { - int lookahead = 1; + free(source_order_bundles); - if (!bundle->last_writeout && (current_bundle + 1 < bundle_count)) - lookahead = source_order_bundles[current_bundle + 1]->tag; + /* Report the very first tag executed */ + info->midgard.first_tag = midgard_get_first_tag_from_block(ctx, 0); - emit_binary_bundle(ctx, block, bundle, binary, lookahead); - ++current_bundle; - } + info->ubo_mask = ctx->ubo_mask & ((1 << ctx->nir->info.num_ubos) - 1); - /* TODO: Free deeper */ - //util_dynarray_fini(&block->instructions); - } + if (midgard_debug & MIDGARD_DBG_SHADERS && !skip_internal) { + disassemble_midgard(stdout, binary->data, binary->size, inputs->gpu_id, + midgard_debug & MIDGARD_DBG_VERBOSE); + fflush(stdout); + } - free(source_order_bundles); - - /* Report the very first tag executed */ - info->midgard.first_tag = midgard_get_first_tag_from_block(ctx, 0); - - info->ubo_mask = ctx->ubo_mask & ((1 << ctx->nir->info.num_ubos) - 1); + /* A shader ending on a 16MB boundary causes INSTR_INVALID_PC faults, + * workaround by adding some padding to the end of the shader. (The + * kernel makes sure shader BOs can't cross 16MB boundaries.) */ + if (binary->size) + memset(util_dynarray_grow(binary, uint8_t, 16), 0, 16); + + if ((midgard_debug & MIDGARD_DBG_SHADERDB || inputs->debug) && + !nir->info.internal) { + unsigned nr_bundles = 0, nr_ins = 0; + + /* Count instructions and bundles */ - if (midgard_debug & MIDGARD_DBG_SHADERS && !skip_internal) { - disassemble_midgard(stdout, binary->data, - binary->size, inputs->gpu_id, - midgard_debug & MIDGARD_DBG_VERBOSE); - fflush(stdout); - } - - /* A shader ending on a 16MB boundary causes INSTR_INVALID_PC faults, - * workaround by adding some padding to the end of the shader. (The - * kernel makes sure shader BOs can't cross 16MB boundaries.) 
*/ - if (binary->size) - memset(util_dynarray_grow(binary, uint8_t, 16), 0, 16); - - if ((midgard_debug & MIDGARD_DBG_SHADERDB || inputs->debug) && - !nir->info.internal) { - unsigned nr_bundles = 0, nr_ins = 0; - - /* Count instructions and bundles */ - - mir_foreach_block(ctx, _block) { - midgard_block *block = (midgard_block *) _block; - nr_bundles += util_dynarray_num_elements( - &block->bundles, midgard_bundle); + mir_foreach_block(ctx, _block) { + midgard_block *block = (midgard_block *)_block; + nr_bundles += + util_dynarray_num_elements(&block->bundles, midgard_bundle); - mir_foreach_bundle_in_block(block, bun) - nr_ins += bun->instruction_count; - } + mir_foreach_bundle_in_block(block, bun) + nr_ins += bun->instruction_count; + } - /* Calculate thread count. There are certain cutoffs by - * register count for thread count */ + /* Calculate thread count. There are certain cutoffs by + * register count for thread count */ - unsigned nr_registers = info->work_reg_count; + unsigned nr_registers = info->work_reg_count; - unsigned nr_threads = - (nr_registers <= 4) ? 4 : - (nr_registers <= 8) ? 2 : - 1; + unsigned nr_threads = (nr_registers <= 4) ? 4 + : (nr_registers <= 8) ? 2 + : 1; - char *shaderdb = NULL; + char *shaderdb = NULL; - /* Dump stats */ + /* Dump stats */ - asprintf(&shaderdb, "%s shader: " - "%u inst, %u bundles, %u quadwords, " - "%u registers, %u threads, %u loops, " - "%u:%u spills:fills", - ctx->inputs->is_blend ? "PAN_SHADER_BLEND" : - gl_shader_stage_name(ctx->stage), - nr_ins, nr_bundles, ctx->quadword_count, - nr_registers, nr_threads, - ctx->loop_count, - ctx->spills, ctx->fills); + asprintf(&shaderdb, + "%s shader: " + "%u inst, %u bundles, %u quadwords, " + "%u registers, %u threads, %u loops, " + "%u:%u spills:fills", + ctx->inputs->is_blend ? "PAN_SHADER_BLEND" + : gl_shader_stage_name(ctx->stage), + nr_ins, nr_bundles, ctx->quadword_count, nr_registers, + nr_threads, ctx->loop_count, ctx->spills, ctx->fills); - if (midgard_debug & MIDGARD_DBG_SHADERDB) - fprintf(stderr, "SHADER-DB: %s\n", shaderdb); + if (midgard_debug & MIDGARD_DBG_SHADERDB) + fprintf(stderr, "SHADER-DB: %s\n", shaderdb); - if (inputs->debug) - util_debug_message(inputs->debug, SHADER_INFO, "%s", shaderdb); - - free(shaderdb); - } + if (inputs->debug) + util_debug_message(inputs->debug, SHADER_INFO, "%s", shaderdb); - _mesa_hash_table_u64_destroy(ctx->ssa_constants); - _mesa_hash_table_u64_destroy(ctx->sysval_to_id); + free(shaderdb); + } - ralloc_free(ctx); + _mesa_hash_table_u64_destroy(ctx->ssa_constants); + _mesa_hash_table_u64_destroy(ctx->sysval_to_id); + + ralloc_free(ctx); } diff --git a/src/panfrost/midgard/midgard_compile.h b/src/panfrost/midgard/midgard_compile.h index bced01dde9c..1b88eb11339 100644 --- a/src/panfrost/midgard/midgard_compile.h +++ b/src/panfrost/midgard/midgard_compile.h @@ -26,81 +26,81 @@ #define __MIDGARD_H_ #include "compiler/nir/nir.h" -#include "util/u_dynarray.h" #include "panfrost/util/pan_ir.h" +#include "util/u_dynarray.h" -void -midgard_compile_shader_nir(nir_shader *nir, - const struct panfrost_compile_inputs *inputs, - struct util_dynarray *binary, - struct pan_shader_info *info); +void midgard_compile_shader_nir(nir_shader *nir, + const struct panfrost_compile_inputs *inputs, + struct util_dynarray *binary, + struct pan_shader_info *info); /* NIR options are shared between the standalone compiler and the online * compiler. Defining it here is the simplest, though maybe not the Right * solution. 
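/* Worked example of the register-count cutoffs used for the shader-db thread
 * count above: 4 or fewer work registers run 4 threads, 5 to 8 run 2 threads,
 * and anything above 8 runs a single thread. Hypothetical helper restating
 * the ternary chain, for illustration only. */
static unsigned
midgard_threads_from_work_regs(unsigned nr_registers)
{
   return (nr_registers <= 4) ? 4 : (nr_registers <= 8) ? 2 : 1;
}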
*/ static const nir_shader_compiler_options midgard_nir_options = { - .lower_ffma16 = true, - .lower_ffma32 = true, - .lower_ffma64 = true, - .lower_scmp = true, - .lower_flrp16 = true, - .lower_flrp32 = true, - .lower_flrp64 = true, - .lower_ffract = true, - .lower_fmod = true, - .lower_fdiv = true, - .lower_isign = true, - .lower_fpow = true, - .lower_find_lsb = true, - .lower_ifind_msb = true, - .lower_fdph = true, - .lower_uadd_carry = true, - .lower_usub_borrow = true, + .lower_ffma16 = true, + .lower_ffma32 = true, + .lower_ffma64 = true, + .lower_scmp = true, + .lower_flrp16 = true, + .lower_flrp32 = true, + .lower_flrp64 = true, + .lower_ffract = true, + .lower_fmod = true, + .lower_fdiv = true, + .lower_isign = true, + .lower_fpow = true, + .lower_find_lsb = true, + .lower_ifind_msb = true, + .lower_fdph = true, + .lower_uadd_carry = true, + .lower_usub_borrow = true, - /* TODO: We have native ops to help here, which we'll want to look into - * eventually */ - .lower_fsign = true, + /* TODO: We have native ops to help here, which we'll want to look into + * eventually */ + .lower_fsign = true, - .lower_bit_count = true, - .lower_bitfield_reverse = true, - .lower_bitfield_insert_to_shifts = true, - .lower_bitfield_extract_to_shifts = true, - .lower_extract_byte = true, - .lower_extract_word = true, - .lower_insert_byte = true, - .lower_insert_word = true, - .lower_rotate = true, + .lower_bit_count = true, + .lower_bitfield_reverse = true, + .lower_bitfield_insert_to_shifts = true, + .lower_bitfield_extract_to_shifts = true, + .lower_extract_byte = true, + .lower_extract_word = true, + .lower_insert_byte = true, + .lower_insert_word = true, + .lower_rotate = true, - .lower_pack_half_2x16 = true, - .lower_pack_unorm_2x16 = true, - .lower_pack_snorm_2x16 = true, - .lower_pack_unorm_4x8 = true, - .lower_pack_snorm_4x8 = true, - .lower_unpack_half_2x16 = true, - .lower_unpack_unorm_2x16 = true, - .lower_unpack_snorm_2x16 = true, - .lower_unpack_unorm_4x8 = true, - .lower_unpack_snorm_4x8 = true, - .lower_pack_split = true, - .lower_pack_64_2x32_split = true, - .lower_unpack_64_2x32_split = true, - .lower_int64_options = nir_lower_imul_2x32_64, + .lower_pack_half_2x16 = true, + .lower_pack_unorm_2x16 = true, + .lower_pack_snorm_2x16 = true, + .lower_pack_unorm_4x8 = true, + .lower_pack_snorm_4x8 = true, + .lower_unpack_half_2x16 = true, + .lower_unpack_unorm_2x16 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_unorm_4x8 = true, + .lower_unpack_snorm_4x8 = true, + .lower_pack_split = true, + .lower_pack_64_2x32_split = true, + .lower_unpack_64_2x32_split = true, + .lower_int64_options = nir_lower_imul_2x32_64, - .lower_doubles_options = nir_lower_dmod, + .lower_doubles_options = nir_lower_dmod, - .lower_uniforms_to_ubo = true, - .has_fsub = true, - .has_isub = true, - .vectorize_io = true, - .use_interpolated_input_intrinsics = true, + .lower_uniforms_to_ubo = true, + .has_fsub = true, + .has_isub = true, + .vectorize_io = true, + .use_interpolated_input_intrinsics = true, - .vertex_id_zero_based = true, - .has_cs_global_id = true, - .lower_cs_local_index_to_id = true, - .max_unroll_iterations = 32, - .force_indirect_unrolling = (nir_var_shader_in | nir_var_shader_out | nir_var_function_temp), - .force_indirect_unrolling_sampler = true, + .vertex_id_zero_based = true, + .has_cs_global_id = true, + .lower_cs_local_index_to_id = true, + .max_unroll_iterations = 32, + .force_indirect_unrolling = + (nir_var_shader_in | nir_var_shader_out | nir_var_function_temp), + 
.force_indirect_unrolling_sampler = true, }; #endif diff --git a/src/panfrost/midgard/midgard_derivatives.c b/src/panfrost/midgard/midgard_derivatives.c index 5ad2e378a83..d243a00bdd2 100644 --- a/src/panfrost/midgard/midgard_derivatives.c +++ b/src/panfrost/midgard/midgard_derivatives.c @@ -53,20 +53,20 @@ static unsigned mir_derivative_mode(nir_op op) { - switch (op) { - case nir_op_fddx: - case nir_op_fddx_fine: - case nir_op_fddx_coarse: - return TEXTURE_DFDX; + switch (op) { + case nir_op_fddx: + case nir_op_fddx_fine: + case nir_op_fddx_coarse: + return TEXTURE_DFDX; - case nir_op_fddy: - case nir_op_fddy_fine: - case nir_op_fddy_coarse: - return TEXTURE_DFDY; + case nir_op_fddy: + case nir_op_fddy_fine: + case nir_op_fddy_coarse: + return TEXTURE_DFDY; - default: - unreachable("Invalid derivative op"); - } + default: + unreachable("Invalid derivative op"); + } } /* Returns true if a texturing op computes derivatives either explicitly or @@ -75,91 +75,105 @@ mir_derivative_mode(nir_op op) bool mir_op_computes_derivatives(gl_shader_stage stage, unsigned op) { - /* Only fragment shaders may compute derivatives, but the sense of - * "normal" changes in vertex shaders on certain GPUs */ + /* Only fragment shaders may compute derivatives, but the sense of + * "normal" changes in vertex shaders on certain GPUs */ - if (op == midgard_tex_op_normal && stage != MESA_SHADER_FRAGMENT) - return false; + if (op == midgard_tex_op_normal && stage != MESA_SHADER_FRAGMENT) + return false; - switch (op) { - case midgard_tex_op_normal: - case midgard_tex_op_derivative: - assert(stage == MESA_SHADER_FRAGMENT); - return true; - default: - return false; - } + switch (op) { + case midgard_tex_op_normal: + case midgard_tex_op_derivative: + assert(stage == MESA_SHADER_FRAGMENT); + return true; + default: + return false; + } } void midgard_emit_derivatives(compiler_context *ctx, nir_alu_instr *instr) { - /* Create texture instructions */ + /* Create texture instructions */ - unsigned nr_components = nir_dest_num_components(instr->dest.dest); + unsigned nr_components = nir_dest_num_components(instr->dest.dest); - midgard_instruction ins = { - .type = TAG_TEXTURE_4, - .mask = mask_of(nr_components), - .dest = nir_dest_index(&instr->dest.dest), - .dest_type = nir_type_float32, - .src = { ~0, nir_src_index(ctx, &instr->src[0].src), ~0, ~0, }, - .swizzle = SWIZZLE_IDENTITY_4, - .src_types = { nir_type_float32, nir_type_float32, }, - .op = midgard_tex_op_derivative, - .texture = { - .mode = mir_derivative_mode(instr->op), - .format = 2, - .in_reg_full = 1, - .out_full = 1, - .sampler_type = MALI_SAMPLER_FLOAT, - }, - }; + midgard_instruction ins = { + .type = TAG_TEXTURE_4, + .mask = mask_of(nr_components), + .dest = nir_dest_index(&instr->dest.dest), + .dest_type = nir_type_float32, + .src = + { + ~0, + nir_src_index(ctx, &instr->src[0].src), + ~0, + ~0, + }, + .swizzle = SWIZZLE_IDENTITY_4, + .src_types = + { + nir_type_float32, + nir_type_float32, + }, + .op = midgard_tex_op_derivative, + .texture = + { + .mode = mir_derivative_mode(instr->op), + .format = 2, + .in_reg_full = 1, + .out_full = 1, + .sampler_type = MALI_SAMPLER_FLOAT, + }, + }; - if (!instr->dest.dest.is_ssa) - ins.mask &= instr->dest.write_mask; + if (!instr->dest.dest.is_ssa) + ins.mask &= instr->dest.write_mask; - emit_mir_instruction(ctx, ins); + emit_mir_instruction(ctx, ins); } void midgard_lower_derivatives(compiler_context *ctx, midgard_block *block) { - mir_foreach_instr_in_block_safe(block, ins) { - if (ins->type != TAG_TEXTURE_4) 
continue; - if (ins->op != midgard_tex_op_derivative) continue; + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->type != TAG_TEXTURE_4) + continue; + if (ins->op != midgard_tex_op_derivative) + continue; - /* Check if we need to split */ + /* Check if we need to split */ - bool upper = ins->mask & 0b1100; - bool lower = ins->mask & 0b0011; + bool upper = ins->mask & 0b1100; + bool lower = ins->mask & 0b0011; - if (!(upper && lower)) continue; + if (!(upper && lower)) + continue; - /* Duplicate for dedicated upper instruction */ + /* Duplicate for dedicated upper instruction */ - midgard_instruction dup; - memcpy(&dup, ins, sizeof(dup)); + midgard_instruction dup; + memcpy(&dup, ins, sizeof(dup)); - /* Fixup masks. Make original just lower and dupe just upper */ + /* Fixup masks. Make original just lower and dupe just upper */ - ins->mask &= 0b0011; - dup.mask &= 0b1100; + ins->mask &= 0b0011; + dup.mask &= 0b1100; - /* Fixup swizzles */ - dup.swizzle[0][0] = dup.swizzle[0][1] = dup.swizzle[0][2] = COMPONENT_X; - dup.swizzle[0][3] = COMPONENT_Y; + /* Fixup swizzles */ + dup.swizzle[0][0] = dup.swizzle[0][1] = dup.swizzle[0][2] = COMPONENT_X; + dup.swizzle[0][3] = COMPONENT_Y; - dup.swizzle[1][0] = COMPONENT_Z; - dup.swizzle[1][1] = dup.swizzle[1][2] = dup.swizzle[1][3] = COMPONENT_W; + dup.swizzle[1][0] = COMPONENT_Z; + dup.swizzle[1][1] = dup.swizzle[1][2] = dup.swizzle[1][3] = COMPONENT_W; - /* Insert the new instruction */ - mir_insert_instruction_before(ctx, mir_next_op(ins), dup); + /* Insert the new instruction */ + mir_insert_instruction_before(ctx, mir_next_op(ins), dup); - /* We'll need both instructions to write to the same index, so - * rewrite to use a register */ + /* We'll need both instructions to write to the same index, so + * rewrite to use a register */ - unsigned new = make_compiler_temp_reg(ctx); - mir_rewrite_index(ctx, ins->dest, new); - } + unsigned new = make_compiler_temp_reg(ctx); + mir_rewrite_index(ctx, ins->dest, new); + } } diff --git a/src/panfrost/midgard/midgard_emit.c b/src/panfrost/midgard/midgard_emit.c index 92c6dd11dab..7839760ba5e 100644 --- a/src/panfrost/midgard/midgard_emit.c +++ b/src/panfrost/midgard/midgard_emit.c @@ -29,63 +29,65 @@ static midgard_int_mod mir_get_imod(bool shift, nir_alu_type T, bool half, bool scalar) { - if (!half) { - assert(!shift); - /* Doesn't matter, src mods are only used when expanding */ - return midgard_int_sign_extend; - } + if (!half) { + assert(!shift); + /* Doesn't matter, src mods are only used when expanding */ + return midgard_int_sign_extend; + } - if (shift) - return midgard_int_left_shift; + if (shift) + return midgard_int_left_shift; - if (nir_alu_type_get_base_type(T) == nir_type_int) - return midgard_int_sign_extend; - else - return midgard_int_zero_extend; + if (nir_alu_type_get_base_type(T) == nir_type_int) + return midgard_int_sign_extend; + else + return midgard_int_zero_extend; } void midgard_pack_ubo_index_imm(midgard_load_store_word *word, unsigned index) { - word->arg_comp = index & 0x3; - word->arg_reg = (index >> 2) & 0x7; - word->bitsize_toggle = (index >> 5) & 0x1; - word->index_format = (index >> 6) & 0x3; + word->arg_comp = index & 0x3; + word->arg_reg = (index >> 2) & 0x7; + word->bitsize_toggle = (index >> 5) & 0x1; + word->index_format = (index >> 6) & 0x3; } -void midgard_pack_varying_params(midgard_load_store_word *word, midgard_varying_params p) +void +midgard_pack_varying_params(midgard_load_store_word *word, + midgard_varying_params p) { - /* Currently these parameters are 
not supported. */ - assert(p.direct_sample_pos_x == 0 && p.direct_sample_pos_y == 0); + /* Currently these parameters are not supported. */ + assert(p.direct_sample_pos_x == 0 && p.direct_sample_pos_y == 0); - unsigned u; - memcpy(&u, &p, sizeof(p)); + unsigned u; + memcpy(&u, &p, sizeof(p)); - word->signed_offset |= u & 0x1FF; + word->signed_offset |= u & 0x1FF; } -midgard_varying_params midgard_unpack_varying_params(midgard_load_store_word word) +midgard_varying_params +midgard_unpack_varying_params(midgard_load_store_word word) { - unsigned params = word.signed_offset & 0x1FF; + unsigned params = word.signed_offset & 0x1FF; - midgard_varying_params p; - memcpy(&p, ¶ms, sizeof(p)); + midgard_varying_params p; + memcpy(&p, ¶ms, sizeof(p)); - return p; + return p; } unsigned mir_pack_mod(midgard_instruction *ins, unsigned i, bool scalar) { - bool integer = midgard_is_integer_op(ins->op); - unsigned base_size = max_bitsize_for_alu(ins); - unsigned sz = nir_alu_type_get_type_size(ins->src_types[i]); - bool half = (sz == (base_size >> 1)); + bool integer = midgard_is_integer_op(ins->op); + unsigned base_size = max_bitsize_for_alu(ins); + unsigned sz = nir_alu_type_get_type_size(ins->src_types[i]); + bool half = (sz == (base_size >> 1)); - return integer ? - mir_get_imod(ins->src_shift[i], ins->src_types[i], half, scalar) : - ((ins->src_abs[i] << 0) | - ((ins->src_neg[i] << 1))); + return integer + ? mir_get_imod(ins->src_shift[i], ins->src_types[i], half, scalar) + : ((ins->src_abs[i] << 0) | ((ins->src_neg[i] << 1))); } /* Midgard IR only knows vector ALU types, but we sometimes need to actually @@ -95,75 +97,76 @@ mir_pack_mod(midgard_instruction *ins, unsigned i, bool scalar) static int component_from_mask(unsigned mask) { - for (int c = 0; c < 8; ++c) { - if (mask & (1 << c)) - return c; - } + for (int c = 0; c < 8; ++c) { + if (mask & (1 << c)) + return c; + } - assert(0); - return 0; + assert(0); + return 0; } static unsigned mir_pack_scalar_source(unsigned mod, bool is_full, unsigned component) { - midgard_scalar_alu_src s = { - .mod = mod, - .full = is_full, - .component = component << (is_full ? 1 : 0), - }; + midgard_scalar_alu_src s = { + .mod = mod, + .full = is_full, + .component = component << (is_full ? 
1 : 0), + }; - unsigned o; - memcpy(&o, &s, sizeof(s)); + unsigned o; + memcpy(&o, &s, sizeof(s)); - return o & ((1 << 6) - 1); + return o & ((1 << 6) - 1); } static midgard_scalar_alu vector_to_scalar_alu(midgard_vector_alu v, midgard_instruction *ins) { - bool is_full = nir_alu_type_get_type_size(ins->dest_type) == 32; + bool is_full = nir_alu_type_get_type_size(ins->dest_type) == 32; - bool half_0 = nir_alu_type_get_type_size(ins->src_types[0]) == 16; - bool half_1 = nir_alu_type_get_type_size(ins->src_types[1]) == 16; - unsigned comp = component_from_mask(ins->mask); + bool half_0 = nir_alu_type_get_type_size(ins->src_types[0]) == 16; + bool half_1 = nir_alu_type_get_type_size(ins->src_types[1]) == 16; + unsigned comp = component_from_mask(ins->mask); - unsigned packed_src[2] = { - mir_pack_scalar_source(mir_pack_mod(ins, 0, true), !half_0, ins->swizzle[0][comp]), - mir_pack_scalar_source(mir_pack_mod(ins, 1, true), !half_1, ins->swizzle[1][comp]) - }; + unsigned packed_src[2] = { + mir_pack_scalar_source(mir_pack_mod(ins, 0, true), !half_0, + ins->swizzle[0][comp]), + mir_pack_scalar_source(mir_pack_mod(ins, 1, true), !half_1, + ins->swizzle[1][comp])}; - /* The output component is from the mask */ - midgard_scalar_alu s = { - .op = v.op, - .src1 = packed_src[0], - .src2 = packed_src[1], - .outmod = v.outmod, - .output_full = is_full, - .output_component = comp, - }; + /* The output component is from the mask */ + midgard_scalar_alu s = { + .op = v.op, + .src1 = packed_src[0], + .src2 = packed_src[1], + .outmod = v.outmod, + .output_full = is_full, + .output_component = comp, + }; - /* Full components are physically spaced out */ - if (is_full) { - assert(s.output_component < 4); - s.output_component <<= 1; - } + /* Full components are physically spaced out */ + if (is_full) { + assert(s.output_component < 4); + s.output_component <<= 1; + } - /* Inline constant is passed along rather than trying to extract it - * from v */ + /* Inline constant is passed along rather than trying to extract it + * from v */ - if (ins->has_inline_constant) { - uint16_t imm = 0; - int lower_11 = ins->inline_constant & ((1 << 12) - 1); - imm |= (lower_11 >> 9) & 3; - imm |= (lower_11 >> 6) & 4; - imm |= (lower_11 >> 2) & 0x38; - imm |= (lower_11 & 63) << 6; + if (ins->has_inline_constant) { + uint16_t imm = 0; + int lower_11 = ins->inline_constant & ((1 << 12) - 1); + imm |= (lower_11 >> 9) & 3; + imm |= (lower_11 >> 6) & 4; + imm |= (lower_11 >> 2) & 0x38; + imm |= (lower_11 & 63) << 6; - s.src2 = imm; - } + s.src2 = imm; + } - return s; + return s; } /* 64-bit swizzles are super easy since there are 2 components of 2 components @@ -176,238 +179,230 @@ vector_to_scalar_alu(midgard_vector_alu v, midgard_instruction *ins) * with rep. Pretty nifty, huh? */ static unsigned -mir_pack_swizzle_64(unsigned *swizzle, unsigned max_component, - bool expand_high) +mir_pack_swizzle_64(unsigned *swizzle, unsigned max_component, bool expand_high) { - unsigned packed = 0; - unsigned base = expand_high ? 2 : 0; + unsigned packed = 0; + unsigned base = expand_high ? 2 : 0; - for (unsigned i = base; i < base + 2; ++i) { - assert(swizzle[i] <= max_component); + for (unsigned i = base; i < base + 2; ++i) { + assert(swizzle[i] <= max_component); - unsigned a = (swizzle[i] & 1) ? - (COMPONENT_W << 2) | COMPONENT_Z : - (COMPONENT_Y << 2) | COMPONENT_X; + unsigned a = (swizzle[i] & 1) ? 
(COMPONENT_W << 2) | COMPONENT_Z + : (COMPONENT_Y << 2) | COMPONENT_X; - if (i & 1) - packed |= a << 4; - else - packed |= a; - } + if (i & 1) + packed |= a << 4; + else + packed |= a; + } - return packed; + return packed; } static void mir_pack_mask_alu(midgard_instruction *ins, midgard_vector_alu *alu) { - unsigned effective = ins->mask; + unsigned effective = ins->mask; - /* If we have a destination override, we need to figure out whether to - * override to the lower or upper half, shifting the effective mask in - * the latter, so AAAA.... becomes AAAA */ + /* If we have a destination override, we need to figure out whether to + * override to the lower or upper half, shifting the effective mask in + * the latter, so AAAA.... becomes AAAA */ - unsigned inst_size = max_bitsize_for_alu(ins); - signed upper_shift = mir_upper_override(ins, inst_size); + unsigned inst_size = max_bitsize_for_alu(ins); + signed upper_shift = mir_upper_override(ins, inst_size); - if (upper_shift >= 0) { - effective >>= upper_shift; - alu->shrink_mode = upper_shift ? - midgard_shrink_mode_upper : - midgard_shrink_mode_lower; - } else { - alu->shrink_mode = midgard_shrink_mode_none; - } + if (upper_shift >= 0) { + effective >>= upper_shift; + alu->shrink_mode = + upper_shift ? midgard_shrink_mode_upper : midgard_shrink_mode_lower; + } else { + alu->shrink_mode = midgard_shrink_mode_none; + } - if (inst_size == 32) - alu->mask = expand_writemask(effective, 2); - else if (inst_size == 64) - alu->mask = expand_writemask(effective, 1); - else - alu->mask = effective; + if (inst_size == 32) + alu->mask = expand_writemask(effective, 2); + else if (inst_size == 64) + alu->mask = expand_writemask(effective, 1); + else + alu->mask = effective; } static unsigned -mir_pack_swizzle(unsigned mask, unsigned *swizzle, - unsigned sz, unsigned base_size, - bool op_channeled, midgard_src_expand_mode *expand_mode) +mir_pack_swizzle(unsigned mask, unsigned *swizzle, unsigned sz, + unsigned base_size, bool op_channeled, + midgard_src_expand_mode *expand_mode) { - unsigned packed = 0; + unsigned packed = 0; - *expand_mode = midgard_src_passthrough; + *expand_mode = midgard_src_passthrough; - midgard_reg_mode reg_mode = reg_mode_for_bitsize(base_size); + midgard_reg_mode reg_mode = reg_mode_for_bitsize(base_size); - if (reg_mode == midgard_reg_mode_64) { - assert(sz == 64 || sz == 32); - unsigned components = (sz == 32) ? 4 : 2; + if (reg_mode == midgard_reg_mode_64) { + assert(sz == 64 || sz == 32); + unsigned components = (sz == 32) ? 4 : 2; - packed = mir_pack_swizzle_64(swizzle, components, - mask & 0xc); + packed = mir_pack_swizzle_64(swizzle, components, mask & 0xc); - if (sz == 32) { - ASSERTED bool dontcare = true; - bool hi = false; + if (sz == 32) { + ASSERTED bool dontcare = true; + bool hi = false; - assert(util_bitcount(mask) <= 2); + assert(util_bitcount(mask) <= 2); - u_foreach_bit(i, mask) { - bool hi_i = swizzle[i] >= COMPONENT_Z; + u_foreach_bit(i, mask) { + bool hi_i = swizzle[i] >= COMPONENT_Z; - /* We can't mix halves */ - assert(dontcare || (hi == hi_i)); - hi = hi_i; - dontcare = false; - } + /* We can't mix halves */ + assert(dontcare || (hi == hi_i)); + hi = hi_i; + dontcare = false; + } - *expand_mode = hi ? midgard_src_expand_high : - midgard_src_expand_low; - } else if (sz < 32) { - unreachable("Cannot encode 8/16 swizzle in 64-bit"); - } - } else { - /* For 32-bit, swizzle packing is stupid-simple. For 16-bit, - * the strategy is to check whether the nibble we're on is - * upper or lower. 
We need all components to be on the same - * "side"; that much is enforced by the ISA and should have - * been lowered. TODO: 8-bit packing. TODO: vec8 */ + *expand_mode = hi ? midgard_src_expand_high : midgard_src_expand_low; + } else if (sz < 32) { + unreachable("Cannot encode 8/16 swizzle in 64-bit"); + } + } else { + /* For 32-bit, swizzle packing is stupid-simple. For 16-bit, + * the strategy is to check whether the nibble we're on is + * upper or lower. We need all components to be on the same + * "side"; that much is enforced by the ISA and should have + * been lowered. TODO: 8-bit packing. TODO: vec8 */ - unsigned first = mask ? ffs(mask) - 1 : 0; - bool upper = swizzle[first] > 3; + unsigned first = mask ? ffs(mask) - 1 : 0; + bool upper = swizzle[first] > 3; - if (upper && mask) - assert(sz <= 16); + if (upper && mask) + assert(sz <= 16); - bool dest_up = !op_channeled && (first >= 4); + bool dest_up = !op_channeled && (first >= 4); - for (unsigned c = (dest_up ? 4 : 0); c < (dest_up ? 8 : 4); ++c) { - unsigned v = swizzle[c]; + for (unsigned c = (dest_up ? 4 : 0); c < (dest_up ? 8 : 4); ++c) { + unsigned v = swizzle[c]; - ASSERTED bool t_upper = v > (sz == 8 ? 7 : 3); + ASSERTED bool t_upper = v > (sz == 8 ? 7 : 3); - /* Ensure we're doing something sane */ + /* Ensure we're doing something sane */ - if (mask & (1 << c)) { - assert(t_upper == upper); - assert(v <= (sz == 8 ? 15 : 7)); - } + if (mask & (1 << c)) { + assert(t_upper == upper); + assert(v <= (sz == 8 ? 15 : 7)); + } - /* Use the non upper part */ - v &= 0x3; + /* Use the non upper part */ + v &= 0x3; - packed |= v << (2 * (c % 4)); - } + packed |= v << (2 * (c % 4)); + } + /* Replicate for now.. should really pick a side for + * dot products */ - /* Replicate for now.. should really pick a side for - * dot products */ + if (reg_mode == midgard_reg_mode_16 && sz == 16) { + *expand_mode = upper ? midgard_src_rep_high : midgard_src_rep_low; + } else if (reg_mode == midgard_reg_mode_16 && sz == 8) { + if (base_size == 16) { + *expand_mode = + upper ? midgard_src_expand_high : midgard_src_expand_low; + } else if (upper) { + *expand_mode = midgard_src_swap; + } + } else if (reg_mode == midgard_reg_mode_32 && sz == 16) { + *expand_mode = + upper ? midgard_src_expand_high : midgard_src_expand_low; + } else if (reg_mode == midgard_reg_mode_8) { + unreachable("Unhandled reg mode"); + } + } - if (reg_mode == midgard_reg_mode_16 && sz == 16) { - *expand_mode = upper ? midgard_src_rep_high : - midgard_src_rep_low; - } else if (reg_mode == midgard_reg_mode_16 && sz == 8) { - if (base_size == 16) { - *expand_mode = upper ? midgard_src_expand_high : - midgard_src_expand_low; - } else if (upper) { - *expand_mode = midgard_src_swap; - } - } else if (reg_mode == midgard_reg_mode_32 && sz == 16) { - *expand_mode = upper ? 
midgard_src_expand_high : - midgard_src_expand_low; - } else if (reg_mode == midgard_reg_mode_8) { - unreachable("Unhandled reg mode"); - } - } - - return packed; + return packed; } static void mir_pack_vector_srcs(midgard_instruction *ins, midgard_vector_alu *alu) { - bool channeled = GET_CHANNEL_COUNT(alu_opcode_props[ins->op].props); + bool channeled = GET_CHANNEL_COUNT(alu_opcode_props[ins->op].props); - unsigned base_size = max_bitsize_for_alu(ins); + unsigned base_size = max_bitsize_for_alu(ins); - for (unsigned i = 0; i < 2; ++i) { - if (ins->has_inline_constant && (i == 1)) - continue; + for (unsigned i = 0; i < 2; ++i) { + if (ins->has_inline_constant && (i == 1)) + continue; - if (ins->src[i] == ~0) - continue; + if (ins->src[i] == ~0) + continue; - unsigned sz = nir_alu_type_get_type_size(ins->src_types[i]); - assert((sz == base_size) || (sz == base_size / 2)); + unsigned sz = nir_alu_type_get_type_size(ins->src_types[i]); + assert((sz == base_size) || (sz == base_size / 2)); - midgard_src_expand_mode expand_mode = midgard_src_passthrough; - unsigned swizzle = mir_pack_swizzle(ins->mask, ins->swizzle[i], - sz, base_size, channeled, - &expand_mode); + midgard_src_expand_mode expand_mode = midgard_src_passthrough; + unsigned swizzle = mir_pack_swizzle(ins->mask, ins->swizzle[i], sz, + base_size, channeled, &expand_mode); - midgard_vector_alu_src pack = { - .mod = mir_pack_mod(ins, i, false), - .expand_mode = expand_mode, - .swizzle = swizzle, - }; + midgard_vector_alu_src pack = { + .mod = mir_pack_mod(ins, i, false), + .expand_mode = expand_mode, + .swizzle = swizzle, + }; - unsigned p = vector_alu_srco_unsigned(pack); - - if (i == 0) - alu->src1 = p; - else - alu->src2 = p; - } + unsigned p = vector_alu_srco_unsigned(pack); + + if (i == 0) + alu->src1 = p; + else + alu->src2 = p; + } } static void mir_pack_swizzle_ldst(midgard_instruction *ins) { - unsigned compsz = OP_IS_STORE(ins->op) ? - nir_alu_type_get_type_size(ins->src_types[0]) : - nir_alu_type_get_type_size(ins->dest_type); - unsigned maxcomps = 128 / compsz; - unsigned step = DIV_ROUND_UP(32, compsz); + unsigned compsz = OP_IS_STORE(ins->op) + ? nir_alu_type_get_type_size(ins->src_types[0]) + : nir_alu_type_get_type_size(ins->dest_type); + unsigned maxcomps = 128 / compsz; + unsigned step = DIV_ROUND_UP(32, compsz); - for (unsigned c = 0; c < maxcomps; c += step) { - unsigned v = ins->swizzle[0][c]; + for (unsigned c = 0; c < maxcomps; c += step) { + unsigned v = ins->swizzle[0][c]; - /* Make sure the component index doesn't exceed the maximum - * number of components. */ - assert(v <= maxcomps); + /* Make sure the component index doesn't exceed the maximum + * number of components. 
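/* Worked example for the arithmetic above, assuming a 16-bit component size:
 * compsz = 16 gives maxcomps = 128 / 16 = 8 and step = DIV_ROUND_UP(32, 16) = 2,
 * so the loop visits c = 0, 2, 4, 6 and packs (v / 2) as 2-bit fields at bit
 * offsets 0, 2, 4 and 6 of load_store.swizzle. Hypothetical sketch of that
 * 16-bit case only: */
static unsigned
ldst_swizzle_packed_16bit(const unsigned swizzle[8])
{
   unsigned packed = 0;

   for (unsigned c = 0; c < 8; c += 2)
      packed |= (swizzle[c] / 2) << (2 * (c / 2));

   return packed;
}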
*/ + assert(v <= maxcomps); - if (compsz <= 32) - ins->load_store.swizzle |= (v / step) << (2 * (c / step)); - else - ins->load_store.swizzle |= ((v / step) << (4 * c)) | - (((v / step) + 1) << ((4 * c) + 2)); - } + if (compsz <= 32) + ins->load_store.swizzle |= (v / step) << (2 * (c / step)); + else + ins->load_store.swizzle |= + ((v / step) << (4 * c)) | (((v / step) + 1) << ((4 * c) + 2)); + } - /* TODO: arg_1/2 */ + /* TODO: arg_1/2 */ } static void mir_pack_swizzle_tex(midgard_instruction *ins) { - for (unsigned i = 0; i < 2; ++i) { - unsigned packed = 0; + for (unsigned i = 0; i < 2; ++i) { + unsigned packed = 0; - for (unsigned c = 0; c < 4; ++c) { - unsigned v = ins->swizzle[i][c]; + for (unsigned c = 0; c < 4; ++c) { + unsigned v = ins->swizzle[i][c]; - /* Check vec4 */ - assert(v <= 3); + /* Check vec4 */ + assert(v <= 3); - packed |= v << (2 * c); - } + packed |= v << (2 * c); + } - if (i == 0) - ins->texture.swizzle = packed; - else - ins->texture.in_reg_swizzle = packed; - } + if (i == 0) + ins->texture.swizzle = packed; + else + ins->texture.in_reg_swizzle = packed; + } - /* TODO: bias component */ + /* TODO: bias component */ } /* @@ -419,43 +414,45 @@ static bool mir_can_run_ooo(midgard_block *block, midgard_bundle *bundle, unsigned dependency) { - /* Don't read out of bounds */ - if (bundle >= (midgard_bundle *) ((char *) block->bundles.data + block->bundles.size)) - return false; + /* Don't read out of bounds */ + if (bundle >= + (midgard_bundle *)((char *)block->bundles.data + block->bundles.size)) + return false; - /* Texture ops can't execute with other texture ops */ - if (!IS_ALU(bundle->tag) && bundle->tag != TAG_LOAD_STORE_4) - return false; + /* Texture ops can't execute with other texture ops */ + if (!IS_ALU(bundle->tag) && bundle->tag != TAG_LOAD_STORE_4) + return false; - for (unsigned i = 0; i < bundle->instruction_count; ++i) { - midgard_instruction *ins = bundle->instructions[i]; + for (unsigned i = 0; i < bundle->instruction_count; ++i) { + midgard_instruction *ins = bundle->instructions[i]; - /* No branches, jumps, or discards */ - if (ins->compact_branch) - return false; + /* No branches, jumps, or discards */ + if (ins->compact_branch) + return false; - /* No read-after-write data dependencies */ - mir_foreach_src(ins, s) { - if (ins->src[s] == dependency) - return false; - } - } + /* No read-after-write data dependencies */ + mir_foreach_src(ins, s) { + if (ins->src[s] == dependency) + return false; + } + } - /* Otherwise, we're okay */ - return true; + /* Otherwise, we're okay */ + return true; } static void -mir_pack_tex_ooo(midgard_block *block, midgard_bundle *bundle, midgard_instruction *ins) +mir_pack_tex_ooo(midgard_block *block, midgard_bundle *bundle, + midgard_instruction *ins) { - unsigned count = 0; + unsigned count = 0; - for (count = 0; count < 15; ++count) { - if (!mir_can_run_ooo(block, bundle + count + 1, ins->dest)) - break; - } + for (count = 0; count < 15; ++count) { + if (!mir_can_run_ooo(block, bundle + count + 1, ins->dest)) + break; + } - ins->texture.out_of_order = count; + ins->texture.out_of_order = count; } /* Load store masks are 4-bits. Load/store ops pack for that. 
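/* Note on the out-of-order packing just above: texture.out_of_order appears to
 * hold how many of the following bundles (at most 15) are ALU or load/store
 * bundles with no branches and no reads of the texture destination, presumably
 * so they can keep issuing while the texture operation is in flight. A
 * while-loop restatement of mir_pack_tex_ooo, for illustration only: */
static unsigned
texture_ooo_window(midgard_block *block, midgard_bundle *bundle, unsigned dest)
{
   unsigned count = 0;

   while (count < 15 && mir_can_run_ooo(block, bundle + count + 1, dest))
      count++;

   return count;
}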
@@ -466,113 +463,110 @@ mir_pack_tex_ooo(midgard_block *block, midgard_bundle *bundle, midgard_instructi */ static unsigned -midgard_pack_common_store_mask(midgard_instruction *ins) { - ASSERTED unsigned comp_sz = nir_alu_type_get_type_size(ins->src_types[0]); - unsigned bytemask = mir_bytemask(ins); - unsigned packed = 0; +midgard_pack_common_store_mask(midgard_instruction *ins) +{ + ASSERTED unsigned comp_sz = nir_alu_type_get_type_size(ins->src_types[0]); + unsigned bytemask = mir_bytemask(ins); + unsigned packed = 0; - switch (ins->op) { - case midgard_op_st_u8: - return mir_bytemask(ins) & 1; - case midgard_op_st_u16: - return mir_bytemask(ins) & 3; - case midgard_op_st_32: - return mir_bytemask(ins); - case midgard_op_st_64: - assert(comp_sz >= 16); - for (unsigned i = 0; i < 4; i++) { - if (bytemask & (3 << (i * 2))) - packed |= 1 << i; - } - return packed; - case midgard_op_st_128: - assert(comp_sz >= 32); - for (unsigned i = 0; i < 4; i++) { - if (bytemask & (0xf << (i * 4))) - packed |= 1 << i; - } - return packed; - default: - unreachable("unexpected ldst opcode"); - } + switch (ins->op) { + case midgard_op_st_u8: + return mir_bytemask(ins) & 1; + case midgard_op_st_u16: + return mir_bytemask(ins) & 3; + case midgard_op_st_32: + return mir_bytemask(ins); + case midgard_op_st_64: + assert(comp_sz >= 16); + for (unsigned i = 0; i < 4; i++) { + if (bytemask & (3 << (i * 2))) + packed |= 1 << i; + } + return packed; + case midgard_op_st_128: + assert(comp_sz >= 32); + for (unsigned i = 0; i < 4; i++) { + if (bytemask & (0xf << (i * 4))) + packed |= 1 << i; + } + return packed; + default: + unreachable("unexpected ldst opcode"); + } } static void mir_pack_ldst_mask(midgard_instruction *ins) { - unsigned sz = nir_alu_type_get_type_size(ins->dest_type); - unsigned packed = ins->mask; + unsigned sz = nir_alu_type_get_type_size(ins->dest_type); + unsigned packed = ins->mask; - if (OP_IS_COMMON_STORE(ins->op)) { - packed = midgard_pack_common_store_mask(ins); - } else { - if (sz == 64) { - packed = ((ins->mask & 0x2) ? (0x8 | 0x4) : 0) | - ((ins->mask & 0x1) ? (0x2 | 0x1) : 0); - } else if (sz < 32) { - unsigned comps_per_32b = 32 / sz; + if (OP_IS_COMMON_STORE(ins->op)) { + packed = midgard_pack_common_store_mask(ins); + } else { + if (sz == 64) { + packed = ((ins->mask & 0x2) ? (0x8 | 0x4) : 0) | + ((ins->mask & 0x1) ? 
(0x2 | 0x1) : 0); + } else if (sz < 32) { + unsigned comps_per_32b = 32 / sz; - packed = 0; + packed = 0; - for (unsigned i = 0; i < 4; ++i) { - unsigned submask = (ins->mask >> (i * comps_per_32b)) & - BITFIELD_MASK(comps_per_32b); + for (unsigned i = 0; i < 4; ++i) { + unsigned submask = (ins->mask >> (i * comps_per_32b)) & + BITFIELD_MASK(comps_per_32b); - /* Make sure we're duplicated */ - assert(submask == 0 || submask == BITFIELD_MASK(comps_per_32b)); - packed |= (submask != 0) << i; - } - } else { - assert(sz == 32); - } - } + /* Make sure we're duplicated */ + assert(submask == 0 || submask == BITFIELD_MASK(comps_per_32b)); + packed |= (submask != 0) << i; + } + } else { + assert(sz == 32); + } + } - ins->load_store.mask = packed; + ins->load_store.mask = packed; } static void mir_lower_inverts(midgard_instruction *ins) { - bool inv[3] = { - ins->src_invert[0], - ins->src_invert[1], - ins->src_invert[2] - }; + bool inv[3] = {ins->src_invert[0], ins->src_invert[1], ins->src_invert[2]}; - switch (ins->op) { - case midgard_alu_op_iand: - /* a & ~b = iandnot(a, b) */ - /* ~a & ~b = ~(a | b) = inor(a, b) */ + switch (ins->op) { + case midgard_alu_op_iand: + /* a & ~b = iandnot(a, b) */ + /* ~a & ~b = ~(a | b) = inor(a, b) */ - if (inv[0] && inv[1]) - ins->op = midgard_alu_op_inor; - else if (inv[1]) - ins->op = midgard_alu_op_iandnot; + if (inv[0] && inv[1]) + ins->op = midgard_alu_op_inor; + else if (inv[1]) + ins->op = midgard_alu_op_iandnot; - break; - case midgard_alu_op_ior: - /* a | ~b = iornot(a, b) */ - /* ~a | ~b = ~(a & b) = inand(a, b) */ + break; + case midgard_alu_op_ior: + /* a | ~b = iornot(a, b) */ + /* ~a | ~b = ~(a & b) = inand(a, b) */ - if (inv[0] && inv[1]) - ins->op = midgard_alu_op_inand; - else if (inv[1]) - ins->op = midgard_alu_op_iornot; + if (inv[0] && inv[1]) + ins->op = midgard_alu_op_inand; + else if (inv[1]) + ins->op = midgard_alu_op_iornot; - break; + break; - case midgard_alu_op_ixor: - /* ~a ^ b = a ^ ~b = ~(a ^ b) = inxor(a, b) */ - /* ~a ^ ~b = a ^ b */ + case midgard_alu_op_ixor: + /* ~a ^ b = a ^ ~b = ~(a ^ b) = inxor(a, b) */ + /* ~a ^ ~b = a ^ b */ - if (inv[0] ^ inv[1]) - ins->op = midgard_alu_op_inxor; + if (inv[0] ^ inv[1]) + ins->op = midgard_alu_op_inxor; - break; + break; - default: - break; - } + default: + break; + } } /* Opcodes with ROUNDS are the base (rte/0) type so we can just add */ @@ -580,329 +574,316 @@ mir_lower_inverts(midgard_instruction *ins) static void mir_lower_roundmode(midgard_instruction *ins) { - if (alu_opcode_props[ins->op].props & MIDGARD_ROUNDS) { - assert(ins->roundmode <= 0x3); - ins->op += ins->roundmode; - } + if (alu_opcode_props[ins->op].props & MIDGARD_ROUNDS) { + assert(ins->roundmode <= 0x3); + ins->op += ins->roundmode; + } } static midgard_load_store_word load_store_from_instr(midgard_instruction *ins) { - midgard_load_store_word ldst = ins->load_store; - ldst.op = ins->op; + midgard_load_store_word ldst = ins->load_store; + ldst.op = ins->op; - if (OP_IS_STORE(ldst.op)) { - ldst.reg = SSA_REG_FROM_FIXED(ins->src[0]) & 1; - } else { - ldst.reg = SSA_REG_FROM_FIXED(ins->dest); - } + if (OP_IS_STORE(ldst.op)) { + ldst.reg = SSA_REG_FROM_FIXED(ins->src[0]) & 1; + } else { + ldst.reg = SSA_REG_FROM_FIXED(ins->dest); + } - /* Atomic opcode swizzles have a special meaning: - * - The first two bits say which component of the implicit register should be used - * - The next two bits say if the implicit register is r26 or r27 */ - if (OP_IS_ATOMIC(ins->op)) { - ldst.swizzle = 0; - ldst.swizzle |= 
ins->swizzle[3][0] & 3; - ldst.swizzle |= (SSA_REG_FROM_FIXED(ins->src[3]) & 1 ? 1 : 0) << 2; - } + /* Atomic opcode swizzles have a special meaning: + * - The first two bits say which component of the implicit register should + * be used + * - The next two bits say if the implicit register is r26 or r27 */ + if (OP_IS_ATOMIC(ins->op)) { + ldst.swizzle = 0; + ldst.swizzle |= ins->swizzle[3][0] & 3; + ldst.swizzle |= (SSA_REG_FROM_FIXED(ins->src[3]) & 1 ? 1 : 0) << 2; + } - if (ins->src[1] != ~0) { - ldst.arg_reg = SSA_REG_FROM_FIXED(ins->src[1]) - REGISTER_LDST_BASE; - unsigned sz = nir_alu_type_get_type_size(ins->src_types[1]); - ldst.arg_comp = midgard_ldst_comp(ldst.arg_reg, ins->swizzle[1][0], sz); - } + if (ins->src[1] != ~0) { + ldst.arg_reg = SSA_REG_FROM_FIXED(ins->src[1]) - REGISTER_LDST_BASE; + unsigned sz = nir_alu_type_get_type_size(ins->src_types[1]); + ldst.arg_comp = midgard_ldst_comp(ldst.arg_reg, ins->swizzle[1][0], sz); + } - if (ins->src[2] != ~0) { - ldst.index_reg = SSA_REG_FROM_FIXED(ins->src[2]) - REGISTER_LDST_BASE; - unsigned sz = nir_alu_type_get_type_size(ins->src_types[2]); - ldst.index_comp = midgard_ldst_comp(ldst.index_reg, ins->swizzle[2][0], sz); - } + if (ins->src[2] != ~0) { + ldst.index_reg = SSA_REG_FROM_FIXED(ins->src[2]) - REGISTER_LDST_BASE; + unsigned sz = nir_alu_type_get_type_size(ins->src_types[2]); + ldst.index_comp = + midgard_ldst_comp(ldst.index_reg, ins->swizzle[2][0], sz); + } - return ldst; + return ldst; } static midgard_texture_word texture_word_from_instr(midgard_instruction *ins) { - midgard_texture_word tex = ins->texture; - tex.op = ins->op; + midgard_texture_word tex = ins->texture; + tex.op = ins->op; - unsigned src1 = ins->src[1] == ~0 ? REGISTER_UNUSED : SSA_REG_FROM_FIXED(ins->src[1]); - tex.in_reg_select = src1 & 1; + unsigned src1 = + ins->src[1] == ~0 ? REGISTER_UNUSED : SSA_REG_FROM_FIXED(ins->src[1]); + tex.in_reg_select = src1 & 1; - unsigned dest = ins->dest == ~0 ? REGISTER_UNUSED : SSA_REG_FROM_FIXED(ins->dest); - tex.out_reg_select = dest & 1; + unsigned dest = + ins->dest == ~0 ? 
REGISTER_UNUSED : SSA_REG_FROM_FIXED(ins->dest); + tex.out_reg_select = dest & 1; - if (ins->src[2] != ~0) { - midgard_tex_register_select sel = { - .select = SSA_REG_FROM_FIXED(ins->src[2]) & 1, - .full = 1, - .component = ins->swizzle[2][0], - }; - uint8_t packed; - memcpy(&packed, &sel, sizeof(packed)); - tex.bias = packed; - } + if (ins->src[2] != ~0) { + midgard_tex_register_select sel = { + .select = SSA_REG_FROM_FIXED(ins->src[2]) & 1, + .full = 1, + .component = ins->swizzle[2][0], + }; + uint8_t packed; + memcpy(&packed, &sel, sizeof(packed)); + tex.bias = packed; + } - if (ins->src[3] != ~0) { - unsigned x = ins->swizzle[3][0]; - unsigned y = x + 1; - unsigned z = x + 2; + if (ins->src[3] != ~0) { + unsigned x = ins->swizzle[3][0]; + unsigned y = x + 1; + unsigned z = x + 2; - /* Check range, TODO: half-registers */ - assert(z < 4); + /* Check range, TODO: half-registers */ + assert(z < 4); - unsigned offset_reg = SSA_REG_FROM_FIXED(ins->src[3]); - tex.offset = - (1) | /* full */ - (offset_reg & 1) << 1 | /* select */ - (0 << 2) | /* upper */ - (x << 3) | /* swizzle */ - (y << 5) | /* swizzle */ - (z << 7); /* swizzle */ - } + unsigned offset_reg = SSA_REG_FROM_FIXED(ins->src[3]); + tex.offset = (1) | /* full */ + (offset_reg & 1) << 1 | /* select */ + (0 << 2) | /* upper */ + (x << 3) | /* swizzle */ + (y << 5) | /* swizzle */ + (z << 7); /* swizzle */ + } - return tex; + return tex; } static midgard_vector_alu vector_alu_from_instr(midgard_instruction *ins) { - midgard_vector_alu alu = { - .op = ins->op, - .outmod = ins->outmod, - .reg_mode = reg_mode_for_bitsize(max_bitsize_for_alu(ins)), - }; + midgard_vector_alu alu = { + .op = ins->op, + .outmod = ins->outmod, + .reg_mode = reg_mode_for_bitsize(max_bitsize_for_alu(ins)), + }; - if (ins->has_inline_constant) { - /* Encode inline 16-bit constant. See disassembler for - * where the algorithm is from */ + if (ins->has_inline_constant) { + /* Encode inline 16-bit constant. See disassembler for + * where the algorithm is from */ - int lower_11 = ins->inline_constant & ((1 << 12) - 1); - uint16_t imm = ((lower_11 >> 8) & 0x7) | - ((lower_11 & 0xFF) << 3); + int lower_11 = ins->inline_constant & ((1 << 12) - 1); + uint16_t imm = ((lower_11 >> 8) & 0x7) | ((lower_11 & 0xFF) << 3); - alu.src2 = imm << 2; - } + alu.src2 = imm << 2; + } - return alu; + return alu; } static midgard_branch_extended -midgard_create_branch_extended( midgard_condition cond, - midgard_jmp_writeout_op op, - unsigned dest_tag, - signed quadword_offset) +midgard_create_branch_extended(midgard_condition cond, + midgard_jmp_writeout_op op, unsigned dest_tag, + signed quadword_offset) { - /* The condition code is actually a LUT describing a function to - * combine multiple condition codes. However, we only support a single - * condition code at the moment, so we just duplicate over a bunch of - * times. */ + /* The condition code is actually a LUT describing a function to + * combine multiple condition codes. However, we only support a single + * condition code at the moment, so we just duplicate over a bunch of + * times. 
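/* Equivalent loop form of the shift chain that follows: the 2-bit condition
 * code is replicated into all eight slots of the 16-bit LUT field.
 * Hypothetical helper, for illustration only. */
static uint16_t
duplicate_condition(midgard_condition cond)
{
   uint16_t lut = 0;

   for (unsigned i = 0; i < 8; ++i)
      lut |= (uint16_t)cond << (2 * i);

   return lut;
}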
*/ - uint16_t duplicated_cond = - (cond << 14) | - (cond << 12) | - (cond << 10) | - (cond << 8) | - (cond << 6) | - (cond << 4) | - (cond << 2) | - (cond << 0); + uint16_t duplicated_cond = (cond << 14) | (cond << 12) | (cond << 10) | + (cond << 8) | (cond << 6) | (cond << 4) | + (cond << 2) | (cond << 0); - midgard_branch_extended branch = { - .op = op, - .dest_tag = dest_tag, - .offset = quadword_offset, - .cond = duplicated_cond, - }; + midgard_branch_extended branch = { + .op = op, + .dest_tag = dest_tag, + .offset = quadword_offset, + .cond = duplicated_cond, + }; - return branch; + return branch; } static void -emit_branch(midgard_instruction *ins, - compiler_context *ctx, - midgard_block *block, - midgard_bundle *bundle, +emit_branch(midgard_instruction *ins, compiler_context *ctx, + midgard_block *block, midgard_bundle *bundle, struct util_dynarray *emission) { - /* Parse some basic branch info */ - bool is_compact = ins->unit == ALU_ENAB_BR_COMPACT; - bool is_conditional = ins->branch.conditional; - bool is_inverted = ins->branch.invert_conditional; - bool is_discard = ins->branch.target_type == TARGET_DISCARD; - bool is_tilebuf_wait = ins->branch.target_type == TARGET_TILEBUF_WAIT; - bool is_special = is_discard || is_tilebuf_wait; - bool is_writeout = ins->writeout; + /* Parse some basic branch info */ + bool is_compact = ins->unit == ALU_ENAB_BR_COMPACT; + bool is_conditional = ins->branch.conditional; + bool is_inverted = ins->branch.invert_conditional; + bool is_discard = ins->branch.target_type == TARGET_DISCARD; + bool is_tilebuf_wait = ins->branch.target_type == TARGET_TILEBUF_WAIT; + bool is_special = is_discard || is_tilebuf_wait; + bool is_writeout = ins->writeout; - /* Determine the block we're jumping to */ - int target_number = ins->branch.target_block; + /* Determine the block we're jumping to */ + int target_number = ins->branch.target_block; - /* Report the destination tag */ - int dest_tag = is_discard ? 0 : - is_tilebuf_wait ? bundle->tag : - midgard_get_first_tag_from_block(ctx, target_number); + /* Report the destination tag */ + int dest_tag = is_discard ? 0 + : is_tilebuf_wait + ? 
bundle->tag + : midgard_get_first_tag_from_block(ctx, target_number); - /* Count up the number of quadwords we're - * jumping over = number of quadwords until - * (br_block_idx, target_number) */ + /* Count up the number of quadwords we're + * jumping over = number of quadwords until + * (br_block_idx, target_number) */ - int quadword_offset = 0; + int quadword_offset = 0; - if (is_discard) { - /* Fixed encoding, not actually an offset */ - quadword_offset = 0x2; - } else if (is_tilebuf_wait) { - quadword_offset = -1; - } else if (target_number > block->base.name) { - /* Jump forward */ + if (is_discard) { + /* Fixed encoding, not actually an offset */ + quadword_offset = 0x2; + } else if (is_tilebuf_wait) { + quadword_offset = -1; + } else if (target_number > block->base.name) { + /* Jump forward */ - for (int idx = block->base.name+1; idx < target_number; ++idx) { - midgard_block *blk = mir_get_block(ctx, idx); - assert(blk); + for (int idx = block->base.name + 1; idx < target_number; ++idx) { + midgard_block *blk = mir_get_block(ctx, idx); + assert(blk); - quadword_offset += blk->quadword_count; - } - } else { - /* Jump backwards */ + quadword_offset += blk->quadword_count; + } + } else { + /* Jump backwards */ - for (int idx = block->base.name; idx >= target_number; --idx) { - midgard_block *blk = mir_get_block(ctx, idx); - assert(blk); + for (int idx = block->base.name; idx >= target_number; --idx) { + midgard_block *blk = mir_get_block(ctx, idx); + assert(blk); - quadword_offset -= blk->quadword_count; - } - } + quadword_offset -= blk->quadword_count; + } + } - /* Unconditional extended branches (far jumps) - * have issues, so we always use a conditional - * branch, setting the condition to always for - * unconditional. For compact unconditional - * branches, cond isn't used so it doesn't - * matter what we pick. */ + /* Unconditional extended branches (far jumps) + * have issues, so we always use a conditional + * branch, setting the condition to always for + * unconditional. For compact unconditional + * branches, cond isn't used so it doesn't + * matter what we pick. */ - midgard_condition cond = - !is_conditional ? midgard_condition_always : - is_inverted ? midgard_condition_false : - midgard_condition_true; + midgard_condition cond = !is_conditional ? midgard_condition_always + : is_inverted ? midgard_condition_false + : midgard_condition_true; - midgard_jmp_writeout_op op = - is_discard ? midgard_jmp_writeout_op_discard : - is_tilebuf_wait ? midgard_jmp_writeout_op_tilebuffer_pending : - is_writeout ? midgard_jmp_writeout_op_writeout : - (is_compact && !is_conditional) ? - midgard_jmp_writeout_op_branch_uncond : - midgard_jmp_writeout_op_branch_cond; + midgard_jmp_writeout_op op = + is_discard ? midgard_jmp_writeout_op_discard + : is_tilebuf_wait ? midgard_jmp_writeout_op_tilebuffer_pending + : is_writeout ? midgard_jmp_writeout_op_writeout + : (is_compact && !is_conditional) ? 
midgard_jmp_writeout_op_branch_uncond + : midgard_jmp_writeout_op_branch_cond; - if (is_compact) { - unsigned size = sizeof(midgard_branch_cond); + if (is_compact) { + unsigned size = sizeof(midgard_branch_cond); - if (is_conditional || is_special) { - midgard_branch_cond branch = { - .op = op, - .dest_tag = dest_tag, - .offset = quadword_offset, - .cond = cond, - }; - memcpy(util_dynarray_grow_bytes(emission, size, 1), &branch, size); - } else { - assert(op == midgard_jmp_writeout_op_branch_uncond); - midgard_branch_uncond branch = { - .op = op, - .dest_tag = dest_tag, - .offset = quadword_offset, - .call_mode = midgard_call_mode_default, - }; - assert(branch.offset == quadword_offset); - memcpy(util_dynarray_grow_bytes(emission, size, 1), &branch, size); - } - } else { /* `ins->compact_branch`, misnomer */ - unsigned size = sizeof(midgard_branch_extended); + if (is_conditional || is_special) { + midgard_branch_cond branch = { + .op = op, + .dest_tag = dest_tag, + .offset = quadword_offset, + .cond = cond, + }; + memcpy(util_dynarray_grow_bytes(emission, size, 1), &branch, size); + } else { + assert(op == midgard_jmp_writeout_op_branch_uncond); + midgard_branch_uncond branch = { + .op = op, + .dest_tag = dest_tag, + .offset = quadword_offset, + .call_mode = midgard_call_mode_default, + }; + assert(branch.offset == quadword_offset); + memcpy(util_dynarray_grow_bytes(emission, size, 1), &branch, size); + } + } else { /* `ins->compact_branch`, misnomer */ + unsigned size = sizeof(midgard_branch_extended); - midgard_branch_extended branch = - midgard_create_branch_extended( - cond, op, - dest_tag, - quadword_offset); + midgard_branch_extended branch = + midgard_create_branch_extended(cond, op, dest_tag, quadword_offset); - memcpy(util_dynarray_grow_bytes(emission, size, 1), &branch, size); - } + memcpy(util_dynarray_grow_bytes(emission, size, 1), &branch, size); + } } static void -emit_alu_bundle(compiler_context *ctx, - midgard_block *block, - midgard_bundle *bundle, - struct util_dynarray *emission, +emit_alu_bundle(compiler_context *ctx, midgard_block *block, + midgard_bundle *bundle, struct util_dynarray *emission, unsigned lookahead) { - /* Emit the control word */ - util_dynarray_append(emission, uint32_t, bundle->control | lookahead); + /* Emit the control word */ + util_dynarray_append(emission, uint32_t, bundle->control | lookahead); - /* Next up, emit register words */ - for (unsigned i = 0; i < bundle->instruction_count; ++i) { - midgard_instruction *ins = bundle->instructions[i]; + /* Next up, emit register words */ + for (unsigned i = 0; i < bundle->instruction_count; ++i) { + midgard_instruction *ins = bundle->instructions[i]; - /* Check if this instruction has registers */ - if (ins->compact_branch) continue; + /* Check if this instruction has registers */ + if (ins->compact_branch) + continue; - unsigned src2_reg = REGISTER_UNUSED; - if (ins->has_inline_constant) - src2_reg = ins->inline_constant >> 11; - else if (ins->src[1] != ~0) - src2_reg = SSA_REG_FROM_FIXED(ins->src[1]); + unsigned src2_reg = REGISTER_UNUSED; + if (ins->has_inline_constant) + src2_reg = ins->inline_constant >> 11; + else if (ins->src[1] != ~0) + src2_reg = SSA_REG_FROM_FIXED(ins->src[1]); - /* Otherwise, just emit the registers */ - uint16_t reg_word = 0; - midgard_reg_info registers = { - .src1_reg = (ins->src[0] == ~0 ? - REGISTER_UNUSED : - SSA_REG_FROM_FIXED(ins->src[0])), - .src2_reg = src2_reg, - .src2_imm = ins->has_inline_constant, - .out_reg = (ins->dest == ~0 ? 
- REGISTER_UNUSED : - SSA_REG_FROM_FIXED(ins->dest)), - }; - memcpy(®_word, ®isters, sizeof(uint16_t)); - util_dynarray_append(emission, uint16_t, reg_word); - } + /* Otherwise, just emit the registers */ + uint16_t reg_word = 0; + midgard_reg_info registers = { + .src1_reg = (ins->src[0] == ~0 ? REGISTER_UNUSED + : SSA_REG_FROM_FIXED(ins->src[0])), + .src2_reg = src2_reg, + .src2_imm = ins->has_inline_constant, + .out_reg = + (ins->dest == ~0 ? REGISTER_UNUSED : SSA_REG_FROM_FIXED(ins->dest)), + }; + memcpy(®_word, ®isters, sizeof(uint16_t)); + util_dynarray_append(emission, uint16_t, reg_word); + } - /* Now, we emit the body itself */ - for (unsigned i = 0; i < bundle->instruction_count; ++i) { - midgard_instruction *ins = bundle->instructions[i]; + /* Now, we emit the body itself */ + for (unsigned i = 0; i < bundle->instruction_count; ++i) { + midgard_instruction *ins = bundle->instructions[i]; - if (!ins->compact_branch) { - mir_lower_inverts(ins); - mir_lower_roundmode(ins); - } + if (!ins->compact_branch) { + mir_lower_inverts(ins); + mir_lower_roundmode(ins); + } - if (midgard_is_branch_unit(ins->unit)) { - emit_branch(ins, ctx, block, bundle, emission); - } else if (ins->unit & UNITS_ANY_VECTOR) { - midgard_vector_alu source = vector_alu_from_instr(ins); - mir_pack_mask_alu(ins, &source); - mir_pack_vector_srcs(ins, &source); - unsigned size = sizeof(source); - memcpy(util_dynarray_grow_bytes(emission, size, 1), &source, size); - } else { - midgard_scalar_alu source = vector_to_scalar_alu(vector_alu_from_instr(ins), ins); - unsigned size = sizeof(source); - memcpy(util_dynarray_grow_bytes(emission, size, 1), &source, size); - } - } + if (midgard_is_branch_unit(ins->unit)) { + emit_branch(ins, ctx, block, bundle, emission); + } else if (ins->unit & UNITS_ANY_VECTOR) { + midgard_vector_alu source = vector_alu_from_instr(ins); + mir_pack_mask_alu(ins, &source); + mir_pack_vector_srcs(ins, &source); + unsigned size = sizeof(source); + memcpy(util_dynarray_grow_bytes(emission, size, 1), &source, size); + } else { + midgard_scalar_alu source = + vector_to_scalar_alu(vector_alu_from_instr(ins), ins); + unsigned size = sizeof(source); + memcpy(util_dynarray_grow_bytes(emission, size, 1), &source, size); + } + } - /* Emit padding (all zero) */ - if (bundle->padding) { - memset(util_dynarray_grow_bytes(emission, bundle->padding, 1), - 0, bundle->padding); - } + /* Emit padding (all zero) */ + if (bundle->padding) { + memset(util_dynarray_grow_bytes(emission, bundle->padding, 1), 0, + bundle->padding); + } - /* Tack on constants */ + /* Tack on constants */ - if (bundle->has_embedded_constants) - util_dynarray_append(emission, midgard_constants, bundle->constants); + if (bundle->has_embedded_constants) + util_dynarray_append(emission, midgard_constants, bundle->constants); } /* Shift applied to the immediate used as an offset. 
Probably this is papering @@ -912,158 +893,153 @@ emit_alu_bundle(compiler_context *ctx, static void mir_ldst_pack_offset(midgard_instruction *ins, int offset) { - /* These opcodes don't support offsets */ - assert(!OP_IS_REG2REG_LDST(ins->op) || - ins->op == midgard_op_lea || - ins->op == midgard_op_lea_image); + /* These opcodes don't support offsets */ + assert(!OP_IS_REG2REG_LDST(ins->op) || ins->op == midgard_op_lea || + ins->op == midgard_op_lea_image); - if (OP_IS_UBO_READ(ins->op)) - ins->load_store.signed_offset |= PACK_LDST_UBO_OFS(offset); - else if (OP_IS_IMAGE(ins->op)) - ins->load_store.signed_offset |= PACK_LDST_ATTRIB_OFS(offset); - else if (OP_IS_SPECIAL(ins->op)) - ins->load_store.signed_offset |= PACK_LDST_SELECTOR_OFS(offset); - else - ins->load_store.signed_offset |= PACK_LDST_MEM_OFS(offset); + if (OP_IS_UBO_READ(ins->op)) + ins->load_store.signed_offset |= PACK_LDST_UBO_OFS(offset); + else if (OP_IS_IMAGE(ins->op)) + ins->load_store.signed_offset |= PACK_LDST_ATTRIB_OFS(offset); + else if (OP_IS_SPECIAL(ins->op)) + ins->load_store.signed_offset |= PACK_LDST_SELECTOR_OFS(offset); + else + ins->load_store.signed_offset |= PACK_LDST_MEM_OFS(offset); } static enum mali_sampler_type -midgard_sampler_type(nir_alu_type t) { - switch (nir_alu_type_get_base_type(t)) - { - case nir_type_float: - return MALI_SAMPLER_FLOAT; - case nir_type_int: - return MALI_SAMPLER_SIGNED; - case nir_type_uint: - return MALI_SAMPLER_UNSIGNED; - default: - unreachable("Unknown sampler type"); - } +midgard_sampler_type(nir_alu_type t) +{ + switch (nir_alu_type_get_base_type(t)) { + case nir_type_float: + return MALI_SAMPLER_FLOAT; + case nir_type_int: + return MALI_SAMPLER_SIGNED; + case nir_type_uint: + return MALI_SAMPLER_UNSIGNED; + default: + unreachable("Unknown sampler type"); + } } /* After everything is scheduled, emit whole bundles at a time */ void -emit_binary_bundle(compiler_context *ctx, - midgard_block *block, - midgard_bundle *bundle, - struct util_dynarray *emission, +emit_binary_bundle(compiler_context *ctx, midgard_block *block, + midgard_bundle *bundle, struct util_dynarray *emission, int next_tag) { - int lookahead = next_tag << 4; + int lookahead = next_tag << 4; - switch (bundle->tag) { - case TAG_ALU_4: - case TAG_ALU_8: - case TAG_ALU_12: - case TAG_ALU_16: - case TAG_ALU_4 + 4: - case TAG_ALU_8 + 4: - case TAG_ALU_12 + 4: - case TAG_ALU_16 + 4: - emit_alu_bundle(ctx, block, bundle, emission, lookahead); - break; + switch (bundle->tag) { + case TAG_ALU_4: + case TAG_ALU_8: + case TAG_ALU_12: + case TAG_ALU_16: + case TAG_ALU_4 + 4: + case TAG_ALU_8 + 4: + case TAG_ALU_12 + 4: + case TAG_ALU_16 + 4: + emit_alu_bundle(ctx, block, bundle, emission, lookahead); + break; - case TAG_LOAD_STORE_4: { - /* One or two composing instructions */ + case TAG_LOAD_STORE_4: { + /* One or two composing instructions */ - uint64_t current64, next64 = LDST_NOP; + uint64_t current64, next64 = LDST_NOP; - /* Copy masks */ + /* Copy masks */ - for (unsigned i = 0; i < bundle->instruction_count; ++i) { - midgard_instruction *ins = bundle->instructions[i]; - mir_pack_ldst_mask(ins); + for (unsigned i = 0; i < bundle->instruction_count; ++i) { + midgard_instruction *ins = bundle->instructions[i]; + mir_pack_ldst_mask(ins); - /* Atomic ops don't use this swizzle the same way as other ops */ - if (!OP_IS_ATOMIC(ins->op)) - mir_pack_swizzle_ldst(ins); + /* Atomic ops don't use this swizzle the same way as other ops */ + if (!OP_IS_ATOMIC(ins->op)) + mir_pack_swizzle_ldst(ins); - /* Apply a constant 
offset */ - unsigned offset = ins->constants.u32[0]; - if (offset) - mir_ldst_pack_offset(ins, offset); - } + /* Apply a constant offset */ + unsigned offset = ins->constants.u32[0]; + if (offset) + mir_ldst_pack_offset(ins, offset); + } - midgard_load_store_word ldst0 = - load_store_from_instr(bundle->instructions[0]); - memcpy(¤t64, &ldst0, sizeof(current64)); + midgard_load_store_word ldst0 = + load_store_from_instr(bundle->instructions[0]); + memcpy(¤t64, &ldst0, sizeof(current64)); - if (bundle->instruction_count == 2) { - midgard_load_store_word ldst1 = - load_store_from_instr(bundle->instructions[1]); - memcpy(&next64, &ldst1, sizeof(next64)); - } + if (bundle->instruction_count == 2) { + midgard_load_store_word ldst1 = + load_store_from_instr(bundle->instructions[1]); + memcpy(&next64, &ldst1, sizeof(next64)); + } - midgard_load_store instruction = { - .type = bundle->tag, - .next_type = next_tag, - .word1 = current64, - .word2 = next64, - }; + midgard_load_store instruction = { + .type = bundle->tag, + .next_type = next_tag, + .word1 = current64, + .word2 = next64, + }; - util_dynarray_append(emission, midgard_load_store, instruction); + util_dynarray_append(emission, midgard_load_store, instruction); - break; - } + break; + } - case TAG_TEXTURE_4: - case TAG_TEXTURE_4_VTX: - case TAG_TEXTURE_4_BARRIER: { - /* Texture instructions are easy, since there is no pipelining - * nor VLIW to worry about. We may need to set .cont/.last - * flags. */ + case TAG_TEXTURE_4: + case TAG_TEXTURE_4_VTX: + case TAG_TEXTURE_4_BARRIER: { + /* Texture instructions are easy, since there is no pipelining + * nor VLIW to worry about. We may need to set .cont/.last + * flags. */ - midgard_instruction *ins = bundle->instructions[0]; + midgard_instruction *ins = bundle->instructions[0]; - ins->texture.type = bundle->tag; - ins->texture.next_type = next_tag; - ins->texture.exec = MIDGARD_PARTIAL_EXECUTION_NONE; /* default */ + ins->texture.type = bundle->tag; + ins->texture.next_type = next_tag; + ins->texture.exec = MIDGARD_PARTIAL_EXECUTION_NONE; /* default */ - /* Nothing else to pack for barriers */ - if (ins->op == midgard_tex_op_barrier) { - ins->texture.op = ins->op; - util_dynarray_append(emission, midgard_texture_word, ins->texture); - return; - } + /* Nothing else to pack for barriers */ + if (ins->op == midgard_tex_op_barrier) { + ins->texture.op = ins->op; + util_dynarray_append(emission, midgard_texture_word, ins->texture); + return; + } - signed override = mir_upper_override(ins, 32); + signed override = mir_upper_override(ins, 32); - ins->texture.mask = override > 0 ? - ins->mask >> override : - ins->mask; + ins->texture.mask = override > 0 ? 
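
A load/store bundle carries at most two instruction words, and the second slot is filled with a NOP when only one instruction was scheduled. A rough standalone illustration of that pairing; the struct layout and the NOP value are invented for the example.

#include <stdint.h>
#include <stdio.h>

#define TOY_LDST_NOP 0x3ull /* placeholder NOP encoding, not the real one */

struct toy_ldst_bundle {
   uint8_t tag;      /* this bundle's tag */
   uint8_t next_tag; /* tag of the following bundle, for the lookahead */
   uint64_t word1;
   uint64_t word2;
};

/* Pack up to two already-encoded words into one bundle, NOP-filling slot 2. */
static struct toy_ldst_bundle
pack_ldst_bundle(uint8_t tag, uint8_t next_tag, const uint64_t *words,
                 unsigned count)
{
   struct toy_ldst_bundle b = {
      .tag = tag,
      .next_tag = next_tag,
      .word1 = words[0],
      .word2 = (count == 2) ? words[1] : TOY_LDST_NOP,
   };
   return b;
}

int
main(void)
{
   uint64_t words[1] = {0xABCDull};
   struct toy_ldst_bundle b = pack_ldst_bundle(5, 8, words, 1);
   printf("word2 is NOP: %d\n", b.word2 == TOY_LDST_NOP);
   return 0;
}
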
ins->mask >> override : ins->mask; - mir_pack_swizzle_tex(ins); + mir_pack_swizzle_tex(ins); - if (!(ctx->quirks & MIDGARD_NO_OOO)) - mir_pack_tex_ooo(block, bundle, ins); + if (!(ctx->quirks & MIDGARD_NO_OOO)) + mir_pack_tex_ooo(block, bundle, ins); - unsigned osz = nir_alu_type_get_type_size(ins->dest_type); - unsigned isz = nir_alu_type_get_type_size(ins->src_types[1]); + unsigned osz = nir_alu_type_get_type_size(ins->dest_type); + unsigned isz = nir_alu_type_get_type_size(ins->src_types[1]); - assert(osz == 32 || osz == 16); - assert(isz == 32 || isz == 16); + assert(osz == 32 || osz == 16); + assert(isz == 32 || isz == 16); - ins->texture.out_full = (osz == 32); - ins->texture.out_upper = override > 0; - ins->texture.in_reg_full = (isz == 32); - ins->texture.sampler_type = midgard_sampler_type(ins->dest_type); - ins->texture.outmod = ins->outmod; + ins->texture.out_full = (osz == 32); + ins->texture.out_upper = override > 0; + ins->texture.in_reg_full = (isz == 32); + ins->texture.sampler_type = midgard_sampler_type(ins->dest_type); + ins->texture.outmod = ins->outmod; - if (mir_op_computes_derivatives(ctx->stage, ins->op)) { - if (ins->helper_terminate) - ins->texture.exec = MIDGARD_PARTIAL_EXECUTION_KILL; - else if (!ins->helper_execute) - ins->texture.exec = MIDGARD_PARTIAL_EXECUTION_SKIP; - } + if (mir_op_computes_derivatives(ctx->stage, ins->op)) { + if (ins->helper_terminate) + ins->texture.exec = MIDGARD_PARTIAL_EXECUTION_KILL; + else if (!ins->helper_execute) + ins->texture.exec = MIDGARD_PARTIAL_EXECUTION_SKIP; + } - midgard_texture_word texture = texture_word_from_instr(ins); - util_dynarray_append(emission, midgard_texture_word, texture); - break; - } + midgard_texture_word texture = texture_word_from_instr(ins); + util_dynarray_append(emission, midgard_texture_word, texture); + break; + } - default: - unreachable("Unknown midgard instruction type\n"); - } + default: + unreachable("Unknown midgard instruction type\n"); + } } diff --git a/src/panfrost/midgard/midgard_errata_lod.c b/src/panfrost/midgard/midgard_errata_lod.c index 395d8c1b388..d4f7eb203af 100644 --- a/src/panfrost/midgard/midgard_errata_lod.c +++ b/src/panfrost/midgard/midgard_errata_lod.c @@ -35,57 +35,55 @@ bool midgard_nir_lod_errata(nir_shader *shader); static bool nir_lod_errata_instr(nir_builder *b, nir_instr *instr, void *data) { - if (instr->type != nir_instr_type_tex) - return false; + if (instr->type != nir_instr_type_tex) + return false; - nir_tex_instr *tex = nir_instr_as_tex(instr); - b->cursor = nir_before_instr(instr); + nir_tex_instr *tex = nir_instr_as_tex(instr); + b->cursor = nir_before_instr(instr); - /* The errata only applies to textureLod ("TEXGRD") */ - if (tex->op != nir_texop_txl) - return false; + /* The errata only applies to textureLod ("TEXGRD") */ + if (tex->op != nir_texop_txl) + return false; - /* Let's grab the sampler parameters */ - nir_intrinsic_instr *l = nir_intrinsic_instr_create(b->shader, - nir_intrinsic_load_sampler_lod_parameters_pan); - l->num_components = 3; - nir_ssa_dest_init(&l->instr, &l->dest, 3, 32, NULL); + /* Let's grab the sampler parameters */ + nir_intrinsic_instr *l = nir_intrinsic_instr_create( + b->shader, nir_intrinsic_load_sampler_lod_parameters_pan); + l->num_components = 3; + nir_ssa_dest_init(&l->instr, &l->dest, 3, 32, NULL); - /* TODO: Indirect samplers, separate sampler objects XXX */ - nir_src idx = nir_src_for_ssa(nir_imm_int(b, tex->texture_index)); - nir_src_copy(&l->src[0], &idx, &l->instr); + /* TODO: Indirect samplers, separate sampler 
objects XXX */ + nir_src idx = nir_src_for_ssa(nir_imm_int(b, tex->texture_index)); + nir_src_copy(&l->src[0], &idx, &l->instr); - nir_builder_instr_insert(b, &l->instr); - nir_ssa_def *params = &l->dest.ssa; + nir_builder_instr_insert(b, &l->instr); + nir_ssa_def *params = &l->dest.ssa; - /* Extract the individual components */ - nir_ssa_def *min_lod = nir_channel(b, params, 0); - nir_ssa_def *max_lod = nir_channel(b, params, 1); - nir_ssa_def *lod_bias = nir_channel(b, params, 2); + /* Extract the individual components */ + nir_ssa_def *min_lod = nir_channel(b, params, 0); + nir_ssa_def *max_lod = nir_channel(b, params, 1); + nir_ssa_def *lod_bias = nir_channel(b, params, 2); - /* Rewrite the LOD with bias/clamps. Order sensitive. */ - for (unsigned i = 0; i < tex->num_srcs; i++) { - if (tex->src[i].src_type != nir_tex_src_lod) - continue; + /* Rewrite the LOD with bias/clamps. Order sensitive. */ + for (unsigned i = 0; i < tex->num_srcs; i++) { + if (tex->src[i].src_type != nir_tex_src_lod) + continue; - nir_ssa_def *lod = nir_ssa_for_src(b, tex->src[i].src, 1); + nir_ssa_def *lod = nir_ssa_for_src(b, tex->src[i].src, 1); - nir_ssa_def *biased = nir_fadd(b, lod, lod_bias); - nir_ssa_def *clamped = nir_fmin(b, - nir_fmax(b, biased, min_lod), max_lod); + nir_ssa_def *biased = nir_fadd(b, lod, lod_bias); + nir_ssa_def *clamped = nir_fmin(b, nir_fmax(b, biased, min_lod), max_lod); - nir_instr_rewrite_src(&tex->instr, &tex->src[i].src, - nir_src_for_ssa(clamped)); - } + nir_instr_rewrite_src(&tex->instr, &tex->src[i].src, + nir_src_for_ssa(clamped)); + } - return true; + return true; } bool midgard_nir_lod_errata(nir_shader *shader) { - return nir_shader_instructions_pass(shader, - nir_lod_errata_instr, - nir_metadata_block_index | nir_metadata_dominance, - NULL); + return nir_shader_instructions_pass( + shader, nir_lod_errata_instr, + nir_metadata_block_index | nir_metadata_dominance, NULL); } diff --git a/src/panfrost/midgard/midgard_helper_invocations.c b/src/panfrost/midgard/midgard_helper_invocations.c index 407de6676b9..0321c2b4ba1 100644 --- a/src/panfrost/midgard/midgard_helper_invocations.c +++ b/src/panfrost/midgard/midgard_helper_invocations.c @@ -66,182 +66,188 @@ static bool mir_block_uses_helpers(gl_shader_stage stage, midgard_block *block) { - mir_foreach_instr_in_block(block, ins) { - if (ins->type != TAG_TEXTURE_4) continue; - if (mir_op_computes_derivatives(stage, ins->op)) - return true; - } + mir_foreach_instr_in_block(block, ins) { + if (ins->type != TAG_TEXTURE_4) + continue; + if (mir_op_computes_derivatives(stage, ins->op)) + return true; + } - return false; + return false; } static bool mir_block_terminates_helpers(midgard_block *block) { - /* Can't terminate if there are no helpers */ - if (!block->helpers_in) - return false; + /* Can't terminate if there are no helpers */ + if (!block->helpers_in) + return false; - /* Can't terminate if a successor needs helpers */ - pan_foreach_successor((&block->base), succ) { - if (((midgard_block *) succ)->helpers_in) - return false; - } + /* Can't terminate if a successor needs helpers */ + pan_foreach_successor((&block->base), succ) { + if (((midgard_block *)succ)->helpers_in) + return false; + } - /* Otherwise we terminate */ - return true; + /* Otherwise we terminate */ + return true; } void mir_analyze_helper_terminate(compiler_context *ctx) { - /* Set blocks as directly requiring helpers, and if they do add them to - * the worklist to propagate to their predecessors */ + /* Set blocks as directly requiring helpers, and if 
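
The workaround above rewrites each explicit LOD as clamp(lod + bias, min_lod, max_lod) using the sampler parameters it just loaded. The same arithmetic in plain C, assuming the three parameters are already available as floats:

#include <math.h>
#include <stdio.h>

/* Mirror the fadd/fmax/fmin sequence emitted by the pass. */
static float
apply_lod_bias_clamp(float lod, float min_lod, float max_lod, float lod_bias)
{
   float biased = lod + lod_bias;
   return fminf(fmaxf(biased, min_lod), max_lod);
}

int
main(void)
{
   /* An LOD of 7.5 with a +1.0 bias clamped to [0, 8] ends up at 8. */
   printf("%.1f\n", apply_lod_bias_clamp(7.5f, 0.0f, 8.0f, 1.0f));
   return 0;
}
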
they do add them to + * the worklist to propagate to their predecessors */ - struct set *worklist = _mesa_set_create(NULL, - _mesa_hash_pointer, - _mesa_key_pointer_equal); + struct set *worklist = + _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); - struct set *visited = _mesa_set_create(NULL, - _mesa_hash_pointer, - _mesa_key_pointer_equal); + struct set *visited = + _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); - mir_foreach_block(ctx, _block) { - midgard_block *block = (midgard_block *) _block; - block->helpers_in |= mir_block_uses_helpers(ctx->stage, block); + mir_foreach_block(ctx, _block) { + midgard_block *block = (midgard_block *)_block; + block->helpers_in |= mir_block_uses_helpers(ctx->stage, block); - if (block->helpers_in) - _mesa_set_add(worklist, _block); - } + if (block->helpers_in) + _mesa_set_add(worklist, _block); + } - /* Next, propagate back. Since there are a finite number of blocks, the - * worklist (a subset of all the blocks) is finite. Since a block can - * only be added to the worklist if it is not on the visited list and - * the visited list - also a subset of the blocks - grows every - * iteration, the algorithm must terminate. */ + /* Next, propagate back. Since there are a finite number of blocks, the + * worklist (a subset of all the blocks) is finite. Since a block can + * only be added to the worklist if it is not on the visited list and + * the visited list - also a subset of the blocks - grows every + * iteration, the algorithm must terminate. */ - struct set_entry *cur; + struct set_entry *cur; - while((cur = _mesa_set_next_entry(worklist, NULL)) != NULL) { - /* Pop off a block requiring helpers */ - pan_block *blk = (struct pan_block *) cur->key; - _mesa_set_remove(worklist, cur); + while ((cur = _mesa_set_next_entry(worklist, NULL)) != NULL) { + /* Pop off a block requiring helpers */ + pan_block *blk = (struct pan_block *)cur->key; + _mesa_set_remove(worklist, cur); - /* Its predecessors also require helpers */ - pan_foreach_predecessor(blk, pred) { - if (!_mesa_set_search(visited, pred)) { - ((midgard_block *) pred)->helpers_in = true; - _mesa_set_add(worklist, pred); - } - } - - _mesa_set_add(visited, blk); - } + /* Its predecessors also require helpers */ + pan_foreach_predecessor(blk, pred) { + if (!_mesa_set_search(visited, pred)) { + ((midgard_block *)pred)->helpers_in = true; + _mesa_set_add(worklist, pred); + } + } - _mesa_set_destroy(visited, NULL); - _mesa_set_destroy(worklist, NULL); + _mesa_set_add(visited, blk); + } - /* Finally, set helper_terminate on the last derivative-calculating - * instruction in a block that terminates helpers */ - mir_foreach_block(ctx, _block) { - midgard_block *block = (midgard_block *) _block; + _mesa_set_destroy(visited, NULL); + _mesa_set_destroy(worklist, NULL); - if (!mir_block_terminates_helpers(block)) - continue; + /* Finally, set helper_terminate on the last derivative-calculating + * instruction in a block that terminates helpers */ + mir_foreach_block(ctx, _block) { + midgard_block *block = (midgard_block *)_block; - mir_foreach_instr_in_block_rev(block, ins) { - if (ins->type != TAG_TEXTURE_4) continue; - if (!mir_op_computes_derivatives(ctx->stage, ins->op)) continue; + if (!mir_block_terminates_helpers(block)) + continue; - ins->helper_terminate = true; - break; - } - } + mir_foreach_instr_in_block_rev(block, ins) { + if (ins->type != TAG_TEXTURE_4) + continue; + if (!mir_op_computes_derivatives(ctx->stage, ins->op)) + continue; + + ins->helper_terminate 
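
The analysis above is a standard backward worklist pass: seed the blocks that directly need helper invocations, then repeatedly pop a block and flag its predecessors, with a visited check guaranteeing termination. A small self-contained version over an array-based CFG; the toy_cfg type is illustrative, not the pan_block/_mesa_set machinery.

#include <stdbool.h>
#include <stdio.h>

#define MAX_BLOCKS 8

struct toy_cfg {
   unsigned num_blocks;
   /* preds[b][i] is the i-th predecessor of block b */
   unsigned num_preds[MAX_BLOCKS];
   unsigned preds[MAX_BLOCKS][MAX_BLOCKS];
};

/* Propagate "needs helpers" from each seeded block to all of its transitive
 * predecessors. A block is pushed only when its flag flips from false to
 * true, which can happen at most once per block, so the loop terminates. */
static void
propagate_helpers(const struct toy_cfg *cfg, bool *helpers_in)
{
   bool visited[MAX_BLOCKS] = {false};
   unsigned worklist[MAX_BLOCKS];
   unsigned count = 0;

   for (unsigned b = 0; b < cfg->num_blocks; ++b)
      if (helpers_in[b])
         worklist[count++] = b;

   while (count) {
      unsigned b = worklist[--count];
      visited[b] = true;

      for (unsigned i = 0; i < cfg->num_preds[b]; ++i) {
         unsigned p = cfg->preds[b][i];
         if (!visited[p] && !helpers_in[p]) {
            helpers_in[p] = true;
            worklist[count++] = p;
         }
      }
   }
}

int
main(void)
{
   /* Blocks 0 -> 1 -> 2 form a chain; block 2 needs helpers. */
   struct toy_cfg cfg = {
      .num_blocks = 3,
      .num_preds = {0, 1, 1},
      .preds = {{0}, {0}, {1}},
   };
   bool helpers_in[MAX_BLOCKS] = {false, false, true};

   propagate_helpers(&cfg, helpers_in);
   printf("%d %d %d\n", helpers_in[0], helpers_in[1], helpers_in[2]);
   return 0;
}
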
= true; + break; + } + } } static bool -mir_helper_block_update(BITSET_WORD *deps, pan_block *_block, unsigned temp_count) +mir_helper_block_update(BITSET_WORD *deps, pan_block *_block, + unsigned temp_count) { - bool progress = false; - midgard_block *block = (midgard_block *) _block; + bool progress = false; + midgard_block *block = (midgard_block *)_block; - mir_foreach_instr_in_block_rev(block, ins) { - /* Ensure we write to a helper dependency */ - if (ins->dest >= temp_count || !BITSET_TEST(deps, ins->dest)) - continue; + mir_foreach_instr_in_block_rev(block, ins) { + /* Ensure we write to a helper dependency */ + if (ins->dest >= temp_count || !BITSET_TEST(deps, ins->dest)) + continue; - /* Then add all of our dependencies */ - mir_foreach_src(ins, s) { - if (ins->src[s] >= temp_count) - continue; + /* Then add all of our dependencies */ + mir_foreach_src(ins, s) { + if (ins->src[s] >= temp_count) + continue; - /* Progress if the dependency set changes */ - progress |= !BITSET_TEST(deps, ins->src[s]); - BITSET_SET(deps, ins->src[s]); - } - } + /* Progress if the dependency set changes */ + progress |= !BITSET_TEST(deps, ins->src[s]); + BITSET_SET(deps, ins->src[s]); + } + } - return progress; + return progress; } void mir_analyze_helper_requirements(compiler_context *ctx) { - mir_compute_temp_count(ctx); - unsigned temp_count = ctx->temp_count; - BITSET_WORD *deps = calloc(sizeof(BITSET_WORD), BITSET_WORDS(temp_count)); + mir_compute_temp_count(ctx); + unsigned temp_count = ctx->temp_count; + BITSET_WORD *deps = calloc(sizeof(BITSET_WORD), BITSET_WORDS(temp_count)); - /* Initialize with the sources of instructions consuming - * derivatives */ + /* Initialize with the sources of instructions consuming + * derivatives */ - mir_foreach_instr_global(ctx, ins) { - if (ins->type != TAG_TEXTURE_4) continue; - if (ins->dest >= ctx->temp_count) continue; - if (!mir_op_computes_derivatives(ctx->stage, ins->op)) continue; + mir_foreach_instr_global(ctx, ins) { + if (ins->type != TAG_TEXTURE_4) + continue; + if (ins->dest >= ctx->temp_count) + continue; + if (!mir_op_computes_derivatives(ctx->stage, ins->op)) + continue; - mir_foreach_src(ins, s) { - if (ins->src[s] < temp_count) - BITSET_SET(deps, ins->src[s]); - } - } + mir_foreach_src(ins, s) { + if (ins->src[s] < temp_count) + BITSET_SET(deps, ins->src[s]); + } + } - /* Propagate that up */ + /* Propagate that up */ - struct set *work_list = _mesa_set_create(NULL, - _mesa_hash_pointer, - _mesa_key_pointer_equal); + struct set *work_list = + _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); - struct set *visited = _mesa_set_create(NULL, - _mesa_hash_pointer, - _mesa_key_pointer_equal); + struct set *visited = + _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); - struct set_entry *cur = _mesa_set_add(work_list, pan_exit_block(&ctx->blocks)); + struct set_entry *cur = + _mesa_set_add(work_list, pan_exit_block(&ctx->blocks)); - do { - pan_block *blk = (struct pan_block *) cur->key; - _mesa_set_remove(work_list, cur); + do { + pan_block *blk = (struct pan_block *)cur->key; + _mesa_set_remove(work_list, cur); - bool progress = mir_helper_block_update(deps, blk, temp_count); + bool progress = mir_helper_block_update(deps, blk, temp_count); - if (progress || !_mesa_set_search(visited, blk)) { - pan_foreach_predecessor(blk, pred) - _mesa_set_add(work_list, pred); - } + if (progress || !_mesa_set_search(visited, blk)) { + pan_foreach_predecessor(blk, pred) + _mesa_set_add(work_list, pred); + } - 
_mesa_set_add(visited, blk); - } while((cur = _mesa_set_next_entry(work_list, NULL)) != NULL); + _mesa_set_add(visited, blk); + } while ((cur = _mesa_set_next_entry(work_list, NULL)) != NULL); - _mesa_set_destroy(visited, NULL); - _mesa_set_destroy(work_list, NULL); + _mesa_set_destroy(visited, NULL); + _mesa_set_destroy(work_list, NULL); - /* Set the execute bits */ + /* Set the execute bits */ - mir_foreach_instr_global(ctx, ins) { - if (ins->type != TAG_TEXTURE_4) continue; - if (ins->dest >= ctx->temp_count) continue; + mir_foreach_instr_global(ctx, ins) { + if (ins->type != TAG_TEXTURE_4) + continue; + if (ins->dest >= ctx->temp_count) + continue; - ins->helper_execute = BITSET_TEST(deps, ins->dest); - } + ins->helper_execute = BITSET_TEST(deps, ins->dest); + } - free(deps); + free(deps); } diff --git a/src/panfrost/midgard/midgard_liveness.c b/src/panfrost/midgard/midgard_liveness.c index 77103c9ea01..984c95f1bcd 100644 --- a/src/panfrost/midgard/midgard_liveness.c +++ b/src/panfrost/midgard/midgard_liveness.c @@ -27,36 +27,37 @@ void mir_liveness_ins_update(uint16_t *live, midgard_instruction *ins, unsigned max) { - /* live_in[s] = GEN[s] + (live_out[s] - KILL[s]) */ + /* live_in[s] = GEN[s] + (live_out[s] - KILL[s]) */ - pan_liveness_kill(live, ins->dest, max, mir_bytemask(ins)); + pan_liveness_kill(live, ins->dest, max, mir_bytemask(ins)); - mir_foreach_src(ins, src) { - unsigned node = ins->src[src]; - unsigned bytemask = mir_bytemask_of_read_components(ins, node); + mir_foreach_src(ins, src) { + unsigned node = ins->src[src]; + unsigned bytemask = mir_bytemask_of_read_components(ins, node); - pan_liveness_gen(live, node, max, bytemask); - } + pan_liveness_gen(live, node, max, bytemask); + } } static void mir_liveness_ins_update_wrap(uint16_t *live, void *ins, unsigned max) { - mir_liveness_ins_update(live, (midgard_instruction *) ins, max); + mir_liveness_ins_update(live, (midgard_instruction *)ins, max); } void mir_compute_liveness(compiler_context *ctx) { - /* If we already have fresh liveness, nothing to do */ - if (ctx->metadata & MIDGARD_METADATA_LIVENESS) - return; + /* If we already have fresh liveness, nothing to do */ + if (ctx->metadata & MIDGARD_METADATA_LIVENESS) + return; - mir_compute_temp_count(ctx); - pan_compute_liveness(&ctx->blocks, ctx->temp_count, mir_liveness_ins_update_wrap); + mir_compute_temp_count(ctx); + pan_compute_liveness(&ctx->blocks, ctx->temp_count, + mir_liveness_ins_update_wrap); - /* Liveness is now valid */ - ctx->metadata |= MIDGARD_METADATA_LIVENESS; + /* Liveness is now valid */ + ctx->metadata |= MIDGARD_METADATA_LIVENESS; } /* Once liveness data is no longer valid, call this */ @@ -64,32 +65,33 @@ mir_compute_liveness(compiler_context *ctx) void mir_invalidate_liveness(compiler_context *ctx) { - /* If we didn't already compute liveness, there's nothing to do */ - if (!(ctx->metadata & MIDGARD_METADATA_LIVENESS)) - return; + /* If we didn't already compute liveness, there's nothing to do */ + if (!(ctx->metadata & MIDGARD_METADATA_LIVENESS)) + return; - pan_free_liveness(&ctx->blocks); + pan_free_liveness(&ctx->blocks); - /* It's now invalid regardless */ - ctx->metadata &= ~MIDGARD_METADATA_LIVENESS; + /* It's now invalid regardless */ + ctx->metadata &= ~MIDGARD_METADATA_LIVENESS; } bool -mir_is_live_after(compiler_context *ctx, midgard_block *block, midgard_instruction *start, int src) +mir_is_live_after(compiler_context *ctx, midgard_block *block, + midgard_instruction *start, int src) { - mir_compute_liveness(ctx); + 
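
Per instruction, liveness follows the transfer function quoted above, live_in = GEN + (live_out - KILL): the destination's written bytes are killed first, then every source's read bytes are marked live. With one uint16_t byte mask per temporary that is two bit operations; a minimal sketch with toy helper names rather than the pan_liveness_* functions:

#include <stdint.h>
#include <stdio.h>

/* live[t] holds a byte mask of temporary t's live bytes. */

static void
liveness_kill(uint16_t *live, unsigned dest, uint16_t write_bytemask)
{
   live[dest] &= ~write_bytemask; /* KILL: bytes written here die */
}

static void
liveness_gen(uint16_t *live, unsigned src, uint16_t read_bytemask)
{
   live[src] |= read_bytemask;    /* GEN: bytes read here become live */
}

int
main(void)
{
   uint16_t live[2] = {0};

   /* Walking backwards over "t0 = add t1, t1", where t0's low 4 bytes are
    * written and t1's low 8 bytes are read: */
   liveness_kill(live, 0, 0x000F);
   liveness_gen(live, 1, 0x00FF);

   printf("t0: %#x, t1: %#x\n", live[0], live[1]);
   return 0;
}
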
mir_compute_liveness(ctx); - /* Check whether we're live in the successors */ + /* Check whether we're live in the successors */ - if (pan_liveness_get(block->base.live_out, src, ctx->temp_count)) - return true; + if (pan_liveness_get(block->base.live_out, src, ctx->temp_count)) + return true; - /* Check the rest of the block for liveness */ + /* Check the rest of the block for liveness */ - mir_foreach_instr_in_block_from(block, ins, mir_next_op(start)) { - if (mir_has_arg(ins, src)) - return true; - } + mir_foreach_instr_in_block_from(block, ins, mir_next_op(start)) { + if (mir_has_arg(ins, src)) + return true; + } - return false; + return false; } diff --git a/src/panfrost/midgard/midgard_nir_lower_helper_writes.c b/src/panfrost/midgard/midgard_nir_lower_helper_writes.c index 51c4b7db5b6..de63a79b954 100644 --- a/src/panfrost/midgard/midgard_nir_lower_helper_writes.c +++ b/src/panfrost/midgard/midgard_nir_lower_helper_writes.c @@ -29,65 +29,63 @@ static bool nir_lower_helper_writes(nir_builder *b, nir_instr *instr, UNUSED void *data) { - if (instr->type != nir_instr_type_intrinsic) - return false; + if (instr->type != nir_instr_type_intrinsic) + return false; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - switch (intr->intrinsic) { - case nir_intrinsic_global_atomic_add: - case nir_intrinsic_global_atomic_and: - case nir_intrinsic_global_atomic_comp_swap: - case nir_intrinsic_global_atomic_exchange: - case nir_intrinsic_global_atomic_fadd: - case nir_intrinsic_global_atomic_fcomp_swap: - case nir_intrinsic_global_atomic_fmax: - case nir_intrinsic_global_atomic_fmin: - case nir_intrinsic_global_atomic_imax: - case nir_intrinsic_global_atomic_imin: - case nir_intrinsic_global_atomic_or: - case nir_intrinsic_global_atomic_umax: - case nir_intrinsic_global_atomic_umin: - case nir_intrinsic_global_atomic_xor: - case nir_intrinsic_image_atomic_add: - case nir_intrinsic_image_atomic_and: - case nir_intrinsic_image_atomic_comp_swap: - case nir_intrinsic_image_atomic_dec_wrap: - case nir_intrinsic_image_atomic_exchange: - case nir_intrinsic_image_atomic_fadd: - case nir_intrinsic_image_atomic_imax: - case nir_intrinsic_image_atomic_imin: - case nir_intrinsic_image_atomic_inc_wrap: - case nir_intrinsic_image_atomic_or: - case nir_intrinsic_image_atomic_umax: - case nir_intrinsic_image_atomic_umin: - case nir_intrinsic_image_atomic_xor: - case nir_intrinsic_image_store: - case nir_intrinsic_store_global: - break; - default: - return false; - } + switch (intr->intrinsic) { + case nir_intrinsic_global_atomic_add: + case nir_intrinsic_global_atomic_and: + case nir_intrinsic_global_atomic_comp_swap: + case nir_intrinsic_global_atomic_exchange: + case nir_intrinsic_global_atomic_fadd: + case nir_intrinsic_global_atomic_fcomp_swap: + case nir_intrinsic_global_atomic_fmax: + case nir_intrinsic_global_atomic_fmin: + case nir_intrinsic_global_atomic_imax: + case nir_intrinsic_global_atomic_imin: + case nir_intrinsic_global_atomic_or: + case nir_intrinsic_global_atomic_umax: + case nir_intrinsic_global_atomic_umin: + case nir_intrinsic_global_atomic_xor: + case nir_intrinsic_image_atomic_add: + case nir_intrinsic_image_atomic_and: + case nir_intrinsic_image_atomic_comp_swap: + case nir_intrinsic_image_atomic_dec_wrap: + case nir_intrinsic_image_atomic_exchange: + case nir_intrinsic_image_atomic_fadd: + case nir_intrinsic_image_atomic_imax: + case nir_intrinsic_image_atomic_imin: + case nir_intrinsic_image_atomic_inc_wrap: + case 
nir_intrinsic_image_atomic_or: + case nir_intrinsic_image_atomic_umax: + case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_image_atomic_xor: + case nir_intrinsic_image_store: + case nir_intrinsic_store_global: + break; + default: + return false; + } - b->cursor = nir_before_instr(instr); + b->cursor = nir_before_instr(instr); - nir_ssa_def *helper = nir_load_helper_invocation(b, 1); - nir_push_if(b, nir_inot(b, helper)); - nir_instr_remove(instr); - nir_builder_instr_insert(b, instr); - nir_pop_if(b, NULL); + nir_ssa_def *helper = nir_load_helper_invocation(b, 1); + nir_push_if(b, nir_inot(b, helper)); + nir_instr_remove(instr); + nir_builder_instr_insert(b, instr); + nir_pop_if(b, NULL); - return true; + return true; } bool midgard_nir_lower_helper_writes(nir_shader *shader) { - if (shader->info.stage != MESA_SHADER_FRAGMENT) - return false; + if (shader->info.stage != MESA_SHADER_FRAGMENT) + return false; - return nir_shader_instructions_pass(shader, - nir_lower_helper_writes, - nir_metadata_none, - NULL); + return nir_shader_instructions_pass(shader, nir_lower_helper_writes, + nir_metadata_none, NULL); } diff --git a/src/panfrost/midgard/midgard_nir_lower_image_bitsize.c b/src/panfrost/midgard/midgard_nir_lower_image_bitsize.c index f82f6e2ae35..69c18b9be0c 100644 --- a/src/panfrost/midgard/midgard_nir_lower_image_bitsize.c +++ b/src/panfrost/midgard/midgard_nir_lower_image_bitsize.c @@ -31,50 +31,48 @@ static bool nir_lower_image_bitsize(nir_builder *b, nir_instr *instr, UNUSED void *data) { - if (instr->type != nir_instr_type_intrinsic) - return false; + if (instr->type != nir_instr_type_intrinsic) + return false; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - switch (intr->intrinsic) { - case nir_intrinsic_image_load: - case nir_intrinsic_image_store: - case nir_intrinsic_image_atomic_add: - case nir_intrinsic_image_atomic_and: - case nir_intrinsic_image_atomic_comp_swap: - case nir_intrinsic_image_atomic_exchange: - case nir_intrinsic_image_atomic_imax: - case nir_intrinsic_image_atomic_imin: - case nir_intrinsic_image_atomic_or: - case nir_intrinsic_image_atomic_umax: - case nir_intrinsic_image_atomic_umin: - case nir_intrinsic_image_atomic_xor: - break; - default: - return false; - } + switch (intr->intrinsic) { + case nir_intrinsic_image_load: + case nir_intrinsic_image_store: + case nir_intrinsic_image_atomic_add: + case nir_intrinsic_image_atomic_and: + case nir_intrinsic_image_atomic_comp_swap: + case nir_intrinsic_image_atomic_exchange: + case nir_intrinsic_image_atomic_imax: + case nir_intrinsic_image_atomic_imin: + case nir_intrinsic_image_atomic_or: + case nir_intrinsic_image_atomic_umax: + case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_image_atomic_xor: + break; + default: + return false; + } - if (nir_src_bit_size(intr->src[1]) == 16) - return false; + if (nir_src_bit_size(intr->src[1]) == 16) + return false; - b->cursor = nir_before_instr(instr); + b->cursor = nir_before_instr(instr); - nir_ssa_def *coord = - nir_ssa_for_src(b, intr->src[1], - nir_src_num_components(intr->src[1])); + nir_ssa_def *coord = + nir_ssa_for_src(b, intr->src[1], nir_src_num_components(intr->src[1])); - nir_ssa_def *coord16 = nir_u2u16(b, coord); + nir_ssa_def *coord16 = nir_u2u16(b, coord); - nir_instr_rewrite_src(instr, &intr->src[1], nir_src_for_ssa(coord16)); + nir_instr_rewrite_src(instr, &intr->src[1], nir_src_for_ssa(coord16)); - return true; + return true; } bool 
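
The lowering above makes stores and atomics safe to execute in helper invocations by branching on the negated helper flag. The control flow it produces is equivalent to the plain-C guard below; is_helper_invocation and store_global are illustrative stand-ins for the NIR intrinsics, not real API.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for a side-effecting intrinsic such as a global store. */
static void
store_global(uint32_t *addr, uint32_t value)
{
   *addr = value;
}

/* What the lowered shader effectively executes: helper invocations skip the
 * write, so derivative-only lanes never touch memory. */
static void
guarded_store(bool is_helper_invocation, uint32_t *addr, uint32_t value)
{
   if (!is_helper_invocation)
      store_global(addr, value);
}

int
main(void)
{
   uint32_t mem = 0;
   guarded_store(true, &mem, 7);  /* helper lane: no write */
   guarded_store(false, &mem, 7); /* real lane: writes */
   printf("%u\n", mem);
   return 0;
}
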
midgard_nir_lower_image_bitsize(nir_shader *shader) { - return nir_shader_instructions_pass(shader, - nir_lower_image_bitsize, - nir_metadata_block_index | nir_metadata_dominance, - NULL); + return nir_shader_instructions_pass( + shader, nir_lower_image_bitsize, + nir_metadata_block_index | nir_metadata_dominance, NULL); } diff --git a/src/panfrost/midgard/midgard_ops.h b/src/panfrost/midgard/midgard_ops.h index 2a6e2c3bc32..a77e5e5089e 100644 --- a/src/panfrost/midgard/midgard_ops.h +++ b/src/panfrost/midgard/midgard_ops.h @@ -32,9 +32,9 @@ extern struct mir_ldst_op_props load_store_opcode_props[256]; extern struct mir_tex_op_props tex_opcode_props[16]; extern struct mir_tag_props midgard_tag_props[16]; -#define OP_IS_ATOMIC(op) (load_store_opcode_props[op].props & LDST_ATOMIC) +#define OP_IS_ATOMIC(op) (load_store_opcode_props[op].props & LDST_ATOMIC) #define OP_USES_ATTRIB(op) (load_store_opcode_props[op].props & LDST_ATTRIB) -#define OP_IS_STORE(op) (load_store_opcode_props[op].props & LDST_STORE) +#define OP_IS_STORE(op) (load_store_opcode_props[op].props & LDST_STORE) #define OP_HAS_ADDRESS(op) (load_store_opcode_props[op].props & LDST_ADDRESS) /* Is this opcode that of an integer (regardless of signedness)? Instruction @@ -43,38 +43,38 @@ extern struct mir_tag_props midgard_tag_props[16]; static inline bool midgard_is_integer_op(int op) { - return (op >= 0x40 && op <= 0x7E) || (op >= 0xA0 && op <= 0xC1); + return (op >= 0x40 && op <= 0x7E) || (op >= 0xA0 && op <= 0xC1); } static inline bool midgard_is_unsigned_op(int op) { - assert(midgard_is_integer_op(op)); + assert(midgard_is_integer_op(op)); - switch (op) { - case midgard_alu_op_uaddsat: - case midgard_alu_op_usubsat: - case midgard_alu_op_uwmul: - case midgard_alu_op_umin: - case midgard_alu_op_umax: - case midgard_alu_op_uavg: - case midgard_alu_op_uravg: - case midgard_alu_op_ushlsat: - case midgard_alu_op_uabsdiff: - case midgard_alu_op_ult: - case midgard_alu_op_ule: - case midgard_alu_op_uball_lt: - case midgard_alu_op_uball_lte: - case midgard_alu_op_ubany_lt: - case midgard_alu_op_ubany_lte: - case midgard_alu_op_u2f_rte: - case midgard_alu_op_u2f_rtz: - case midgard_alu_op_u2f_rtn: - case midgard_alu_op_u2f_rtp: - return true; - default: - return false; - } + switch (op) { + case midgard_alu_op_uaddsat: + case midgard_alu_op_usubsat: + case midgard_alu_op_uwmul: + case midgard_alu_op_umin: + case midgard_alu_op_umax: + case midgard_alu_op_uavg: + case midgard_alu_op_uravg: + case midgard_alu_op_ushlsat: + case midgard_alu_op_uabsdiff: + case midgard_alu_op_ult: + case midgard_alu_op_ule: + case midgard_alu_op_uball_lt: + case midgard_alu_op_uball_lte: + case midgard_alu_op_ubany_lt: + case midgard_alu_op_ubany_lte: + case midgard_alu_op_u2f_rte: + case midgard_alu_op_u2f_rtz: + case midgard_alu_op_u2f_rtn: + case midgard_alu_op_u2f_rtp: + return true; + default: + return false; + } } /* Does this opcode *write* an integer? 
Same as is_integer_op, unless it's a @@ -83,10 +83,10 @@ midgard_is_unsigned_op(int op) static inline bool midgard_is_integer_out_op(int op) { - bool is_int = midgard_is_integer_op(op); - bool is_conversion = alu_opcode_props[op].props & OP_TYPE_CONVERT; + bool is_int = midgard_is_integer_op(op); + bool is_conversion = alu_opcode_props[op].props & OP_TYPE_CONVERT; - return is_int ^ is_conversion; + return is_int ^ is_conversion; } /* Determines effective writemask, taking quirks and expansion into account */ @@ -94,17 +94,17 @@ midgard_is_integer_out_op(int op) static inline unsigned effective_writemask(midgard_alu_op op, unsigned existing_mask) { - /* Channel count is off-by-one to fit in two-bits (0 channel makes no - * sense) */ + /* Channel count is off-by-one to fit in two-bits (0 channel makes no + * sense) */ - unsigned channel_count = GET_CHANNEL_COUNT(alu_opcode_props[op].props); + unsigned channel_count = GET_CHANNEL_COUNT(alu_opcode_props[op].props); - /* If there is a fixed channel count, construct the appropriate mask */ + /* If there is a fixed channel count, construct the appropriate mask */ - if (channel_count) - return (1 << channel_count) - 1; + if (channel_count) + return (1 << channel_count) - 1; - return existing_mask; + return existing_mask; }; #endif diff --git a/src/panfrost/midgard/midgard_opt_copy_prop.c b/src/panfrost/midgard/midgard_opt_copy_prop.c index 667440aea19..7fa3fa35675 100644 --- a/src/panfrost/midgard/midgard_opt_copy_prop.c +++ b/src/panfrost/midgard/midgard_opt_copy_prop.c @@ -28,70 +28,78 @@ bool midgard_opt_copy_prop(compiler_context *ctx, midgard_block *block) { - bool progress = false; + bool progress = false; - mir_foreach_instr_in_block_safe(block, ins) { - if (ins->type != TAG_ALU_4) continue; - if (!OP_IS_MOVE(ins->op)) continue; - if (ins->is_pack) continue; + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->type != TAG_ALU_4) + continue; + if (!OP_IS_MOVE(ins->op)) + continue; + if (ins->is_pack) + continue; - unsigned from = ins->src[1]; - unsigned to = ins->dest; + unsigned from = ins->src[1]; + unsigned to = ins->dest; - /* We only work on pure SSA */ + /* We only work on pure SSA */ - if (to & PAN_IS_REG) continue; - if (from & PAN_IS_REG) continue; + if (to & PAN_IS_REG) + continue; + if (from & PAN_IS_REG) + continue; - /* Constant propagation is not handled here, either */ - if (ins->has_inline_constant) continue; - if (ins->has_constants) continue; + /* Constant propagation is not handled here, either */ + if (ins->has_inline_constant) + continue; + if (ins->has_constants) + continue; - /* Modifier propagation is not handled here */ - if (mir_nontrivial_mod(ins, 1, false)) continue; - if (mir_nontrivial_outmod(ins)) continue; + /* Modifier propagation is not handled here */ + if (mir_nontrivial_mod(ins, 1, false)) + continue; + if (mir_nontrivial_outmod(ins)) + continue; - /* Shortened arguments (bias for textures, extra load/store - * arguments, etc.) do not get a swizzle, only a start - * component and even that is restricted. Fragment writeout - * doesn't even get that much */ + /* Shortened arguments (bias for textures, extra load/store + * arguments, etc.) do not get a swizzle, only a start + * component and even that is restricted. 
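
effective_writemask() above encodes a simple rule: an op with a fixed channel count always writes exactly the low channels, while every other op keeps the caller's mask. A standalone restatement of that rule, skipping the off-by-one bitfield packing of the real opcode properties:

#include <stdio.h>

/* Toy op properties: channel_count == 0 means "no fixed count". */
struct toy_op_props {
   unsigned channel_count;
};

static unsigned
toy_effective_writemask(struct toy_op_props props, unsigned existing_mask)
{
   if (props.channel_count)
      return (1u << props.channel_count) - 1; /* low N channels only */

   return existing_mask;
}

int
main(void)
{
   struct toy_op_props fixed_one = {.channel_count = 1};   /* always scalar */
   struct toy_op_props unrestricted = {.channel_count = 0}; /* honours mask */

   printf("%#x %#x\n",
          toy_effective_writemask(fixed_one, 0xF),    /* 0x1 */
          toy_effective_writemask(unrestricted, 0x6)); /* 0x6 */
   return 0;
}
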
Fragment writeout + * doesn't even get that much */ - bool skip = false; + bool skip = false; - mir_foreach_instr_global(ctx, q) { - bool is_tex = q->type == TAG_TEXTURE_4; - bool is_ldst = q->type == TAG_LOAD_STORE_4; - bool is_branch = q->compact_branch; + mir_foreach_instr_global(ctx, q) { + bool is_tex = q->type == TAG_TEXTURE_4; + bool is_ldst = q->type == TAG_LOAD_STORE_4; + bool is_branch = q->compact_branch; - if (!(is_tex || is_ldst || is_branch)) continue; + if (!(is_tex || is_ldst || is_branch)) + continue; - /* For textures, we get a real swizzle for the - * coordinate and the content. For stores, we get one. - * For loads, we get none. */ + /* For textures, we get a real swizzle for the + * coordinate and the content. For stores, we get one. + * For loads, we get none. */ - unsigned start = - is_tex ? 2 : - OP_IS_STORE(q->op) ? 1 : 0; + unsigned start = is_tex ? 2 : OP_IS_STORE(q->op) ? 1 : 0; - mir_foreach_src(q, s) { - if ((s >= start) && q->src[s] == to) { - skip = true; - break; - } - } - } + mir_foreach_src(q, s) { + if ((s >= start) && q->src[s] == to) { + skip = true; + break; + } + } + } - if (skip) - continue; + if (skip) + continue; - if (ctx->blend_src1 == to) - ctx->blend_src1 = from; + if (ctx->blend_src1 == to) + ctx->blend_src1 = from; - /* We're clear -- rewrite, composing the swizzle */ - mir_rewrite_index_src_swizzle(ctx, to, from, ins->swizzle[1]); - mir_remove_instruction(ins); - progress |= true; - } + /* We're clear -- rewrite, composing the swizzle */ + mir_rewrite_index_src_swizzle(ctx, to, from, ins->swizzle[1]); + mir_remove_instruction(ins); + progress |= true; + } - return progress; + return progress; } diff --git a/src/panfrost/midgard/midgard_opt_dce.c b/src/panfrost/midgard/midgard_opt_dce.c index f08972f1107..50a1f0a912d 100644 --- a/src/panfrost/midgard/midgard_opt_dce.c +++ b/src/panfrost/midgard/midgard_opt_dce.c @@ -22,8 +22,8 @@ * SOFTWARE. */ -#include "compiler.h" #include "util/u_memory.h" +#include "compiler.h" #include "midgard_ops.h" /* SIMD-aware dead code elimination. 
Perform liveness analysis step-by-step, @@ -33,97 +33,100 @@ static bool can_cull_mask(compiler_context *ctx, midgard_instruction *ins) { - if (ins->dest >= ctx->temp_count) - return false; + if (ins->dest >= ctx->temp_count) + return false; - if (ins->dest == ctx->blend_src1) - return false; + if (ins->dest == ctx->blend_src1) + return false; - if (ins->type == TAG_LOAD_STORE_4) - if (load_store_opcode_props[ins->op].props & LDST_SPECIAL_MASK) - return false; + if (ins->type == TAG_LOAD_STORE_4) + if (load_store_opcode_props[ins->op].props & LDST_SPECIAL_MASK) + return false; - return true; + return true; } static bool can_dce(midgard_instruction *ins) { - if (ins->mask) - return false; + if (ins->mask) + return false; - if (ins->compact_branch) - return false; + if (ins->compact_branch) + return false; - if (ins->type == TAG_LOAD_STORE_4) - if (load_store_opcode_props[ins->op].props & LDST_SIDE_FX) - return false; + if (ins->type == TAG_LOAD_STORE_4) + if (load_store_opcode_props[ins->op].props & LDST_SIDE_FX) + return false; - if (ins->type == TAG_TEXTURE_4) - if (ins->op == midgard_tex_op_barrier) - return false; + if (ins->type == TAG_TEXTURE_4) + if (ins->op == midgard_tex_op_barrier) + return false; - return true; + return true; } static bool -midgard_opt_dead_code_eliminate_block(compiler_context *ctx, midgard_block *block) +midgard_opt_dead_code_eliminate_block(compiler_context *ctx, + midgard_block *block) { - bool progress = false; + bool progress = false; - uint16_t *live = mem_dup(block->base.live_out, ctx->temp_count * sizeof(uint16_t)); + uint16_t *live = + mem_dup(block->base.live_out, ctx->temp_count * sizeof(uint16_t)); - mir_foreach_instr_in_block_rev(block, ins) { - if (can_cull_mask(ctx, ins)) { - unsigned type_size = nir_alu_type_get_type_size(ins->dest_type); - unsigned round_size = type_size; - unsigned oldmask = ins->mask; + mir_foreach_instr_in_block_rev(block, ins) { + if (can_cull_mask(ctx, ins)) { + unsigned type_size = nir_alu_type_get_type_size(ins->dest_type); + unsigned round_size = type_size; + unsigned oldmask = ins->mask; - /* Make sure we're packable */ - if (type_size < 32 && ins->type == TAG_LOAD_STORE_4) - round_size = 32; + /* Make sure we're packable */ + if (type_size < 32 && ins->type == TAG_LOAD_STORE_4) + round_size = 32; - unsigned rounded = mir_round_bytemask_up(live[ins->dest], round_size); - unsigned cmask = mir_from_bytemask(rounded, type_size); + unsigned rounded = mir_round_bytemask_up(live[ins->dest], round_size); + unsigned cmask = mir_from_bytemask(rounded, type_size); - ins->mask &= cmask; - progress |= (ins->mask != oldmask); - } + ins->mask &= cmask; + progress |= (ins->mask != oldmask); + } - mir_liveness_ins_update(live, ins, ctx->temp_count); - } + mir_liveness_ins_update(live, ins, ctx->temp_count); + } - mir_foreach_instr_in_block_safe(block, ins) { - if (can_dce(ins)) { - mir_remove_instruction(ins); - progress = true; - } - } + mir_foreach_instr_in_block_safe(block, ins) { + if (can_dce(ins)) { + mir_remove_instruction(ins); + progress = true; + } + } - free(live); + free(live); - return progress; + return progress; } bool midgard_opt_dead_code_eliminate(compiler_context *ctx) { - /* We track liveness. In fact, it's ok if we assume more things are - * live than they actually are, that just reduces the effectiveness of - * this iterations lightly. And DCE has the effect of strictly reducing - * liveness, so we can run DCE across all blocks while only computing - * liveness at the beginning. */ + /* We track liveness. 
In fact, it's ok if we assume more things are + * live than they actually are, that just reduces the effectiveness of + * this iterations lightly. And DCE has the effect of strictly reducing + * liveness, so we can run DCE across all blocks while only computing + * liveness at the beginning. */ - mir_invalidate_liveness(ctx); - mir_compute_liveness(ctx); + mir_invalidate_liveness(ctx); + mir_compute_liveness(ctx); - bool progress = false; + bool progress = false; - mir_foreach_block(ctx, block) { - progress |= midgard_opt_dead_code_eliminate_block(ctx, (midgard_block *) block); - } + mir_foreach_block(ctx, block) { + progress |= + midgard_opt_dead_code_eliminate_block(ctx, (midgard_block *)block); + } - return progress; + return progress; } /* Removes dead moves, that is, moves with a destination overwritten before @@ -133,36 +136,39 @@ midgard_opt_dead_code_eliminate(compiler_context *ctx) bool midgard_opt_dead_move_eliminate(compiler_context *ctx, midgard_block *block) { - bool progress = false; + bool progress = false; - mir_foreach_instr_in_block_safe(block, ins) { - if (ins->type != TAG_ALU_4) continue; - if (ins->compact_branch) continue; - if (!OP_IS_MOVE(ins->op)) continue; + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->type != TAG_ALU_4) + continue; + if (ins->compact_branch) + continue; + if (!OP_IS_MOVE(ins->op)) + continue; - /* Check if it's overwritten in this block before being read */ - bool overwritten = false; + /* Check if it's overwritten in this block before being read */ + bool overwritten = false; - mir_foreach_instr_in_block_from(block, q, mir_next_op(ins)) { - /* Check if used */ - if (mir_has_arg(q, ins->dest)) - break; + mir_foreach_instr_in_block_from(block, q, mir_next_op(ins)) { + /* Check if used */ + if (mir_has_arg(q, ins->dest)) + break; - /* Check if overwritten */ - if (q->dest == ins->dest) { - /* Special case to vec4; component tracking is - * harder */ + /* Check if overwritten */ + if (q->dest == ins->dest) { + /* Special case to vec4; component tracking is + * harder */ - overwritten = (q->mask == 0xF); - break; - } - } + overwritten = (q->mask == 0xF); + break; + } + } - if (overwritten) { - mir_remove_instruction(ins); - progress = true; - } - } + if (overwritten) { + mir_remove_instruction(ins); + progress = true; + } + } - return progress; + return progress; } diff --git a/src/panfrost/midgard/midgard_opt_perspective.c b/src/panfrost/midgard/midgard_opt_perspective.c index c0f8ba83a1a..1e131992a0e 100644 --- a/src/panfrost/midgard/midgard_opt_perspective.c +++ b/src/panfrost/midgard/midgard_opt_perspective.c @@ -40,160 +40,190 @@ static bool is_swizzle_0(unsigned *swizzle) { - for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) - if (swizzle[c]) - return false; + for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) + if (swizzle[c]) + return false; - return true; + return true; } bool midgard_opt_combine_projection(compiler_context *ctx, midgard_block *block) { - bool progress = false; + bool progress = false; - mir_foreach_instr_in_block_safe(block, ins) { - /* First search for fmul */ - if (ins->type != TAG_ALU_4) continue; - if (ins->op != midgard_alu_op_fmul) continue; + mir_foreach_instr_in_block_safe(block, ins) { + /* First search for fmul */ + if (ins->type != TAG_ALU_4) + continue; + if (ins->op != midgard_alu_op_fmul) + continue; - /* TODO: Flip */ + /* TODO: Flip */ - /* Check the swizzles */ - - if (!mir_is_simple_swizzle(ins->swizzle[0], ins->mask)) continue; - if (!is_swizzle_0(ins->swizzle[1])) continue; + /* Check the 
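
The per-block DCE walks instructions in reverse, narrows each write mask to the components still live, and then removes instructions whose mask reaches zero and that have no side effects. The core of that narrowing in isolation, using component masks instead of byte masks for brevity:

#include <stdbool.h>
#include <stdio.h>

struct toy_ins {
   unsigned dest;
   unsigned mask;        /* components written */
   bool has_side_effects;
};

/* Narrow the write mask to what is still live; report whether the
 * instruction became dead. */
static bool
narrow_and_check_dead(struct toy_ins *ins, const unsigned *live)
{
   ins->mask &= live[ins->dest];
   return ins->mask == 0 && !ins->has_side_effects;
}

int
main(void)
{
   unsigned live[2] = {0x3, 0x0}; /* only t0.xy is live, t1 is dead */

   struct toy_ins a = {.dest = 0, .mask = 0xF};
   struct toy_ins b = {.dest = 1, .mask = 0xF};

   printf("a dead: %d, a mask: %#x\n", narrow_and_check_dead(&a, live), a.mask);
   printf("b dead: %d\n", narrow_and_check_dead(&b, live));
   return 0;
}
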
swizzles */ - /* Awesome, we're the right form. Now check where src2 is from */ - unsigned frcp = ins->src[1]; - unsigned to = ins->dest; + if (!mir_is_simple_swizzle(ins->swizzle[0], ins->mask)) + continue; + if (!is_swizzle_0(ins->swizzle[1])) + continue; - if (frcp & PAN_IS_REG) continue; - if (to & PAN_IS_REG) continue; + /* Awesome, we're the right form. Now check where src2 is from */ + unsigned frcp = ins->src[1]; + unsigned to = ins->dest; - bool frcp_found = false; - unsigned frcp_component = 0; - unsigned frcp_from = 0; + if (frcp & PAN_IS_REG) + continue; + if (to & PAN_IS_REG) + continue; - mir_foreach_instr_in_block_safe(block, sub) { - if (sub->dest != frcp) continue; + bool frcp_found = false; + unsigned frcp_component = 0; + unsigned frcp_from = 0; - frcp_component = sub->swizzle[0][0]; - frcp_from = sub->src[0]; + mir_foreach_instr_in_block_safe(block, sub) { + if (sub->dest != frcp) + continue; - frcp_found = - (sub->type == TAG_ALU_4) && - (sub->op == midgard_alu_op_frcp); - break; - } + frcp_component = sub->swizzle[0][0]; + frcp_from = sub->src[0]; - if (!frcp_found) continue; - if (frcp_from != ins->src[0]) continue; - if (frcp_component != COMPONENT_W && frcp_component != COMPONENT_Z) continue; - if (!mir_single_use(ctx, frcp)) continue; + frcp_found = + (sub->type == TAG_ALU_4) && (sub->op == midgard_alu_op_frcp); + break; + } - /* Heuristic: check if the frcp is from a single-use varying */ + if (!frcp_found) + continue; + if (frcp_from != ins->src[0]) + continue; + if (frcp_component != COMPONENT_W && frcp_component != COMPONENT_Z) + continue; + if (!mir_single_use(ctx, frcp)) + continue; - bool ok = false; + /* Heuristic: check if the frcp is from a single-use varying */ - /* One for frcp and one for fmul */ - if (mir_use_count(ctx, frcp_from) > 2) continue; + bool ok = false; - mir_foreach_instr_in_block_safe(block, v) { - if (v->dest != frcp_from) continue; - if (v->type != TAG_LOAD_STORE_4) break; - if (!OP_IS_LOAD_VARY_F(v->op)) break; + /* One for frcp and one for fmul */ + if (mir_use_count(ctx, frcp_from) > 2) + continue; - ok = true; - break; - } + mir_foreach_instr_in_block_safe(block, v) { + if (v->dest != frcp_from) + continue; + if (v->type != TAG_LOAD_STORE_4) + break; + if (!OP_IS_LOAD_VARY_F(v->op)) + break; - if (!ok) - continue; + ok = true; + break; + } - /* Nice, we got the form spot on. Let's convert! */ + if (!ok) + continue; - midgard_instruction accel = { - .type = TAG_LOAD_STORE_4, - .mask = ins->mask, - .dest = to, - .dest_type = nir_type_float32, - .src = { frcp_from, ~0, ~0, ~0, }, - .src_types = { nir_type_float32, }, - .swizzle = SWIZZLE_IDENTITY_4, - .op = frcp_component == COMPONENT_W ? - midgard_op_ldst_perspective_div_w : - midgard_op_ldst_perspective_div_z, - .load_store = { - .bitsize_toggle = true, - }, - }; + /* Nice, we got the form spot on. Let's convert! */ - mir_insert_instruction_before(ctx, ins, accel); - mir_remove_instruction(ins); + midgard_instruction accel = { + .type = TAG_LOAD_STORE_4, + .mask = ins->mask, + .dest = to, + .dest_type = nir_type_float32, + .src = + { + frcp_from, + ~0, + ~0, + ~0, + }, + .src_types = + { + nir_type_float32, + }, + .swizzle = SWIZZLE_IDENTITY_4, + .op = frcp_component == COMPONENT_W + ? 
midgard_op_ldst_perspective_div_w + : midgard_op_ldst_perspective_div_z, + .load_store = + { + .bitsize_toggle = true, + }, + }; - progress |= true; - } + mir_insert_instruction_before(ctx, ins, accel); + mir_remove_instruction(ins); - return progress; + progress |= true; + } + + return progress; } bool midgard_opt_varying_projection(compiler_context *ctx, midgard_block *block) { - bool progress = false; + bool progress = false; - mir_foreach_instr_in_block_safe(block, ins) { - /* Search for a projection */ - if (ins->type != TAG_LOAD_STORE_4) continue; - if (!OP_IS_PROJECTION(ins->op)) continue; + mir_foreach_instr_in_block_safe(block, ins) { + /* Search for a projection */ + if (ins->type != TAG_LOAD_STORE_4) + continue; + if (!OP_IS_PROJECTION(ins->op)) + continue; - unsigned vary = ins->src[0]; - unsigned to = ins->dest; + unsigned vary = ins->src[0]; + unsigned to = ins->dest; - if (vary & PAN_IS_REG) continue; - if (to & PAN_IS_REG) continue; - if (!mir_single_use(ctx, vary)) continue; + if (vary & PAN_IS_REG) + continue; + if (to & PAN_IS_REG) + continue; + if (!mir_single_use(ctx, vary)) + continue; - /* Check for a varying source. If we find it, we rewrite */ + /* Check for a varying source. If we find it, we rewrite */ - bool rewritten = false; + bool rewritten = false; - mir_foreach_instr_in_block_safe(block, v) { - if (v->dest != vary) continue; - if (v->type != TAG_LOAD_STORE_4) break; - if (!OP_IS_LOAD_VARY_F(v->op)) break; + mir_foreach_instr_in_block_safe(block, v) { + if (v->dest != vary) + continue; + if (v->type != TAG_LOAD_STORE_4) + break; + if (!OP_IS_LOAD_VARY_F(v->op)) + break; - /* We found it, so rewrite it to project. Grab the - * modifier */ + /* We found it, so rewrite it to project. Grab the + * modifier */ - midgard_varying_params p = - midgard_unpack_varying_params(v->load_store); + midgard_varying_params p = + midgard_unpack_varying_params(v->load_store); - if (p.modifier != midgard_varying_mod_none) - break; + if (p.modifier != midgard_varying_mod_none) + break; - bool projects_w = - ins->op == midgard_op_ldst_perspective_div_w; + bool projects_w = ins->op == midgard_op_ldst_perspective_div_w; - p.modifier = projects_w ? - midgard_varying_mod_perspective_w : - midgard_varying_mod_perspective_z; + p.modifier = projects_w ? 
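
The peephole above fuses a reciprocal into the multiply that consumes it, turning x * (1/w) into a single perspective-divide op that reads w (or z) directly. Numerically the rewrite is the identity sketched below, up to rounding and the hardware's reciprocal approximation; the real pass additionally checks swizzles, single use, and that the source is a varying load.

#include <math.h>
#include <stdio.h>

/* Before: two ALU ops, a reciprocal and a multiply. */
static float
project_mul_rcp(float x, float w)
{
   float rcp = 1.0f / w;
   return x * rcp;
}

/* After: one fused divide, as done by the perspective-division op. */
static float
project_div(float x, float w)
{
   return x / w;
}

int
main(void)
{
   float x = 3.0f, w = 4.0f;
   printf("difference: %g\n", fabsf(project_mul_rcp(x, w) - project_div(x, w)));
   return 0;
}
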
midgard_varying_mod_perspective_w + : midgard_varying_mod_perspective_z; - midgard_pack_varying_params(&v->load_store, p); + midgard_pack_varying_params(&v->load_store, p); - /* Use the new destination */ - v->dest = to; + /* Use the new destination */ + v->dest = to; - rewritten = true; - break; - } + rewritten = true; + break; + } - if (rewritten) - mir_remove_instruction(ins); + if (rewritten) + mir_remove_instruction(ins); - progress |= rewritten; - } + progress |= rewritten; + } - return progress; + return progress; } diff --git a/src/panfrost/midgard/midgard_print.c b/src/panfrost/midgard/midgard_print.c index 06b92e032a5..6fe3746ab34 100644 --- a/src/panfrost/midgard/midgard_print.c +++ b/src/panfrost/midgard/midgard_print.c @@ -39,25 +39,25 @@ static void mir_print_index(int source) { - if (source == ~0) { - printf("_"); - return; - } + if (source == ~0) { + printf("_"); + return; + } - if (source >= SSA_FIXED_MINIMUM) { - /* Specific register */ - int reg = SSA_REG_FROM_FIXED(source); + if (source >= SSA_FIXED_MINIMUM) { + /* Specific register */ + int reg = SSA_REG_FROM_FIXED(source); - /* TODO: Moving threshold */ - if (reg > 16 && reg < 24) - printf("U%d", 23 - reg); - else - printf("R%d", reg); - } else if (source & PAN_IS_REG) { - printf("r%d", source >> 1); - } else { - printf("%d", source >> 1); - } + /* TODO: Moving threshold */ + if (reg > 16 && reg < 24) + printf("U%d", 23 - reg); + else + printf("R%d", reg); + } else if (source & PAN_IS_REG) { + printf("r%d", source >> 1); + } else { + printf("%d", source >> 1); + } } static const char components[16] = "xyzwefghijklmnop"; @@ -65,12 +65,12 @@ static const char components[16] = "xyzwefghijklmnop"; static void mir_print_mask(unsigned mask) { - printf("."); + printf("."); - for (unsigned i = 0; i < 16; ++i) { - if (mask & (1 << i)) - putchar(components[i]); - } + for (unsigned i = 0; i < 16; ++i) { + if (mask & (1 << i)) + putchar(components[i]); + } } /* @@ -81,246 +81,246 @@ mir_print_mask(unsigned mask) static void mir_print_swizzle(unsigned mask, unsigned *swizzle) { - printf("."); + printf("."); - for (unsigned i = 0; i < 16; ++i) { - if (mask & BITFIELD_BIT(i)) { - unsigned C = swizzle[i]; - putchar(components[C]); - } - } + for (unsigned i = 0; i < 16; ++i) { + if (mask & BITFIELD_BIT(i)) { + unsigned C = swizzle[i]; + putchar(components[C]); + } + } } static const char * mir_get_unit(unsigned unit) { - switch (unit) { - case ALU_ENAB_VEC_MUL: - return "vmul"; - case ALU_ENAB_SCAL_ADD: - return "sadd"; - case ALU_ENAB_VEC_ADD: - return "vadd"; - case ALU_ENAB_SCAL_MUL: - return "smul"; - case ALU_ENAB_VEC_LUT: - return "lut"; - case ALU_ENAB_BR_COMPACT: - return "br"; - case ALU_ENAB_BRANCH: - return "brx"; - default: - return "???"; - } + switch (unit) { + case ALU_ENAB_VEC_MUL: + return "vmul"; + case ALU_ENAB_SCAL_ADD: + return "sadd"; + case ALU_ENAB_VEC_ADD: + return "vadd"; + case ALU_ENAB_SCAL_MUL: + return "smul"; + case ALU_ENAB_VEC_LUT: + return "lut"; + case ALU_ENAB_BR_COMPACT: + return "br"; + case ALU_ENAB_BRANCH: + return "brx"; + default: + return "???"; + } } static void mir_print_embedded_constant(midgard_instruction *ins, unsigned src_idx) { - assert(src_idx <= 1); + assert(src_idx <= 1); - unsigned base_size = max_bitsize_for_alu(ins); - unsigned sz = nir_alu_type_get_type_size(ins->src_types[src_idx]); - bool half = (sz == (base_size >> 1)); - unsigned mod = mir_pack_mod(ins, src_idx, false); - unsigned *swizzle = ins->swizzle[src_idx]; - midgard_reg_mode reg_mode = 
reg_mode_for_bitsize(max_bitsize_for_alu(ins)); - unsigned comp_mask = effective_writemask(ins->op, ins->mask); - unsigned num_comp = util_bitcount(comp_mask); - unsigned max_comp = mir_components_for_type(ins->dest_type); - bool first = true; + unsigned base_size = max_bitsize_for_alu(ins); + unsigned sz = nir_alu_type_get_type_size(ins->src_types[src_idx]); + bool half = (sz == (base_size >> 1)); + unsigned mod = mir_pack_mod(ins, src_idx, false); + unsigned *swizzle = ins->swizzle[src_idx]; + midgard_reg_mode reg_mode = reg_mode_for_bitsize(max_bitsize_for_alu(ins)); + unsigned comp_mask = effective_writemask(ins->op, ins->mask); + unsigned num_comp = util_bitcount(comp_mask); + unsigned max_comp = mir_components_for_type(ins->dest_type); + bool first = true; - printf("#"); + printf("#"); - if (num_comp > 1) - printf("vec%d(", num_comp); + if (num_comp > 1) + printf("vec%d(", num_comp); - for (unsigned comp = 0; comp < max_comp; comp++) { - if (!(comp_mask & (1 << comp))) - continue; + for (unsigned comp = 0; comp < max_comp; comp++) { + if (!(comp_mask & (1 << comp))) + continue; - if (first) - first = false; - else - printf(", "); + if (first) + first = false; + else + printf(", "); - mir_print_constant_component(stdout, &ins->constants, - swizzle[comp], reg_mode, - half, mod, ins->op); - } + mir_print_constant_component(stdout, &ins->constants, swizzle[comp], + reg_mode, half, mod, ins->op); + } - if (num_comp > 1) - printf(")"); + if (num_comp > 1) + printf(")"); } static void mir_print_src(midgard_instruction *ins, unsigned c) { - mir_print_index(ins->src[c]); + mir_print_index(ins->src[c]); - if (ins->src[c] != ~0 && ins->src_types[c] != nir_type_invalid) { - pan_print_alu_type(ins->src_types[c], stdout); - mir_print_swizzle(ins->mask, ins->swizzle[c]); - } + if (ins->src[c] != ~0 && ins->src_types[c] != nir_type_invalid) { + pan_print_alu_type(ins->src_types[c], stdout); + mir_print_swizzle(ins->mask, ins->swizzle[c]); + } } void mir_print_instruction(midgard_instruction *ins) { - printf("\t"); + printf("\t"); - if (midgard_is_branch_unit(ins->unit)) { - const char *branch_target_names[] = { - "goto", "break", "continue", "discard" - }; + if (midgard_is_branch_unit(ins->unit)) { + const char *branch_target_names[] = {"goto", "break", "continue", + "discard"}; - printf("%s.", mir_get_unit(ins->unit)); - if (ins->branch.target_type == TARGET_DISCARD) - printf("discard."); - else if (ins->writeout) - printf("write."); - else if (ins->unit == ALU_ENAB_BR_COMPACT && - !ins->branch.conditional) - printf("uncond."); - else - printf("cond."); + printf("%s.", mir_get_unit(ins->unit)); + if (ins->branch.target_type == TARGET_DISCARD) + printf("discard."); + else if (ins->writeout) + printf("write."); + else if (ins->unit == ALU_ENAB_BR_COMPACT && !ins->branch.conditional) + printf("uncond."); + else + printf("cond."); - if (!ins->branch.conditional) - printf("always"); - else if (ins->branch.invert_conditional) - printf("false"); - else - printf("true"); + if (!ins->branch.conditional) + printf("always"); + else if (ins->branch.invert_conditional) + printf("false"); + else + printf("true"); - if (ins->writeout) { - printf(" (c: "); - mir_print_src(ins, 0); - printf(", z: "); - mir_print_src(ins, 2); - printf(", s: "); - mir_print_src(ins, 3); - printf(")"); - } + if (ins->writeout) { + printf(" (c: "); + mir_print_src(ins, 0); + printf(", z: "); + mir_print_src(ins, 2); + printf(", s: "); + mir_print_src(ins, 3); + printf(")"); + } - if (ins->branch.target_type != TARGET_DISCARD) - 
printf(" %s -> block(%d)\n", - ins->branch.target_type < 4 ? - branch_target_names[ins->branch.target_type] : "??", - ins->branch.target_block); + if (ins->branch.target_type != TARGET_DISCARD) + printf(" %s -> block(%d)\n", + ins->branch.target_type < 4 + ? branch_target_names[ins->branch.target_type] + : "??", + ins->branch.target_block); - return; - } + return; + } - switch (ins->type) { - case TAG_ALU_4: { - midgard_alu_op op = ins->op; - const char *name = alu_opcode_props[op].name; + switch (ins->type) { + case TAG_ALU_4: { + midgard_alu_op op = ins->op; + const char *name = alu_opcode_props[op].name; - if (ins->unit) - printf("%s.", mir_get_unit(ins->unit)); + if (ins->unit) + printf("%s.", mir_get_unit(ins->unit)); - printf("%s", name ? name : "??"); + printf("%s", name ? name : "??"); - if (!(midgard_is_integer_out_op(ins->op) && ins->outmod == midgard_outmod_keeplo)) { - mir_print_outmod(stdout, ins->outmod, midgard_is_integer_out_op(ins->op)); - } + if (!(midgard_is_integer_out_op(ins->op) && + ins->outmod == midgard_outmod_keeplo)) { + mir_print_outmod(stdout, ins->outmod, + midgard_is_integer_out_op(ins->op)); + } - break; - } + break; + } - case TAG_LOAD_STORE_4: { - midgard_load_store_op op = ins->op; - const char *name = load_store_opcode_props[op].name; + case TAG_LOAD_STORE_4: { + midgard_load_store_op op = ins->op; + const char *name = load_store_opcode_props[op].name; - assert(name); - printf("%s", name); - break; - } + assert(name); + printf("%s", name); + break; + } - case TAG_TEXTURE_4: { - printf("TEX"); + case TAG_TEXTURE_4: { + printf("TEX"); - if (ins->helper_terminate) - printf(".terminate"); + if (ins->helper_terminate) + printf(".terminate"); - if (ins->helper_execute) - printf(".execute"); + if (ins->helper_execute) + printf(".execute"); - break; - } + break; + } - default: - assert(0); - } + default: + assert(0); + } - if (ins->compact_branch && ins->branch.invert_conditional) - printf(".not"); + if (ins->compact_branch && ins->branch.invert_conditional) + printf(".not"); - printf(" "); - mir_print_index(ins->dest); + printf(" "); + mir_print_index(ins->dest); - if (ins->dest != ~0) { - pan_print_alu_type(ins->dest_type, stdout); - mir_print_mask(ins->mask); - } + if (ins->dest != ~0) { + pan_print_alu_type(ins->dest_type, stdout); + mir_print_mask(ins->mask); + } - printf(", "); + printf(", "); - /* Only ALU can have an embedded constant, r26 as read on load/store is - * something else entirely */ - bool is_alu = ins->type == TAG_ALU_4; - unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + /* Only ALU can have an embedded constant, r26 as read on load/store is + * something else entirely */ + bool is_alu = ins->type == TAG_ALU_4; + unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT); - if (is_alu && alu_opcode_props[ins->op].props & QUIRK_FLIPPED_R24) { - /* Moves (indicated by QUIRK_FLIPPED_R24) are 1-src, with their - * one source in the second slot - */ - assert(ins->src[0] == ~0); - } else { - if (ins->src[0] == r_constant && is_alu) - mir_print_embedded_constant(ins, 0); - else - mir_print_src(ins, 0); + if (is_alu && alu_opcode_props[ins->op].props & QUIRK_FLIPPED_R24) { + /* Moves (indicated by QUIRK_FLIPPED_R24) are 1-src, with their + * one source in the second slot + */ + assert(ins->src[0] == ~0); + } else { + if (ins->src[0] == r_constant && is_alu) + mir_print_embedded_constant(ins, 0); + else + mir_print_src(ins, 0); - printf(", "); - } + printf(", "); + } - if (ins->has_inline_constant) - printf("#%d", 
ins->inline_constant); - else if (ins->src[1] == r_constant && is_alu) - mir_print_embedded_constant(ins, 1); - else - mir_print_src(ins, 1); + if (ins->has_inline_constant) + printf("#%d", ins->inline_constant); + else if (ins->src[1] == r_constant && is_alu) + mir_print_embedded_constant(ins, 1); + else + mir_print_src(ins, 1); - if (is_alu) { - /* ALU ops are all 2-src, though CSEL is treated like a 3-src - * pseudo op with the third source scheduler lowered - */ - switch (ins->op) { - case midgard_alu_op_icsel: - case midgard_alu_op_fcsel: - case midgard_alu_op_icsel_v: - case midgard_alu_op_fcsel_v: - printf(", "); - mir_print_src(ins, 2); - break; - default: - assert(ins->src[2] == ~0); - break; - } + if (is_alu) { + /* ALU ops are all 2-src, though CSEL is treated like a 3-src + * pseudo op with the third source scheduler lowered + */ + switch (ins->op) { + case midgard_alu_op_icsel: + case midgard_alu_op_fcsel: + case midgard_alu_op_icsel_v: + case midgard_alu_op_fcsel_v: + printf(", "); + mir_print_src(ins, 2); + break; + default: + assert(ins->src[2] == ~0); + break; + } - assert(ins->src[3] == ~0); - } else { - for (unsigned c = 2; c <= 3; ++c) { - printf(", "); - mir_print_src(ins, c); - } - } + assert(ins->src[3] == ~0); + } else { + for (unsigned c = 2; c <= 3; ++c) { + printf(", "); + mir_print_src(ins, c); + } + } - if (ins->no_spill) - printf(" /* no spill */"); + if (ins->no_spill) + printf(" /* no spill */"); - printf("\n"); + printf("\n"); } /* Dumps MIR for a block or entire shader respective */ @@ -328,41 +328,41 @@ mir_print_instruction(midgard_instruction *ins) void mir_print_block(midgard_block *block) { - printf("block%u: {\n", block->base.name); + printf("block%u: {\n", block->base.name); - if (block->scheduled) { - mir_foreach_bundle_in_block(block, bundle) { - for (unsigned i = 0; i < bundle->instruction_count; ++i) - mir_print_instruction(bundle->instructions[i]); + if (block->scheduled) { + mir_foreach_bundle_in_block(block, bundle) { + for (unsigned i = 0; i < bundle->instruction_count; ++i) + mir_print_instruction(bundle->instructions[i]); - printf("\n"); - } - } else { - mir_foreach_instr_in_block(block, ins) { - mir_print_instruction(ins); - } - } + printf("\n"); + } + } else { + mir_foreach_instr_in_block(block, ins) { + mir_print_instruction(ins); + } + } - printf("}"); + printf("}"); - if (block->base.successors[0]) { - printf(" -> "); - pan_foreach_successor((&block->base), succ) - printf(" block%u ", succ->name); - } + if (block->base.successors[0]) { + printf(" -> "); + pan_foreach_successor((&block->base), succ) + printf(" block%u ", succ->name); + } - printf(" from { "); - mir_foreach_predecessor(block, pred) - printf("block%u ", pred->base.name); - printf("}"); + printf(" from { "); + mir_foreach_predecessor(block, pred) + printf("block%u ", pred->base.name); + printf("}"); - printf("\n\n"); + printf("\n\n"); } void mir_print_shader(compiler_context *ctx) { - mir_foreach_block(ctx, block) { - mir_print_block((midgard_block *) block); - } + mir_foreach_block(ctx, block) { + mir_print_block((midgard_block *)block); + } } diff --git a/src/panfrost/midgard/midgard_print_constant.c b/src/panfrost/midgard/midgard_print_constant.c index d588bc24bec..fec65701173 100644 --- a/src/panfrost/midgard/midgard_print_constant.c +++ b/src/panfrost/midgard/midgard_print_constant.c @@ -22,156 +22,152 @@ * SOFTWARE. 
*/ -#include #include +#include #include "util/half_float.h" -#include "midgard.h" #include "helpers.h" +#include "midgard.h" #include "midgard_ops.h" void -mir_print_constant_component(FILE *fp, const midgard_constants *consts, unsigned c, - midgard_reg_mode reg_mode, bool half, +mir_print_constant_component(FILE *fp, const midgard_constants *consts, + unsigned c, midgard_reg_mode reg_mode, bool half, unsigned mod, midgard_alu_op op) { - bool is_sint = false, is_uint = false, is_hex = false; - const char *opname = alu_opcode_props[op].name; + bool is_sint = false, is_uint = false, is_hex = false; + const char *opname = alu_opcode_props[op].name; - bool is_int = midgard_is_integer_op(op); + bool is_int = midgard_is_integer_op(op); - /* Add a sentinel name to prevent crashing */ - if (!opname) - opname = "unknown"; + /* Add a sentinel name to prevent crashing */ + if (!opname) + opname = "unknown"; - if (is_int) { - is_uint = midgard_is_unsigned_op(op); + if (is_int) { + is_uint = midgard_is_unsigned_op(op); - if (!is_uint) { - /* Bit ops are easier to follow when the constant is printed in - * hexadecimal. Other operations starting with a 'i' are - * considered to operate on signed integers. That might not - * be true for all of them, but it's good enough for traces. - */ - if (op >= midgard_alu_op_iand && - op <= midgard_alu_op_ipopcnt) - is_hex = true; - else - is_sint = true; - } - } + if (!is_uint) { + /* Bit ops are easier to follow when the constant is printed in + * hexadecimal. Other operations starting with a 'i' are + * considered to operate on signed integers. That might not + * be true for all of them, but it's good enough for traces. + */ + if (op >= midgard_alu_op_iand && op <= midgard_alu_op_ipopcnt) + is_hex = true; + else + is_sint = true; + } + } - if (half) - reg_mode--; + if (half) + reg_mode--; - switch (reg_mode) { - case midgard_reg_mode_64: - if (is_sint) { - fprintf(fp, "%"PRIi64, consts->i64[c]); - } else if (is_uint) { - fprintf(fp, "%"PRIu64, consts->u64[c]); - } else if (is_hex) { - fprintf(fp, "0x%"PRIX64, consts->u64[c]); - } else { - double v = consts->f64[c]; + switch (reg_mode) { + case midgard_reg_mode_64: + if (is_sint) { + fprintf(fp, "%" PRIi64, consts->i64[c]); + } else if (is_uint) { + fprintf(fp, "%" PRIu64, consts->u64[c]); + } else if (is_hex) { + fprintf(fp, "0x%" PRIX64, consts->u64[c]); + } else { + double v = consts->f64[c]; - if (mod & MIDGARD_FLOAT_MOD_ABS) v = fabs(v); - if (mod & MIDGARD_FLOAT_MOD_NEG) v = -v; + if (mod & MIDGARD_FLOAT_MOD_ABS) + v = fabs(v); + if (mod & MIDGARD_FLOAT_MOD_NEG) + v = -v; - printf("%g", v); - } - break; + printf("%g", v); + } + break; - case midgard_reg_mode_32: - if (is_sint) { - int64_t v; + case midgard_reg_mode_32: + if (is_sint) { + int64_t v; - if (half && mod == midgard_int_zero_extend) - v = consts->u32[c]; - else if (half && mod == midgard_int_left_shift) - v = (uint64_t)consts->u32[c] << 32; - else - v = consts->i32[c]; + if (half && mod == midgard_int_zero_extend) + v = consts->u32[c]; + else if (half && mod == midgard_int_left_shift) + v = (uint64_t)consts->u32[c] << 32; + else + v = consts->i32[c]; - fprintf(fp, "%"PRIi64, v); - } else if (is_uint || is_hex) { - uint64_t v; + fprintf(fp, "%" PRIi64, v); + } else if (is_uint || is_hex) { + uint64_t v; - if (half && mod == midgard_int_left_shift) - v = (uint64_t)consts->u32[c] << 32; - else - v = consts->u32[c]; + if (half && mod == midgard_int_left_shift) + v = (uint64_t)consts->u32[c] << 32; + else + v = consts->u32[c]; - fprintf(fp, is_uint ? 
"%"PRIu64 : "0x%"PRIX64, v); - } else { - float v = consts->f32[c]; + fprintf(fp, is_uint ? "%" PRIu64 : "0x%" PRIX64, v); + } else { + float v = consts->f32[c]; - if (mod & MIDGARD_FLOAT_MOD_ABS) v = fabsf(v); - if (mod & MIDGARD_FLOAT_MOD_NEG) v = -v; + if (mod & MIDGARD_FLOAT_MOD_ABS) + v = fabsf(v); + if (mod & MIDGARD_FLOAT_MOD_NEG) + v = -v; - fprintf(fp, "%g", v); - } - break; + fprintf(fp, "%g", v); + } + break; - case midgard_reg_mode_16: - if (is_sint) { - int32_t v; + case midgard_reg_mode_16: + if (is_sint) { + int32_t v; - if (half && mod == midgard_int_zero_extend) - v = consts->u16[c]; - else if (half && mod == midgard_int_left_shift) - v = (uint32_t)consts->u16[c] << 16; - else - v = consts->i16[c]; + if (half && mod == midgard_int_zero_extend) + v = consts->u16[c]; + else if (half && mod == midgard_int_left_shift) + v = (uint32_t)consts->u16[c] << 16; + else + v = consts->i16[c]; - fprintf(fp, "%d", v); - } else if (is_uint || is_hex) { - uint32_t v; + fprintf(fp, "%d", v); + } else if (is_uint || is_hex) { + uint32_t v; - if (half && mod == midgard_int_left_shift) - v = (uint32_t)consts->u16[c] << 16; - else - v = consts->u16[c]; + if (half && mod == midgard_int_left_shift) + v = (uint32_t)consts->u16[c] << 16; + else + v = consts->u16[c]; - fprintf(fp, is_uint ? "%u" : "0x%X", v); - } else { - float v = _mesa_half_to_float(consts->f16[c]); + fprintf(fp, is_uint ? "%u" : "0x%X", v); + } else { + float v = _mesa_half_to_float(consts->f16[c]); - if (mod & MIDGARD_FLOAT_MOD_ABS) v = fabsf(v); - if (mod & MIDGARD_FLOAT_MOD_NEG) v = -v; + if (mod & MIDGARD_FLOAT_MOD_ABS) + v = fabsf(v); + if (mod & MIDGARD_FLOAT_MOD_NEG) + v = -v; - fprintf(fp, "%g", v); - } - break; + fprintf(fp, "%g", v); + } + break; - case midgard_reg_mode_8: - fprintf(fp, "0x%X", consts->u8[c]); + case midgard_reg_mode_8: + fprintf(fp, "0x%X", consts->u8[c]); - if (mod) - fprintf(fp, " /* %u */", mod); + if (mod) + fprintf(fp, " /* %u */", mod); - assert(!half); /* No 4-bit */ + assert(!half); /* No 4-bit */ - break; - } + break; + } } -static char *outmod_names_float[4] = { - "", - ".clamp_0_inf", - ".clamp_m1_1", - ".clamp_0_1" -}; +static char *outmod_names_float[4] = {"", ".clamp_0_inf", ".clamp_m1_1", + ".clamp_0_1"}; -static char *outmod_names_int[4] = { - ".ssat", - ".usat", - ".keeplo", - ".keephi" -}; +static char *outmod_names_int[4] = {".ssat", ".usat", ".keeplo", ".keephi"}; void mir_print_outmod(FILE *fp, unsigned outmod, bool is_int) { - fprintf(fp, "%s", is_int ? outmod_names_int[outmod] : - outmod_names_float[outmod]); + fprintf(fp, "%s", + is_int ? 
outmod_names_int[outmod] : outmod_names_float[outmod]); } diff --git a/src/panfrost/midgard/midgard_quirks.h b/src/panfrost/midgard/midgard_quirks.h index 3e7c2a0280e..3003dbdf7c2 100644 --- a/src/panfrost/midgard/midgard_quirks.h +++ b/src/panfrost/midgard/midgard_quirks.h @@ -69,36 +69,30 @@ static inline unsigned midgard_get_quirks(unsigned gpu_id) { - switch (gpu_id) { - case 0x600: - case 0x620: - return MIDGARD_OLD_BLEND | - MIDGARD_BROKEN_BLEND_LOADS | - MIDGARD_BROKEN_LOD | - MIDGARD_NO_UPPER_ALU | - MIDGARD_NO_OOO; + switch (gpu_id) { + case 0x600: + case 0x620: + return MIDGARD_OLD_BLEND | MIDGARD_BROKEN_BLEND_LOADS | + MIDGARD_BROKEN_LOD | MIDGARD_NO_UPPER_ALU | MIDGARD_NO_OOO; - case 0x720: - return MIDGARD_INTERPIPE_REG_ALIASING | - MIDGARD_OLD_BLEND | - MIDGARD_BROKEN_LOD | - MIDGARD_NO_UPPER_ALU | - MIDGARD_NO_OOO; + case 0x720: + return MIDGARD_INTERPIPE_REG_ALIASING | MIDGARD_OLD_BLEND | + MIDGARD_BROKEN_LOD | MIDGARD_NO_UPPER_ALU | MIDGARD_NO_OOO; - case 0x820: - case 0x830: - return MIDGARD_INTERPIPE_REG_ALIASING; + case 0x820: + case 0x830: + return MIDGARD_INTERPIPE_REG_ALIASING; - case 0x750: - return MIDGARD_NO_UPPER_ALU; + case 0x750: + return MIDGARD_NO_UPPER_ALU; - case 0x860: - case 0x880: - return 0; + case 0x860: + case 0x880: + return 0; - default: - unreachable("Invalid Midgard GPU ID"); - } + default: + unreachable("Invalid Midgard GPU ID"); + } } #endif diff --git a/src/panfrost/midgard/midgard_ra.c b/src/panfrost/midgard/midgard_ra.c index 99f544c4e16..bae8e695327 100644 --- a/src/panfrost/midgard/midgard_ra.c +++ b/src/panfrost/midgard/midgard_ra.c @@ -22,44 +22,45 @@ * SOFTWARE. */ -#include "compiler.h" -#include "midgard_ops.h" #include "util/u_math.h" #include "util/u_memory.h" +#include "compiler.h" +#include "midgard_ops.h" #include "midgard_quirks.h" struct phys_reg { - /* Physical register: 0-31 */ - unsigned reg; + /* Physical register: 0-31 */ + unsigned reg; - /* Byte offset into the physical register: 0-15 */ - unsigned offset; + /* Byte offset into the physical register: 0-15 */ + unsigned offset; - /* log2(bytes per component) for fast mul/div */ - unsigned shift; + /* log2(bytes per component) for fast mul/div */ + unsigned shift; }; /* Shift up by reg_offset and horizontally by dst_offset. 
*/ static void -offset_swizzle(unsigned *swizzle, unsigned reg_offset, unsigned srcshift, unsigned dstshift, unsigned dst_offset) +offset_swizzle(unsigned *swizzle, unsigned reg_offset, unsigned srcshift, + unsigned dstshift, unsigned dst_offset) { - unsigned out[MIR_VEC_COMPONENTS]; + unsigned out[MIR_VEC_COMPONENTS]; - signed reg_comp = reg_offset >> srcshift; - signed dst_comp = dst_offset >> dstshift; + signed reg_comp = reg_offset >> srcshift; + signed dst_comp = dst_offset >> dstshift; - unsigned max_component = (16 >> srcshift) - 1; + unsigned max_component = (16 >> srcshift) - 1; - assert(reg_comp << srcshift == reg_offset); - assert(dst_comp << dstshift == dst_offset); + assert(reg_comp << srcshift == reg_offset); + assert(dst_comp << dstshift == dst_offset); - for (signed c = 0; c < MIR_VEC_COMPONENTS; ++c) { - signed comp = MAX2(c - dst_comp, 0); - out[c] = MIN2(swizzle[comp] + reg_comp, max_component); - } + for (signed c = 0; c < MIR_VEC_COMPONENTS; ++c) { + signed comp = MAX2(c - dst_comp, 0); + out[c] = MIN2(swizzle[comp] + reg_comp, max_component); + } - memcpy(swizzle, out, sizeof(out)); + memcpy(swizzle, out, sizeof(out)); } /* Helper to return the default phys_reg for a given register */ @@ -67,50 +68,51 @@ offset_swizzle(unsigned *swizzle, unsigned reg_offset, unsigned srcshift, unsign static struct phys_reg default_phys_reg(int reg, unsigned shift) { - struct phys_reg r = { - .reg = reg, - .offset = 0, - .shift = shift, - }; + struct phys_reg r = { + .reg = reg, + .offset = 0, + .shift = shift, + }; - return r; + return r; } /* Determine which physical register, swizzle, and mask a virtual * register corresponds to */ static struct phys_reg -index_to_reg(compiler_context *ctx, struct lcra_state *l, unsigned reg, unsigned shift) +index_to_reg(compiler_context *ctx, struct lcra_state *l, unsigned reg, + unsigned shift) { - /* Check for special cases */ - if (reg == ~0) - return default_phys_reg(REGISTER_UNUSED, shift); - else if (reg >= SSA_FIXED_MINIMUM) - return default_phys_reg(SSA_REG_FROM_FIXED(reg), shift); - else if (!l) - return default_phys_reg(REGISTER_UNUSED, shift); + /* Check for special cases */ + if (reg == ~0) + return default_phys_reg(REGISTER_UNUSED, shift); + else if (reg >= SSA_FIXED_MINIMUM) + return default_phys_reg(SSA_REG_FROM_FIXED(reg), shift); + else if (!l) + return default_phys_reg(REGISTER_UNUSED, shift); - struct phys_reg r = { - .reg = l->solutions[reg] / 16, - .offset = l->solutions[reg] & 0xF, - .shift = shift, - }; + struct phys_reg r = { + .reg = l->solutions[reg] / 16, + .offset = l->solutions[reg] & 0xF, + .shift = shift, + }; - /* Report that we actually use this register, and return it */ + /* Report that we actually use this register, and return it */ - if (r.reg < 16) - ctx->info->work_reg_count = MAX2(ctx->info->work_reg_count, r.reg + 1); + if (r.reg < 16) + ctx->info->work_reg_count = MAX2(ctx->info->work_reg_count, r.reg + 1); - return r; + return r; } static void set_class(unsigned *classes, unsigned node, unsigned class) { - if (node < SSA_FIXED_MINIMUM && class != classes[node]) { - assert(classes[node] == REG_CLASS_WORK); - classes[node] = class; - } + if (node < SSA_FIXED_MINIMUM && class != classes[node]) { + assert(classes[node] == REG_CLASS_WORK); + classes[node] = class; + } } /* Special register classes impose special constraints on who can read their @@ -119,42 +121,42 @@ set_class(unsigned *classes, unsigned node, unsigned class) static bool ASSERTED check_read_class(unsigned *classes, unsigned tag, unsigned node) 
{ - /* Non-nodes are implicitly ok */ - if (node >= SSA_FIXED_MINIMUM) - return true; + /* Non-nodes are implicitly ok */ + if (node >= SSA_FIXED_MINIMUM) + return true; - switch (classes[node]) { - case REG_CLASS_LDST: - return (tag == TAG_LOAD_STORE_4); - case REG_CLASS_TEXR: - return (tag == TAG_TEXTURE_4); - case REG_CLASS_TEXW: - return (tag != TAG_LOAD_STORE_4); - case REG_CLASS_WORK: - return IS_ALU(tag); - default: - unreachable("Invalid class"); - } + switch (classes[node]) { + case REG_CLASS_LDST: + return (tag == TAG_LOAD_STORE_4); + case REG_CLASS_TEXR: + return (tag == TAG_TEXTURE_4); + case REG_CLASS_TEXW: + return (tag != TAG_LOAD_STORE_4); + case REG_CLASS_WORK: + return IS_ALU(tag); + default: + unreachable("Invalid class"); + } } static bool ASSERTED check_write_class(unsigned *classes, unsigned tag, unsigned node) { - /* Non-nodes are implicitly ok */ - if (node >= SSA_FIXED_MINIMUM) - return true; + /* Non-nodes are implicitly ok */ + if (node >= SSA_FIXED_MINIMUM) + return true; - switch (classes[node]) { - case REG_CLASS_TEXR: - return true; - case REG_CLASS_TEXW: - return (tag == TAG_TEXTURE_4); - case REG_CLASS_LDST: - case REG_CLASS_WORK: - return IS_ALU(tag) || (tag == TAG_LOAD_STORE_4); - default: - unreachable("Invalid class"); - } + switch (classes[node]) { + case REG_CLASS_TEXR: + return true; + case REG_CLASS_TEXW: + return (tag == TAG_TEXTURE_4); + case REG_CLASS_LDST: + case REG_CLASS_WORK: + return IS_ALU(tag) || (tag == TAG_LOAD_STORE_4); + default: + unreachable("Invalid class"); + } } /* Prepass before RA to ensure special class restrictions are met. The idea is @@ -162,284 +164,287 @@ check_write_class(unsigned *classes, unsigned tag, unsigned node) * Later, we'll add moves as appropriate and rewrite to specialize by type. */ static void -mark_node_class (unsigned *bitfield, unsigned node) +mark_node_class(unsigned *bitfield, unsigned node) { - if (node < SSA_FIXED_MINIMUM) - BITSET_SET(bitfield, node); + if (node < SSA_FIXED_MINIMUM) + BITSET_SET(bitfield, node); } void mir_lower_special_reads(compiler_context *ctx) { - size_t sz = BITSET_WORDS(ctx->temp_count) * sizeof(BITSET_WORD); + size_t sz = BITSET_WORDS(ctx->temp_count) * sizeof(BITSET_WORD); - /* Bitfields for the various types of registers we could have. aluw can - * be written by either ALU or load/store */ + /* Bitfields for the various types of registers we could have. 
aluw can + * be written by either ALU or load/store */ - unsigned *alur = calloc(sz, 1); - unsigned *aluw = calloc(sz, 1); - unsigned *brar = calloc(sz, 1); - unsigned *ldst = calloc(sz, 1); - unsigned *texr = calloc(sz, 1); - unsigned *texw = calloc(sz, 1); + unsigned *alur = calloc(sz, 1); + unsigned *aluw = calloc(sz, 1); + unsigned *brar = calloc(sz, 1); + unsigned *ldst = calloc(sz, 1); + unsigned *texr = calloc(sz, 1); + unsigned *texw = calloc(sz, 1); - /* Pass #1 is analysis, a linear scan to fill out the bitfields */ + /* Pass #1 is analysis, a linear scan to fill out the bitfields */ - mir_foreach_instr_global(ctx, ins) { - switch (ins->type) { - case TAG_ALU_4: - mark_node_class(aluw, ins->dest); - mark_node_class(alur, ins->src[0]); - mark_node_class(alur, ins->src[1]); - mark_node_class(alur, ins->src[2]); + mir_foreach_instr_global(ctx, ins) { + switch (ins->type) { + case TAG_ALU_4: + mark_node_class(aluw, ins->dest); + mark_node_class(alur, ins->src[0]); + mark_node_class(alur, ins->src[1]); + mark_node_class(alur, ins->src[2]); - if (ins->compact_branch && ins->writeout) - mark_node_class(brar, ins->src[0]); + if (ins->compact_branch && ins->writeout) + mark_node_class(brar, ins->src[0]); - break; + break; - case TAG_LOAD_STORE_4: - mark_node_class(aluw, ins->dest); - mark_node_class(ldst, ins->src[0]); - mark_node_class(ldst, ins->src[1]); - mark_node_class(ldst, ins->src[2]); - mark_node_class(ldst, ins->src[3]); - break; + case TAG_LOAD_STORE_4: + mark_node_class(aluw, ins->dest); + mark_node_class(ldst, ins->src[0]); + mark_node_class(ldst, ins->src[1]); + mark_node_class(ldst, ins->src[2]); + mark_node_class(ldst, ins->src[3]); + break; - case TAG_TEXTURE_4: - mark_node_class(texr, ins->src[0]); - mark_node_class(texr, ins->src[1]); - mark_node_class(texr, ins->src[2]); - mark_node_class(texw, ins->dest); - break; + case TAG_TEXTURE_4: + mark_node_class(texr, ins->src[0]); + mark_node_class(texr, ins->src[1]); + mark_node_class(texr, ins->src[2]); + mark_node_class(texw, ins->dest); + break; - default: - break; - } - } + default: + break; + } + } - /* Pass #2 is lowering now that we've analyzed all the classes. - * Conceptually, if an index is only marked for a single type of use, - * there is nothing to lower. If it is marked for different uses, we - * split up based on the number of types of uses. To do so, we divide - * into N distinct classes of use (where N>1 by definition), emit N-1 - * moves from the index to copies of the index, and finally rewrite N-1 - * of the types of uses to use the corresponding move */ + /* Pass #2 is lowering now that we've analyzed all the classes. + * Conceptually, if an index is only marked for a single type of use, + * there is nothing to lower. If it is marked for different uses, we + * split up based on the number of types of uses. 
To do so, we divide + * into N distinct classes of use (where N>1 by definition), emit N-1 + * moves from the index to copies of the index, and finally rewrite N-1 + * of the types of uses to use the corresponding move */ - unsigned spill_idx = ctx->temp_count; + unsigned spill_idx = ctx->temp_count; - for (unsigned i = 0; i < ctx->temp_count; ++i) { - bool is_alur = BITSET_TEST(alur, i); - bool is_aluw = BITSET_TEST(aluw, i); - bool is_brar = BITSET_TEST(brar, i); - bool is_ldst = BITSET_TEST(ldst, i); - bool is_texr = BITSET_TEST(texr, i); - bool is_texw = BITSET_TEST(texw, i); + for (unsigned i = 0; i < ctx->temp_count; ++i) { + bool is_alur = BITSET_TEST(alur, i); + bool is_aluw = BITSET_TEST(aluw, i); + bool is_brar = BITSET_TEST(brar, i); + bool is_ldst = BITSET_TEST(ldst, i); + bool is_texr = BITSET_TEST(texr, i); + bool is_texw = BITSET_TEST(texw, i); - /* Analyse to check how many distinct uses there are. ALU ops - * (alur) can read the results of the texture pipeline (texw) - * but not ldst or texr. Load/store ops (ldst) cannot read - * anything but load/store inputs. Texture pipeline cannot read - * anything but texture inputs. TODO: Simplify. */ + /* Analyse to check how many distinct uses there are. ALU ops + * (alur) can read the results of the texture pipeline (texw) + * but not ldst or texr. Load/store ops (ldst) cannot read + * anything but load/store inputs. Texture pipeline cannot read + * anything but texture inputs. TODO: Simplify. */ - bool collision = - (is_alur && (is_ldst || is_texr)) || - (is_ldst && (is_alur || is_texr || is_texw)) || - (is_texr && (is_alur || is_ldst || is_texw)) || - (is_texw && (is_aluw || is_ldst || is_texr)) || - (is_brar && is_texw); - - if (!collision) - continue; + bool collision = (is_alur && (is_ldst || is_texr)) || + (is_ldst && (is_alur || is_texr || is_texw)) || + (is_texr && (is_alur || is_ldst || is_texw)) || + (is_texw && (is_aluw || is_ldst || is_texr)) || + (is_brar && is_texw); - /* Use the index as-is as the work copy. Emit copies for - * special uses */ + if (!collision) + continue; - unsigned classes[] = { TAG_LOAD_STORE_4, TAG_TEXTURE_4, TAG_TEXTURE_4, TAG_ALU_4}; - bool collisions[] = { is_ldst, is_texr, is_texw && is_aluw, is_brar }; + /* Use the index as-is as the work copy. Emit copies for + * special uses */ - for (unsigned j = 0; j < ARRAY_SIZE(collisions); ++j) { - if (!collisions[j]) continue; + unsigned classes[] = {TAG_LOAD_STORE_4, TAG_TEXTURE_4, TAG_TEXTURE_4, + TAG_ALU_4}; + bool collisions[] = {is_ldst, is_texr, is_texw && is_aluw, is_brar}; - /* When the hazard is from reading, we move and rewrite - * sources (typical case). When it's from writing, we - * flip the move and rewrite destinations (obscure, - * only from control flow -- impossible in SSA) */ + for (unsigned j = 0; j < ARRAY_SIZE(collisions); ++j) { + if (!collisions[j]) + continue; - bool hazard_write = (j == 2); + /* When the hazard is from reading, we move and rewrite + * sources (typical case). 
When it's from writing, we + * flip the move and rewrite destinations (obscure, + * only from control flow -- impossible in SSA) */ - unsigned idx = spill_idx++; + bool hazard_write = (j == 2); - /* Insert move before each read/write, depending on the - * hazard we're trying to account for */ + unsigned idx = spill_idx++; - mir_foreach_instr_global_safe(ctx, pre_use) { - if (pre_use->type != classes[j]) - continue; + /* Insert move before each read/write, depending on the + * hazard we're trying to account for */ - if (hazard_write) { - if (pre_use->dest != i) - continue; + mir_foreach_instr_global_safe(ctx, pre_use) { + if (pre_use->type != classes[j]) + continue; - midgard_instruction m = v_mov(idx, i); - m.dest_type = pre_use->dest_type; - m.src_types[1] = m.dest_type; - m.mask = pre_use->mask; + if (hazard_write) { + if (pre_use->dest != i) + continue; - midgard_instruction *use = mir_next_op(pre_use); - assert(use); - mir_insert_instruction_before(ctx, use, m); - mir_rewrite_index_dst_single(pre_use, i, idx); - } else { - if (!mir_has_arg(pre_use, i)) - continue; + midgard_instruction m = v_mov(idx, i); + m.dest_type = pre_use->dest_type; + m.src_types[1] = m.dest_type; + m.mask = pre_use->mask; - idx = spill_idx++; + midgard_instruction *use = mir_next_op(pre_use); + assert(use); + mir_insert_instruction_before(ctx, use, m); + mir_rewrite_index_dst_single(pre_use, i, idx); + } else { + if (!mir_has_arg(pre_use, i)) + continue; - midgard_instruction m = v_mov(i, idx); - m.mask = mir_from_bytemask(mir_round_bytemask_up( - mir_bytemask_of_read_components(pre_use, i), 32), 32); - mir_insert_instruction_before(ctx, pre_use, m); - mir_rewrite_index_src_single(pre_use, i, idx); - } - } - } - } + idx = spill_idx++; - free(alur); - free(aluw); - free(brar); - free(ldst); - free(texr); - free(texw); + midgard_instruction m = v_mov(i, idx); + m.mask = mir_from_bytemask( + mir_round_bytemask_up( + mir_bytemask_of_read_components(pre_use, i), 32), + 32); + mir_insert_instruction_before(ctx, pre_use, m); + mir_rewrite_index_src_single(pre_use, i, idx); + } + } + } + } + + free(alur); + free(aluw); + free(brar); + free(ldst); + free(texr); + free(texw); } static void -mir_compute_interference( - compiler_context *ctx, - struct lcra_state *l) +mir_compute_interference(compiler_context *ctx, struct lcra_state *l) { - /* First, we need liveness information to be computed per block */ - mir_compute_liveness(ctx); + /* First, we need liveness information to be computed per block */ + mir_compute_liveness(ctx); - /* We need to force r1.w live throughout a blend shader */ + /* We need to force r1.w live throughout a blend shader */ - if (ctx->inputs->is_blend) { - unsigned r1w = ~0; + if (ctx->inputs->is_blend) { + unsigned r1w = ~0; - mir_foreach_block(ctx, _block) { - midgard_block *block = (midgard_block *) _block; - mir_foreach_instr_in_block_rev(block, ins) { - if (ins->writeout) - r1w = ins->dest; - } + mir_foreach_block(ctx, _block) { + midgard_block *block = (midgard_block *)_block; + mir_foreach_instr_in_block_rev(block, ins) { + if (ins->writeout) + r1w = ins->dest; + } - if (r1w != ~0) - break; - } + if (r1w != ~0) + break; + } - mir_foreach_instr_global(ctx, ins) { - if (ins->dest < ctx->temp_count) - lcra_add_node_interference(l, ins->dest, mir_bytemask(ins), r1w, 0xF); - } - } + mir_foreach_instr_global(ctx, ins) { + if (ins->dest < ctx->temp_count) + lcra_add_node_interference(l, ins->dest, mir_bytemask(ins), r1w, + 0xF); + } + } - /* Now that every block has live_in/live_out computed, we 
can determine - * interference by walking each block linearly. Take live_out at the - * end of each block and walk the block backwards. */ + /* Now that every block has live_in/live_out computed, we can determine + * interference by walking each block linearly. Take live_out at the + * end of each block and walk the block backwards. */ - mir_foreach_block(ctx, _blk) { - midgard_block *blk = (midgard_block *) _blk; + mir_foreach_block(ctx, _blk) { + midgard_block *blk = (midgard_block *)_blk; - /* The scalar and vector units run in parallel. We need to make - * sure they don't write to same portion of the register file - * otherwise the result is undefined. Add interferences to - * avoid this situation. - */ - util_dynarray_foreach(&blk->bundles, midgard_bundle, bundle) { - midgard_instruction *instrs[2][4]; - unsigned instr_count[2] = { 0, 0 }; + /* The scalar and vector units run in parallel. We need to make + * sure they don't write to same portion of the register file + * otherwise the result is undefined. Add interferences to + * avoid this situation. + */ + util_dynarray_foreach(&blk->bundles, midgard_bundle, bundle) { + midgard_instruction *instrs[2][4]; + unsigned instr_count[2] = {0, 0}; - for (unsigned i = 0; i < bundle->instruction_count; i++) { - if (bundle->instructions[i]->unit == UNIT_VMUL || - bundle->instructions[i]->unit == UNIT_SADD) - instrs[0][instr_count[0]++] = bundle->instructions[i]; - else - instrs[1][instr_count[1]++] = bundle->instructions[i]; - } + for (unsigned i = 0; i < bundle->instruction_count; i++) { + if (bundle->instructions[i]->unit == UNIT_VMUL || + bundle->instructions[i]->unit == UNIT_SADD) + instrs[0][instr_count[0]++] = bundle->instructions[i]; + else + instrs[1][instr_count[1]++] = bundle->instructions[i]; + } - for (unsigned i = 0; i < ARRAY_SIZE(instr_count); i++) { - for (unsigned j = 0; j < instr_count[i]; j++) { - midgard_instruction *ins_a = instrs[i][j]; + for (unsigned i = 0; i < ARRAY_SIZE(instr_count); i++) { + for (unsigned j = 0; j < instr_count[i]; j++) { + midgard_instruction *ins_a = instrs[i][j]; - if (ins_a->dest >= ctx->temp_count) continue; + if (ins_a->dest >= ctx->temp_count) + continue; - for (unsigned k = j + 1; k < instr_count[i]; k++) { - midgard_instruction *ins_b = instrs[i][k]; + for (unsigned k = j + 1; k < instr_count[i]; k++) { + midgard_instruction *ins_b = instrs[i][k]; - if (ins_b->dest >= ctx->temp_count) continue; + if (ins_b->dest >= ctx->temp_count) + continue; - lcra_add_node_interference(l, ins_b->dest, - mir_bytemask(ins_b), - ins_a->dest, - mir_bytemask(ins_a)); - } - } - } - } + lcra_add_node_interference(l, ins_b->dest, + mir_bytemask(ins_b), ins_a->dest, + mir_bytemask(ins_a)); + } + } + } + } - uint16_t *live = mem_dup(_blk->live_out, ctx->temp_count * sizeof(uint16_t)); + uint16_t *live = + mem_dup(_blk->live_out, ctx->temp_count * sizeof(uint16_t)); - mir_foreach_instr_in_block_rev(blk, ins) { - /* Mark all registers live after the instruction as - * interfering with the destination */ + mir_foreach_instr_in_block_rev(blk, ins) { + /* Mark all registers live after the instruction as + * interfering with the destination */ - unsigned dest = ins->dest; + unsigned dest = ins->dest; - if (dest < ctx->temp_count) { - for (unsigned i = 0; i < ctx->temp_count; ++i) { - if (live[i]) { - unsigned mask = mir_bytemask(ins); - lcra_add_node_interference(l, dest, mask, i, live[i]); - } - } - } + if (dest < ctx->temp_count) { + for (unsigned i = 0; i < ctx->temp_count; ++i) { + if (live[i]) { + unsigned mask = 
mir_bytemask(ins); + lcra_add_node_interference(l, dest, mask, i, live[i]); + } + } + } - /* Add blend shader interference: blend shaders might - * clobber r0-r3. */ - if (ins->compact_branch && ins->writeout) { - for (unsigned i = 0; i < ctx->temp_count; ++i) { - if (!live[i]) - continue; + /* Add blend shader interference: blend shaders might + * clobber r0-r3. */ + if (ins->compact_branch && ins->writeout) { + for (unsigned i = 0; i < ctx->temp_count; ++i) { + if (!live[i]) + continue; - for (unsigned j = 0; j < 4; j++) { - lcra_add_node_interference(l, ctx->temp_count + j, - 0xFFFF, - i, live[i]); - } - } - } + for (unsigned j = 0; j < 4; j++) { + lcra_add_node_interference(l, ctx->temp_count + j, 0xFFFF, i, + live[i]); + } + } + } - /* Update live_in */ - mir_liveness_ins_update(live, ins, ctx->temp_count); - } + /* Update live_in */ + mir_liveness_ins_update(live, ins, ctx->temp_count); + } - free(live); - } + free(live); + } } static bool mir_is_64(midgard_instruction *ins) { - if (nir_alu_type_get_type_size(ins->dest_type) == 64) - return true; + if (nir_alu_type_get_type_size(ins->dest_type) == 64) + return true; - mir_foreach_src(ins, v) { - if (nir_alu_type_get_type_size(ins->src_types[v]) == 64) - return true; - } + mir_foreach_src(ins, v) { + if (nir_alu_type_get_type_size(ins->src_types[v]) == 64) + return true; + } - return false; + return false; } /* @@ -449,7 +454,7 @@ mir_is_64(midgard_instruction *ins) static bool needs_contiguous_workgroup(compiler_context *ctx) { - return gl_shader_stage_uses_workgroup(ctx->stage); + return gl_shader_stage_uses_workgroup(ctx->stage); } /* @@ -461,13 +466,13 @@ needs_contiguous_workgroup(compiler_context *ctx) static unsigned max_threads_per_workgroup(compiler_context *ctx) { - if (ctx->nir->info.workgroup_size_variable) { - return 128; - } else { - return ctx->nir->info.workgroup_size[0] * - ctx->nir->info.workgroup_size[1] * - ctx->nir->info.workgroup_size[2]; - } + if (ctx->nir->info.workgroup_size_variable) { + return 128; + } else { + return ctx->nir->info.workgroup_size[0] * + ctx->nir->info.workgroup_size[1] * + ctx->nir->info.workgroup_size[2]; + } } /* @@ -488,21 +493,21 @@ max_threads_per_workgroup(compiler_context *ctx) static unsigned max_work_registers(compiler_context *ctx) { - if (ctx->inputs->is_blend) - return 8; + if (ctx->inputs->is_blend) + return 8; - unsigned rmu_vec4 = ctx->info->push.count / 4; - unsigned max_work_registers = (rmu_vec4 >= 8) ? (24 - rmu_vec4) : 16; + unsigned rmu_vec4 = ctx->info->push.count / 4; + unsigned max_work_registers = (rmu_vec4 >= 8) ? (24 - rmu_vec4) : 16; - if (needs_contiguous_workgroup(ctx)) { - unsigned threads = max_threads_per_workgroup(ctx); - assert(threads <= 128 && "maximum threads in ABI exceeded"); + if (needs_contiguous_workgroup(ctx)) { + unsigned threads = max_threads_per_workgroup(ctx); + assert(threads <= 128 && "maximum threads in ABI exceeded"); - if (threads > 64) - max_work_registers = MIN2(max_work_registers, 8); - } + if (threads > 64) + max_work_registers = MIN2(max_work_registers, 8); + } - return max_work_registers; + return max_work_registers; } /* This routine performs the actual register allocation. 
It should be succeeded @@ -511,689 +516,693 @@ max_work_registers(compiler_context *ctx) static struct lcra_state * allocate_registers(compiler_context *ctx, bool *spilled) { - int work_count = max_work_registers(ctx); + int work_count = max_work_registers(ctx); - /* No register allocation to do with no SSA */ + /* No register allocation to do with no SSA */ - if (!ctx->temp_count) - return NULL; + if (!ctx->temp_count) + return NULL; - /* Initialize LCRA. Allocate extra node at the end for r1-r3 for - * interference */ + /* Initialize LCRA. Allocate extra node at the end for r1-r3 for + * interference */ - struct lcra_state *l = lcra_alloc_equations(ctx->temp_count + 4, 5); - unsigned node_r1 = ctx->temp_count + 1; + struct lcra_state *l = lcra_alloc_equations(ctx->temp_count + 4, 5); + unsigned node_r1 = ctx->temp_count + 1; - /* Starts of classes, in bytes */ - l->class_start[REG_CLASS_WORK] = 16 * 0; - l->class_start[REG_CLASS_LDST] = 16 * 26; - l->class_start[REG_CLASS_TEXR] = 16 * 28; - l->class_start[REG_CLASS_TEXW] = 16 * 28; + /* Starts of classes, in bytes */ + l->class_start[REG_CLASS_WORK] = 16 * 0; + l->class_start[REG_CLASS_LDST] = 16 * 26; + l->class_start[REG_CLASS_TEXR] = 16 * 28; + l->class_start[REG_CLASS_TEXW] = 16 * 28; - l->class_size[REG_CLASS_WORK] = 16 * work_count; - l->class_size[REG_CLASS_LDST] = 16 * 2; - l->class_size[REG_CLASS_TEXR] = 16 * 2; - l->class_size[REG_CLASS_TEXW] = 16 * 2; + l->class_size[REG_CLASS_WORK] = 16 * work_count; + l->class_size[REG_CLASS_LDST] = 16 * 2; + l->class_size[REG_CLASS_TEXR] = 16 * 2; + l->class_size[REG_CLASS_TEXW] = 16 * 2; - lcra_set_disjoint_class(l, REG_CLASS_TEXR, REG_CLASS_TEXW); + lcra_set_disjoint_class(l, REG_CLASS_TEXR, REG_CLASS_TEXW); - /* To save space on T*20, we don't have real texture registers. - * Instead, tex inputs reuse the load/store pipeline registers, and - * tex outputs use work r0/r1. Note we still use TEXR/TEXW classes, - * noting that this handles interferences and sizes correctly. */ + /* To save space on T*20, we don't have real texture registers. + * Instead, tex inputs reuse the load/store pipeline registers, and + * tex outputs use work r0/r1. Note we still use TEXR/TEXW classes, + * noting that this handles interferences and sizes correctly. */ - if (ctx->quirks & MIDGARD_INTERPIPE_REG_ALIASING) { - l->class_start[REG_CLASS_TEXR] = l->class_start[REG_CLASS_LDST]; - l->class_start[REG_CLASS_TEXW] = l->class_start[REG_CLASS_WORK]; - } + if (ctx->quirks & MIDGARD_INTERPIPE_REG_ALIASING) { + l->class_start[REG_CLASS_TEXR] = l->class_start[REG_CLASS_LDST]; + l->class_start[REG_CLASS_TEXW] = l->class_start[REG_CLASS_WORK]; + } - unsigned *found_class = calloc(sizeof(unsigned), ctx->temp_count); - unsigned *min_alignment = calloc(sizeof(unsigned), ctx->temp_count); - unsigned *min_bound = calloc(sizeof(unsigned), ctx->temp_count); + unsigned *found_class = calloc(sizeof(unsigned), ctx->temp_count); + unsigned *min_alignment = calloc(sizeof(unsigned), ctx->temp_count); + unsigned *min_bound = calloc(sizeof(unsigned), ctx->temp_count); - mir_foreach_instr_global(ctx, ins) { - /* Swizzles of 32-bit sources on 64-bit instructions need to be - * aligned to either bottom (xy) or top (zw). More general - * swizzle lowering should happen prior to scheduling (TODO), - * but once we get RA we shouldn't disrupt this further. Align - * sources of 64-bit instructions. 
*/ + mir_foreach_instr_global(ctx, ins) { + /* Swizzles of 32-bit sources on 64-bit instructions need to be + * aligned to either bottom (xy) or top (zw). More general + * swizzle lowering should happen prior to scheduling (TODO), + * but once we get RA we shouldn't disrupt this further. Align + * sources of 64-bit instructions. */ - if (ins->type == TAG_ALU_4 && mir_is_64(ins)) { - mir_foreach_src(ins, v) { - unsigned s = ins->src[v]; + if (ins->type == TAG_ALU_4 && mir_is_64(ins)) { + mir_foreach_src(ins, v) { + unsigned s = ins->src[v]; - if (s < ctx->temp_count) - min_alignment[s] = MAX2(3, min_alignment[s]); - } - } + if (s < ctx->temp_count) + min_alignment[s] = MAX2(3, min_alignment[s]); + } + } - if (ins->type == TAG_LOAD_STORE_4 && OP_HAS_ADDRESS(ins->op)) { - mir_foreach_src(ins, v) { - unsigned s = ins->src[v]; - unsigned size = nir_alu_type_get_type_size(ins->src_types[v]); + if (ins->type == TAG_LOAD_STORE_4 && OP_HAS_ADDRESS(ins->op)) { + mir_foreach_src(ins, v) { + unsigned s = ins->src[v]; + unsigned size = nir_alu_type_get_type_size(ins->src_types[v]); - if (s < ctx->temp_count) - min_alignment[s] = MAX2((size == 64) ? 3 : 2, min_alignment[s]); - } - } + if (s < ctx->temp_count) + min_alignment[s] = MAX2((size == 64) ? 3 : 2, min_alignment[s]); + } + } - /* Anything read as 16-bit needs proper alignment to ensure the - * resulting code can be packed. - */ - mir_foreach_src(ins, s) { - unsigned src_size = nir_alu_type_get_type_size(ins->src_types[s]); - if (src_size == 16 && ins->src[s] < SSA_FIXED_MINIMUM) - min_bound[ins->src[s]] = MAX2(min_bound[ins->src[s]], 8); - } + /* Anything read as 16-bit needs proper alignment to ensure the + * resulting code can be packed. + */ + mir_foreach_src(ins, s) { + unsigned src_size = nir_alu_type_get_type_size(ins->src_types[s]); + if (src_size == 16 && ins->src[s] < SSA_FIXED_MINIMUM) + min_bound[ins->src[s]] = MAX2(min_bound[ins->src[s]], 8); + } - /* Everything after this concerns only the destination, not the - * sources. - */ - if (ins->dest >= SSA_FIXED_MINIMUM) continue; + /* Everything after this concerns only the destination, not the + * sources. + */ + if (ins->dest >= SSA_FIXED_MINIMUM) + continue; - unsigned size = nir_alu_type_get_type_size(ins->dest_type); + unsigned size = nir_alu_type_get_type_size(ins->dest_type); - if (ins->is_pack) - size = 32; + if (ins->is_pack) + size = 32; - /* 0 for x, 1 for xy, 2 for xyz, 3 for xyzw */ - int comps1 = util_logbase2(ins->mask); + /* 0 for x, 1 for xy, 2 for xyz, 3 for xyzw */ + int comps1 = util_logbase2(ins->mask); - int bytes = (comps1 + 1) * (size / 8); + int bytes = (comps1 + 1) * (size / 8); - /* Use the largest class if there's ambiguity, this - * handles partial writes */ + /* Use the largest class if there's ambiguity, this + * handles partial writes */ - int dest = ins->dest; - found_class[dest] = MAX2(found_class[dest], bytes); + int dest = ins->dest; + found_class[dest] = MAX2(found_class[dest], bytes); - min_alignment[dest] = - MAX2(min_alignment[dest], - (size == 16) ? 1 : /* (1 << 1) = 2-byte */ - (size == 32) ? 2 : /* (1 << 2) = 4-byte */ - (size == 64) ? 3 : /* (1 << 3) = 8-byte */ - 3); /* 8-bit todo */ + min_alignment[dest] = + MAX2(min_alignment[dest], (size == 16) ? 1 : /* (1 << 1) = 2-byte */ + (size == 32) ? 2 + : /* (1 << 2) = 4-byte */ + (size == 64) ? 3 + : /* (1 << 3) = 8-byte */ + 3); /* 8-bit todo */ - /* We can't cross xy/zw boundaries. 
TODO: vec8 can */ - if (size == 16 && min_alignment[dest] != 4) - min_bound[dest] = 8; + /* We can't cross xy/zw boundaries. TODO: vec8 can */ + if (size == 16 && min_alignment[dest] != 4) + min_bound[dest] = 8; - /* We don't have a swizzle for the conditional and we don't - * want to muck with the conditional itself, so just force - * alignment for now */ + /* We don't have a swizzle for the conditional and we don't + * want to muck with the conditional itself, so just force + * alignment for now */ - if (ins->type == TAG_ALU_4 && OP_IS_CSEL_V(ins->op)) { - min_alignment[dest] = 4; /* 1 << 4= 16-byte = vec4 */ + if (ins->type == TAG_ALU_4 && OP_IS_CSEL_V(ins->op)) { + min_alignment[dest] = 4; /* 1 << 4= 16-byte = vec4 */ - /* LCRA assumes bound >= alignment */ - min_bound[dest] = 16; - } + /* LCRA assumes bound >= alignment */ + min_bound[dest] = 16; + } - /* Since ld/st swizzles and masks are 32-bit only, we need them - * aligned to enable final packing */ - if (ins->type == TAG_LOAD_STORE_4) - min_alignment[dest] = MAX2(min_alignment[dest], 2); - } + /* Since ld/st swizzles and masks are 32-bit only, we need them + * aligned to enable final packing */ + if (ins->type == TAG_LOAD_STORE_4) + min_alignment[dest] = MAX2(min_alignment[dest], 2); + } - for (unsigned i = 0; i < ctx->temp_count; ++i) { - lcra_set_alignment(l, i, min_alignment[i] ? min_alignment[i] : 2, - min_bound[i] ? min_bound[i] : 16); - lcra_restrict_range(l, i, found_class[i]); - } - - free(found_class); - free(min_alignment); - free(min_bound); + for (unsigned i = 0; i < ctx->temp_count; ++i) { + lcra_set_alignment(l, i, min_alignment[i] ? min_alignment[i] : 2, + min_bound[i] ? min_bound[i] : 16); + lcra_restrict_range(l, i, found_class[i]); + } - /* Next, we'll determine semantic class. We default to zero (work). - * But, if we're used with a special operation, that will force us to a - * particular class. Each node must be assigned to exactly one class; a - * prepass before RA should have lowered what-would-have-been - * multiclass nodes into a series of moves to break it up into multiple - * nodes (TODO) */ + free(found_class); + free(min_alignment); + free(min_bound); - mir_foreach_instr_global(ctx, ins) { - /* Check if this operation imposes any classes */ + /* Next, we'll determine semantic class. We default to zero (work). + * But, if we're used with a special operation, that will force us to a + * particular class. 
Each node must be assigned to exactly one class; a + * prepass before RA should have lowered what-would-have-been + * multiclass nodes into a series of moves to break it up into multiple + * nodes (TODO) */ - if (ins->type == TAG_LOAD_STORE_4) { - set_class(l->class, ins->src[0], REG_CLASS_LDST); - set_class(l->class, ins->src[1], REG_CLASS_LDST); - set_class(l->class, ins->src[2], REG_CLASS_LDST); - set_class(l->class, ins->src[3], REG_CLASS_LDST); + mir_foreach_instr_global(ctx, ins) { + /* Check if this operation imposes any classes */ - if (OP_IS_VEC4_ONLY(ins->op)) { - lcra_restrict_range(l, ins->dest, 16); - lcra_restrict_range(l, ins->src[0], 16); - lcra_restrict_range(l, ins->src[1], 16); - lcra_restrict_range(l, ins->src[2], 16); - lcra_restrict_range(l, ins->src[3], 16); - } - } else if (ins->type == TAG_TEXTURE_4) { - set_class(l->class, ins->dest, REG_CLASS_TEXW); - set_class(l->class, ins->src[0], REG_CLASS_TEXR); - set_class(l->class, ins->src[1], REG_CLASS_TEXR); - set_class(l->class, ins->src[2], REG_CLASS_TEXR); - set_class(l->class, ins->src[3], REG_CLASS_TEXR); - } - } + if (ins->type == TAG_LOAD_STORE_4) { + set_class(l->class, ins->src[0], REG_CLASS_LDST); + set_class(l->class, ins->src[1], REG_CLASS_LDST); + set_class(l->class, ins->src[2], REG_CLASS_LDST); + set_class(l->class, ins->src[3], REG_CLASS_LDST); - /* Check that the semantics of the class are respected */ - mir_foreach_instr_global(ctx, ins) { - assert(check_write_class(l->class, ins->type, ins->dest)); - assert(check_read_class(l->class, ins->type, ins->src[0])); - assert(check_read_class(l->class, ins->type, ins->src[1])); - assert(check_read_class(l->class, ins->type, ins->src[2])); - assert(check_read_class(l->class, ins->type, ins->src[3])); - } + if (OP_IS_VEC4_ONLY(ins->op)) { + lcra_restrict_range(l, ins->dest, 16); + lcra_restrict_range(l, ins->src[0], 16); + lcra_restrict_range(l, ins->src[1], 16); + lcra_restrict_range(l, ins->src[2], 16); + lcra_restrict_range(l, ins->src[3], 16); + } + } else if (ins->type == TAG_TEXTURE_4) { + set_class(l->class, ins->dest, REG_CLASS_TEXW); + set_class(l->class, ins->src[0], REG_CLASS_TEXR); + set_class(l->class, ins->src[1], REG_CLASS_TEXR); + set_class(l->class, ins->src[2], REG_CLASS_TEXR); + set_class(l->class, ins->src[3], REG_CLASS_TEXR); + } + } - /* Mark writeout to r0, depth to r1.x, stencil to r1.y, - * render target to r1.z, unknown to r1.w */ - mir_foreach_instr_global(ctx, ins) { - if (!(ins->compact_branch && ins->writeout)) continue; + /* Check that the semantics of the class are respected */ + mir_foreach_instr_global(ctx, ins) { + assert(check_write_class(l->class, ins->type, ins->dest)); + assert(check_read_class(l->class, ins->type, ins->src[0])); + assert(check_read_class(l->class, ins->type, ins->src[1])); + assert(check_read_class(l->class, ins->type, ins->src[2])); + assert(check_read_class(l->class, ins->type, ins->src[3])); + } - if (ins->src[0] < ctx->temp_count) - l->solutions[ins->src[0]] = 0; + /* Mark writeout to r0, depth to r1.x, stencil to r1.y, + * render target to r1.z, unknown to r1.w */ + mir_foreach_instr_global(ctx, ins) { + if (!(ins->compact_branch && ins->writeout)) + continue; - if (ins->src[2] < ctx->temp_count) - l->solutions[ins->src[2]] = (16 * 1) + COMPONENT_X * 4; + if (ins->src[0] < ctx->temp_count) + l->solutions[ins->src[0]] = 0; - if (ins->src[3] < ctx->temp_count) - l->solutions[ins->src[3]] = (16 * 1) + COMPONENT_Y * 4; + if (ins->src[2] < ctx->temp_count) + l->solutions[ins->src[2]] = (16 * 1) + 
COMPONENT_X * 4; - if (ins->src[1] < ctx->temp_count) - l->solutions[ins->src[1]] = (16 * 1) + COMPONENT_Z * 4; + if (ins->src[3] < ctx->temp_count) + l->solutions[ins->src[3]] = (16 * 1) + COMPONENT_Y * 4; - if (ins->dest < ctx->temp_count) - l->solutions[ins->dest] = (16 * 1) + COMPONENT_W * 4; - } + if (ins->src[1] < ctx->temp_count) + l->solutions[ins->src[1]] = (16 * 1) + COMPONENT_Z * 4; - /* Destinations of instructions in a writeout block cannot be assigned - * to r1 unless they are actually used as r1 from the writeout itself, - * since the writes to r1 are special. A code sequence like: - * - * sadd.fmov r1.x, [...] - * vadd.fadd r0, r1, r2 - * [writeout branch] - * - * will misbehave since the r1.x write will be interpreted as a - * gl_FragDepth write so it won't show up correctly when r1 is read in - * the following segment. We model this as interference. - */ + if (ins->dest < ctx->temp_count) + l->solutions[ins->dest] = (16 * 1) + COMPONENT_W * 4; + } - for (unsigned i = 0; i < 4; ++i) - l->solutions[ctx->temp_count + i] = (16 * i); + /* Destinations of instructions in a writeout block cannot be assigned + * to r1 unless they are actually used as r1 from the writeout itself, + * since the writes to r1 are special. A code sequence like: + * + * sadd.fmov r1.x, [...] + * vadd.fadd r0, r1, r2 + * [writeout branch] + * + * will misbehave since the r1.x write will be interpreted as a + * gl_FragDepth write so it won't show up correctly when r1 is read in + * the following segment. We model this as interference. + */ - mir_foreach_block(ctx, _blk) { - midgard_block *blk = (midgard_block *) _blk; + for (unsigned i = 0; i < 4; ++i) + l->solutions[ctx->temp_count + i] = (16 * i); - mir_foreach_bundle_in_block(blk, v) { - /* We need at least a writeout and nonwriteout instruction */ - if (v->instruction_count < 2) - continue; + mir_foreach_block(ctx, _blk) { + midgard_block *blk = (midgard_block *)_blk; - /* Branches always come at the end */ - midgard_instruction *br = v->instructions[v->instruction_count - 1]; + mir_foreach_bundle_in_block(blk, v) { + /* We need at least a writeout and nonwriteout instruction */ + if (v->instruction_count < 2) + continue; - if (!br->writeout) - continue; + /* Branches always come at the end */ + midgard_instruction *br = v->instructions[v->instruction_count - 1]; - for (signed i = v->instruction_count - 2; i >= 0; --i) { - midgard_instruction *ins = v->instructions[i]; + if (!br->writeout) + continue; - if (ins->dest >= ctx->temp_count) - continue; + for (signed i = v->instruction_count - 2; i >= 0; --i) { + midgard_instruction *ins = v->instructions[i]; - bool used_as_r1 = (br->dest == ins->dest); + if (ins->dest >= ctx->temp_count) + continue; - mir_foreach_src(br, s) - used_as_r1 |= (s > 0) && (br->src[s] == ins->dest); + bool used_as_r1 = (br->dest == ins->dest); - if (!used_as_r1) - lcra_add_node_interference(l, ins->dest, mir_bytemask(ins), node_r1, 0xFFFF); - } - } - } + mir_foreach_src(br, s) + used_as_r1 |= (s > 0) && (br->src[s] == ins->dest); - /* Precolour blend input to r0. 
Note writeout is necessarily at the end - * and blend shaders are single-RT only so there is only a single - * writeout block, so this cannot conflict with the writeout r0 (there - * is no need to have an intermediate move) */ + if (!used_as_r1) + lcra_add_node_interference(l, ins->dest, mir_bytemask(ins), + node_r1, 0xFFFF); + } + } + } - if (ctx->blend_input != ~0) { - assert(ctx->blend_input < ctx->temp_count); - l->solutions[ctx->blend_input] = 0; - } + /* Precolour blend input to r0. Note writeout is necessarily at the end + * and blend shaders are single-RT only so there is only a single + * writeout block, so this cannot conflict with the writeout r0 (there + * is no need to have an intermediate move) */ - /* Same for the dual-source blend input/output, except here we use r2, - * which is also set in the fragment shader. */ + if (ctx->blend_input != ~0) { + assert(ctx->blend_input < ctx->temp_count); + l->solutions[ctx->blend_input] = 0; + } - if (ctx->blend_src1 != ~0) { - assert(ctx->blend_src1 < ctx->temp_count); - l->solutions[ctx->blend_src1] = (16 * 2); - ctx->info->work_reg_count = MAX2(ctx->info->work_reg_count, 3); - } + /* Same for the dual-source blend input/output, except here we use r2, + * which is also set in the fragment shader. */ - mir_compute_interference(ctx, l); + if (ctx->blend_src1 != ~0) { + assert(ctx->blend_src1 < ctx->temp_count); + l->solutions[ctx->blend_src1] = (16 * 2); + ctx->info->work_reg_count = MAX2(ctx->info->work_reg_count, 3); + } - *spilled = !lcra_solve(l); - return l; + mir_compute_interference(ctx, l); + + *spilled = !lcra_solve(l); + return l; } - /* Once registers have been decided via register allocation * (allocate_registers), we need to rewrite the MIR to use registers instead of * indices */ static void -install_registers_instr( - compiler_context *ctx, - struct lcra_state *l, - midgard_instruction *ins) +install_registers_instr(compiler_context *ctx, struct lcra_state *l, + midgard_instruction *ins) { - unsigned src_shift[MIR_SRC_COUNT]; + unsigned src_shift[MIR_SRC_COUNT]; - for (unsigned i = 0; i < MIR_SRC_COUNT; ++i) { - src_shift[i] = - util_logbase2(nir_alu_type_get_type_size(ins->src_types[i]) / 8); - } + for (unsigned i = 0; i < MIR_SRC_COUNT; ++i) { + src_shift[i] = + util_logbase2(nir_alu_type_get_type_size(ins->src_types[i]) / 8); + } - unsigned dest_shift = - util_logbase2(nir_alu_type_get_type_size(ins->dest_type) / 8); + unsigned dest_shift = + util_logbase2(nir_alu_type_get_type_size(ins->dest_type) / 8); - switch (ins->type) { - case TAG_ALU_4: - case TAG_ALU_8: - case TAG_ALU_12: - case TAG_ALU_16: { - if (ins->compact_branch) - return; + switch (ins->type) { + case TAG_ALU_4: + case TAG_ALU_8: + case TAG_ALU_12: + case TAG_ALU_16: { + if (ins->compact_branch) + return; - struct phys_reg src1 = index_to_reg(ctx, l, ins->src[0], src_shift[0]); - struct phys_reg src2 = index_to_reg(ctx, l, ins->src[1], src_shift[1]); - struct phys_reg dest = index_to_reg(ctx, l, ins->dest, dest_shift); + struct phys_reg src1 = index_to_reg(ctx, l, ins->src[0], src_shift[0]); + struct phys_reg src2 = index_to_reg(ctx, l, ins->src[1], src_shift[1]); + struct phys_reg dest = index_to_reg(ctx, l, ins->dest, dest_shift); - mir_set_bytemask(ins, mir_bytemask(ins) << dest.offset); + mir_set_bytemask(ins, mir_bytemask(ins) << dest.offset); - unsigned dest_offset = - GET_CHANNEL_COUNT(alu_opcode_props[ins->op].props) ? 0 : - dest.offset; + unsigned dest_offset = + GET_CHANNEL_COUNT(alu_opcode_props[ins->op].props) ? 
0 : dest.offset; - offset_swizzle(ins->swizzle[0], src1.offset, src1.shift, dest.shift, dest_offset); - if (!ins->has_inline_constant) - offset_swizzle(ins->swizzle[1], src2.offset, src2.shift, dest.shift, dest_offset); - if (ins->src[0] != ~0) - ins->src[0] = SSA_FIXED_REGISTER(src1.reg); - if (ins->src[1] != ~0) - ins->src[1] = SSA_FIXED_REGISTER(src2.reg); - if (ins->dest != ~0) - ins->dest = SSA_FIXED_REGISTER(dest.reg); - break; - } + offset_swizzle(ins->swizzle[0], src1.offset, src1.shift, dest.shift, + dest_offset); + if (!ins->has_inline_constant) + offset_swizzle(ins->swizzle[1], src2.offset, src2.shift, dest.shift, + dest_offset); + if (ins->src[0] != ~0) + ins->src[0] = SSA_FIXED_REGISTER(src1.reg); + if (ins->src[1] != ~0) + ins->src[1] = SSA_FIXED_REGISTER(src2.reg); + if (ins->dest != ~0) + ins->dest = SSA_FIXED_REGISTER(dest.reg); + break; + } - case TAG_LOAD_STORE_4: { - /* Which physical register we read off depends on - * whether we are loading or storing -- think about the - * logical dataflow */ + case TAG_LOAD_STORE_4: { + /* Which physical register we read off depends on + * whether we are loading or storing -- think about the + * logical dataflow */ - bool encodes_src = OP_IS_STORE(ins->op); + bool encodes_src = OP_IS_STORE(ins->op); - if (encodes_src) { - struct phys_reg src = index_to_reg(ctx, l, ins->src[0], src_shift[0]); - assert(src.reg == 26 || src.reg == 27); + if (encodes_src) { + struct phys_reg src = index_to_reg(ctx, l, ins->src[0], src_shift[0]); + assert(src.reg == 26 || src.reg == 27); - ins->src[0] = SSA_FIXED_REGISTER(src.reg); - offset_swizzle(ins->swizzle[0], src.offset, src.shift, 0, 0); - } else { - struct phys_reg dst = index_to_reg(ctx, l, ins->dest, dest_shift); + ins->src[0] = SSA_FIXED_REGISTER(src.reg); + offset_swizzle(ins->swizzle[0], src.offset, src.shift, 0, 0); + } else { + struct phys_reg dst = index_to_reg(ctx, l, ins->dest, dest_shift); - ins->dest = SSA_FIXED_REGISTER(dst.reg); - offset_swizzle(ins->swizzle[0], 0, 2, dest_shift, dst.offset); - mir_set_bytemask(ins, mir_bytemask(ins) << dst.offset); - } + ins->dest = SSA_FIXED_REGISTER(dst.reg); + offset_swizzle(ins->swizzle[0], 0, 2, dest_shift, dst.offset); + mir_set_bytemask(ins, mir_bytemask(ins) << dst.offset); + } - /* We also follow up by actual arguments */ + /* We also follow up by actual arguments */ - for (int i = 1; i <= 3; i++) { - unsigned src_index = ins->src[i]; - if (src_index != ~0) { - struct phys_reg src = index_to_reg(ctx, l, src_index, src_shift[i]); - unsigned component = src.offset >> src.shift; - assert(component << src.shift == src.offset); - ins->src[i] = SSA_FIXED_REGISTER(src.reg); - ins->swizzle[i][0] += component; - } - } + for (int i = 1; i <= 3; i++) { + unsigned src_index = ins->src[i]; + if (src_index != ~0) { + struct phys_reg src = index_to_reg(ctx, l, src_index, src_shift[i]); + unsigned component = src.offset >> src.shift; + assert(component << src.shift == src.offset); + ins->src[i] = SSA_FIXED_REGISTER(src.reg); + ins->swizzle[i][0] += component; + } + } - break; - } + break; + } - case TAG_TEXTURE_4: { - if (ins->op == midgard_tex_op_barrier) - break; + case TAG_TEXTURE_4: { + if (ins->op == midgard_tex_op_barrier) + break; - /* Grab RA results */ - struct phys_reg dest = index_to_reg(ctx, l, ins->dest, dest_shift); - struct phys_reg coord = index_to_reg(ctx, l, ins->src[1], src_shift[1]); - struct phys_reg lod = index_to_reg(ctx, l, ins->src[2], src_shift[2]); - struct phys_reg offset = index_to_reg(ctx, l, ins->src[3], src_shift[3]); + /* 
Grab RA results */ + struct phys_reg dest = index_to_reg(ctx, l, ins->dest, dest_shift); + struct phys_reg coord = index_to_reg(ctx, l, ins->src[1], src_shift[1]); + struct phys_reg lod = index_to_reg(ctx, l, ins->src[2], src_shift[2]); + struct phys_reg offset = index_to_reg(ctx, l, ins->src[3], src_shift[3]); - /* First, install the texture coordinate */ - if (ins->src[1] != ~0) - ins->src[1] = SSA_FIXED_REGISTER(coord.reg); - offset_swizzle(ins->swizzle[1], coord.offset, coord.shift, dest.shift, 0); + /* First, install the texture coordinate */ + if (ins->src[1] != ~0) + ins->src[1] = SSA_FIXED_REGISTER(coord.reg); + offset_swizzle(ins->swizzle[1], coord.offset, coord.shift, dest.shift, 0); - /* Next, install the destination */ - if (ins->dest != ~0) - ins->dest = SSA_FIXED_REGISTER(dest.reg); - offset_swizzle(ins->swizzle[0], 0, 2, dest.shift, - dest_shift == 1 ? dest.offset % 8 : - dest.offset); - mir_set_bytemask(ins, mir_bytemask(ins) << dest.offset); + /* Next, install the destination */ + if (ins->dest != ~0) + ins->dest = SSA_FIXED_REGISTER(dest.reg); + offset_swizzle(ins->swizzle[0], 0, 2, dest.shift, + dest_shift == 1 ? dest.offset % 8 : dest.offset); + mir_set_bytemask(ins, mir_bytemask(ins) << dest.offset); - /* If there is a register LOD/bias, use it */ - if (ins->src[2] != ~0) { - assert(!(lod.offset & 3)); - ins->src[2] = SSA_FIXED_REGISTER(lod.reg); - ins->swizzle[2][0] = lod.offset / 4; - } + /* If there is a register LOD/bias, use it */ + if (ins->src[2] != ~0) { + assert(!(lod.offset & 3)); + ins->src[2] = SSA_FIXED_REGISTER(lod.reg); + ins->swizzle[2][0] = lod.offset / 4; + } - /* If there is an offset register, install it */ - if (ins->src[3] != ~0) { - ins->src[3] = SSA_FIXED_REGISTER(offset.reg); - ins->swizzle[3][0] = offset.offset / 4; - } + /* If there is an offset register, install it */ + if (ins->src[3] != ~0) { + ins->src[3] = SSA_FIXED_REGISTER(offset.reg); + ins->swizzle[3][0] = offset.offset / 4; + } - break; - } + break; + } - default: - break; - } + default: + break; + } } static void install_registers(compiler_context *ctx, struct lcra_state *l) { - mir_foreach_instr_global(ctx, ins) - install_registers_instr(ctx, l, ins); + mir_foreach_instr_global(ctx, ins) + install_registers_instr(ctx, l, ins); } - /* If register allocation fails, find the best spill node */ static signed -mir_choose_spill_node( - compiler_context *ctx, - struct lcra_state *l) +mir_choose_spill_node(compiler_context *ctx, struct lcra_state *l) { - /* We can't spill a previously spilled value or an unspill */ + /* We can't spill a previously spilled value or an unspill */ - mir_foreach_instr_global(ctx, ins) { - if (ins->no_spill & (1 << l->spill_class)) { - lcra_set_node_spill_cost(l, ins->dest, -1); + mir_foreach_instr_global(ctx, ins) { + if (ins->no_spill & (1 << l->spill_class)) { + lcra_set_node_spill_cost(l, ins->dest, -1); - if (l->spill_class != REG_CLASS_WORK) { - mir_foreach_src(ins, s) - lcra_set_node_spill_cost(l, ins->src[s], -1); - } - } - } + if (l->spill_class != REG_CLASS_WORK) { + mir_foreach_src(ins, s) + lcra_set_node_spill_cost(l, ins->src[s], -1); + } + } + } - return lcra_get_best_spill_node(l); + return lcra_get_best_spill_node(l); } /* Once we've chosen a spill node, spill it */ static void -mir_spill_register( - compiler_context *ctx, - unsigned spill_node, - unsigned spill_class, - unsigned *spill_count) +mir_spill_register(compiler_context *ctx, unsigned spill_node, + unsigned spill_class, unsigned *spill_count) { - if (spill_class == REG_CLASS_WORK && 
ctx->inputs->is_blend) - unreachable("Blend shader spilling is currently unimplemented"); + if (spill_class == REG_CLASS_WORK && ctx->inputs->is_blend) + unreachable("Blend shader spilling is currently unimplemented"); - unsigned spill_index = ctx->temp_count; + unsigned spill_index = ctx->temp_count; - /* We have a spill node, so check the class. Work registers - * legitimately spill to TLS, but special registers just spill to work - * registers */ + /* We have a spill node, so check the class. Work registers + * legitimately spill to TLS, but special registers just spill to work + * registers */ - bool is_special = spill_class != REG_CLASS_WORK; - bool is_special_w = spill_class == REG_CLASS_TEXW; + bool is_special = spill_class != REG_CLASS_WORK; + bool is_special_w = spill_class == REG_CLASS_TEXW; - /* Allocate TLS slot (maybe) */ - unsigned spill_slot = !is_special ? (*spill_count)++ : 0; + /* Allocate TLS slot (maybe) */ + unsigned spill_slot = !is_special ? (*spill_count)++ : 0; - /* For special reads, figure out how many bytes we need */ - unsigned read_bytemask = 0; + /* For special reads, figure out how many bytes we need */ + unsigned read_bytemask = 0; - /* If multiple instructions write to this destination, we'll have to - * fill from TLS before writing */ - unsigned write_count = 0; + /* If multiple instructions write to this destination, we'll have to + * fill from TLS before writing */ + unsigned write_count = 0; - mir_foreach_instr_global_safe(ctx, ins) { - read_bytemask |= mir_bytemask_of_read_components(ins, spill_node); - if (ins->dest == spill_node) - ++write_count; - } + mir_foreach_instr_global_safe(ctx, ins) { + read_bytemask |= mir_bytemask_of_read_components(ins, spill_node); + if (ins->dest == spill_node) + ++write_count; + } - /* For TLS, replace all stores to the spilled node. For - * special reads, just keep as-is; the class will be demoted - * implicitly. For special writes, spill to a work register */ + /* For TLS, replace all stores to the spilled node. For + * special reads, just keep as-is; the class will be demoted + * implicitly. 
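/* As a rough standalone illustration of the analysis pass just above: one
 * linear walk records which bytes of the chosen spill node are ever read (so
 * later fills can be masked down) and how many instructions write it (so
 * partial writes know they must fill from TLS first). The toy types below
 * are hypothetical, not MIR: */

struct toy_ins {
   unsigned dest;          /* destination node index                         */
   unsigned src[3];        /* source node indices, ~0u when unused           */
   unsigned read_bytemask; /* bytes read from a matching source (simplified:
                            * one mask shared by all sources)                */
};

struct spill_facts {
   unsigned read_bytemask; /* union of bytes ever read from the node  */
   unsigned write_count;   /* number of instructions writing the node */
};

static struct spill_facts
analyze_spill_node(const struct toy_ins *ins, unsigned count, unsigned node)
{
   struct spill_facts f = {0, 0};

   for (unsigned i = 0; i < count; ++i) {
      for (unsigned s = 0; s < 3; ++s) {
         if (ins[i].src[s] == node)
            f.read_bytemask |= ins[i].read_bytemask;
      }

      if (ins[i].dest == node)
         f.write_count++;
   }

   return f;
}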
For special writes, spill to a work register */ - if (!is_special || is_special_w) { - if (is_special_w) - spill_slot = spill_index++; + if (!is_special || is_special_w) { + if (is_special_w) + spill_slot = spill_index++; - unsigned last_id = ~0; - unsigned last_fill = ~0; - unsigned last_spill_index = ~0; - midgard_instruction *last_spill = NULL; + unsigned last_id = ~0; + unsigned last_fill = ~0; + unsigned last_spill_index = ~0; + midgard_instruction *last_spill = NULL; - mir_foreach_block(ctx, _block) { - midgard_block *block = (midgard_block *) _block; - mir_foreach_instr_in_block_safe(block, ins) { - if (ins->dest != spill_node) continue; + mir_foreach_block(ctx, _block) { + midgard_block *block = (midgard_block *)_block; + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->dest != spill_node) + continue; - /* Note: it's important to match the mask of the spill - * with the mask of the instruction whose destination - * we're spilling, or otherwise we'll read invalid - * components and can fail RA in a subsequent iteration - */ + /* Note: it's important to match the mask of the spill + * with the mask of the instruction whose destination + * we're spilling, or otherwise we'll read invalid + * components and can fail RA in a subsequent iteration + */ - if (is_special_w) { - midgard_instruction st = v_mov(spill_node, spill_slot); - st.no_spill |= (1 << spill_class); - st.mask = ins->mask; - st.dest_type = st.src_types[1] = ins->dest_type; + if (is_special_w) { + midgard_instruction st = v_mov(spill_node, spill_slot); + st.no_spill |= (1 << spill_class); + st.mask = ins->mask; + st.dest_type = st.src_types[1] = ins->dest_type; - /* Hint: don't rewrite this node */ - st.hint = true; + /* Hint: don't rewrite this node */ + st.hint = true; - mir_insert_instruction_after_scheduled(ctx, block, ins, st); - } else { - unsigned bundle = ins->bundle_id; - unsigned dest = (bundle == last_id)? last_spill_index : spill_index++; + mir_insert_instruction_after_scheduled(ctx, block, ins, st); + } else { + unsigned bundle = ins->bundle_id; + unsigned dest = + (bundle == last_id) ? 
last_spill_index : spill_index++; - unsigned bytemask = mir_bytemask(ins); - unsigned write_mask = mir_from_bytemask(mir_round_bytemask_up( - bytemask, 32), 32); + unsigned bytemask = mir_bytemask(ins); + unsigned write_mask = + mir_from_bytemask(mir_round_bytemask_up(bytemask, 32), 32); - if (write_count > 1 && bytemask != 0xFFFF && bundle != last_fill) { - midgard_instruction read = - v_load_store_scratch(dest, spill_slot, false, 0xF); - mir_insert_instruction_before_scheduled(ctx, block, ins, read); - write_mask = 0xF; - last_fill = bundle; - } + if (write_count > 1 && bytemask != 0xFFFF && + bundle != last_fill) { + midgard_instruction read = + v_load_store_scratch(dest, spill_slot, false, 0xF); + mir_insert_instruction_before_scheduled(ctx, block, ins, + read); + write_mask = 0xF; + last_fill = bundle; + } - ins->dest = dest; - ins->no_spill |= (1 << spill_class); + ins->dest = dest; + ins->no_spill |= (1 << spill_class); - bool move = false; + bool move = false; - /* In the same bundle, reads of the destination - * of the spilt instruction need to be direct */ - midgard_instruction *it = ins; - while ((it = list_first_entry(&it->link, midgard_instruction, link)) - && (it->bundle_id == bundle)) { + /* In the same bundle, reads of the destination + * of the spilt instruction need to be direct */ + midgard_instruction *it = ins; + while ((it = list_first_entry(&it->link, midgard_instruction, + link)) && + (it->bundle_id == bundle)) { - if (!mir_has_arg(it, spill_node)) continue; + if (!mir_has_arg(it, spill_node)) + continue; - mir_rewrite_index_src_single(it, spill_node, dest); + mir_rewrite_index_src_single(it, spill_node, dest); - /* The spilt instruction will write to - * a work register for `it` to read but - * the spill needs an LD/ST register */ - move = true; - } + /* The spilt instruction will write to + * a work register for `it` to read but + * the spill needs an LD/ST register */ + move = true; + } - if (move) - dest = spill_index++; + if (move) + dest = spill_index++; - if (last_id == bundle) { - last_spill->mask |= write_mask; - u_foreach_bit(c, write_mask) - last_spill->swizzle[0][c] = c; - } else { - midgard_instruction st = - v_load_store_scratch(dest, spill_slot, true, write_mask); - last_spill = mir_insert_instruction_after_scheduled(ctx, block, ins, st); - } + if (last_id == bundle) { + last_spill->mask |= write_mask; + u_foreach_bit(c, write_mask) + last_spill->swizzle[0][c] = c; + } else { + midgard_instruction st = + v_load_store_scratch(dest, spill_slot, true, write_mask); + last_spill = mir_insert_instruction_after_scheduled( + ctx, block, ins, st); + } - if (move) { - midgard_instruction mv = v_mov(ins->dest, dest); - mv.no_spill |= (1 << spill_class); + if (move) { + midgard_instruction mv = v_mov(ins->dest, dest); + mv.no_spill |= (1 << spill_class); - mir_insert_instruction_after_scheduled(ctx, block, ins, mv); - } + mir_insert_instruction_after_scheduled(ctx, block, ins, mv); + } - last_id = bundle; - last_spill_index = ins->dest; - } + last_id = bundle; + last_spill_index = ins->dest; + } - if (!is_special) - ctx->spills++; - } - } - } + if (!is_special) + ctx->spills++; + } + } + } - /* Insert a load from TLS before the first consecutive - * use of the node, rewriting to use spilled indices to - * break up the live range. Or, for special, insert a - * move. Ironically the latter *increases* register - * pressure, but the two uses of the spilling mechanism - * are somewhat orthogonal. 
(special spilling is to use - * work registers to back special registers; TLS - * spilling is to use memory to back work registers) */ + /* Insert a load from TLS before the first consecutive + * use of the node, rewriting to use spilled indices to + * break up the live range. Or, for special, insert a + * move. Ironically the latter *increases* register + * pressure, but the two uses of the spilling mechanism + * are somewhat orthogonal. (special spilling is to use + * work registers to back special registers; TLS + * spilling is to use memory to back work registers) */ - mir_foreach_block(ctx, _block) { - midgard_block *block = (midgard_block *) _block; - mir_foreach_instr_in_block(block, ins) { - /* We can't rewrite the moves used to spill in the - * first place. These moves are hinted. */ - if (ins->hint) continue; + mir_foreach_block(ctx, _block) { + midgard_block *block = (midgard_block *)_block; + mir_foreach_instr_in_block(block, ins) { + /* We can't rewrite the moves used to spill in the + * first place. These moves are hinted. */ + if (ins->hint) + continue; - /* If we don't use the spilled value, nothing to do */ - if (!mir_has_arg(ins, spill_node)) continue; + /* If we don't use the spilled value, nothing to do */ + if (!mir_has_arg(ins, spill_node)) + continue; - unsigned index = 0; + unsigned index = 0; - if (!is_special_w) { - index = ++spill_index; + if (!is_special_w) { + index = ++spill_index; - midgard_instruction *before = ins; - midgard_instruction st; + midgard_instruction *before = ins; + midgard_instruction st; - if (is_special) { - /* Move */ - st = v_mov(spill_node, index); - st.no_spill |= (1 << spill_class); - } else { - /* TLS load */ - st = v_load_store_scratch(index, spill_slot, false, 0xF); - } + if (is_special) { + /* Move */ + st = v_mov(spill_node, index); + st.no_spill |= (1 << spill_class); + } else { + /* TLS load */ + st = v_load_store_scratch(index, spill_slot, false, 0xF); + } - /* Mask the load based on the component count - * actually needed to prevent RA loops */ + /* Mask the load based on the component count + * actually needed to prevent RA loops */ - st.mask = mir_from_bytemask(mir_round_bytemask_up( - read_bytemask, 32), 32); + st.mask = + mir_from_bytemask(mir_round_bytemask_up(read_bytemask, 32), 32); - mir_insert_instruction_before_scheduled(ctx, block, before, st); - } else { - /* Special writes already have their move spilled in */ - index = spill_slot; - } + mir_insert_instruction_before_scheduled(ctx, block, before, st); + } else { + /* Special writes already have their move spilled in */ + index = spill_slot; + } + /* Rewrite to use */ + mir_rewrite_index_src_single(ins, spill_node, index); - /* Rewrite to use */ - mir_rewrite_index_src_single(ins, spill_node, index); + if (!is_special) + ctx->fills++; + } + } - if (!is_special) - ctx->fills++; - } - } + /* Reset hints */ - /* Reset hints */ - - mir_foreach_instr_global(ctx, ins) { - ins->hint = false; - } + mir_foreach_instr_global(ctx, ins) { + ins->hint = false; + } } static void mir_demote_uniforms(compiler_context *ctx, unsigned new_cutoff) { - unsigned uniforms = ctx->info->push.count / 4; - unsigned old_work_count = 16 - MAX2(uniforms - 8, 0); - unsigned work_count = 16 - MAX2((new_cutoff - 8), 0); + unsigned uniforms = ctx->info->push.count / 4; + unsigned old_work_count = 16 - MAX2(uniforms - 8, 0); + unsigned work_count = 16 - MAX2((new_cutoff - 8), 0); - unsigned min_demote = SSA_FIXED_REGISTER(old_work_count); - unsigned max_demote = SSA_FIXED_REGISTER(work_count); + 
unsigned min_demote = SSA_FIXED_REGISTER(old_work_count); + unsigned max_demote = SSA_FIXED_REGISTER(work_count); - mir_foreach_block(ctx, _block) { - midgard_block *block = (midgard_block *) _block; - mir_foreach_instr_in_block(block, ins) { - mir_foreach_src(ins, i) { - if (ins->src[i] < min_demote || ins->src[i] >= max_demote) - continue; + mir_foreach_block(ctx, _block) { + midgard_block *block = (midgard_block *)_block; + mir_foreach_instr_in_block(block, ins) { + mir_foreach_src(ins, i) { + if (ins->src[i] < min_demote || ins->src[i] >= max_demote) + continue; - midgard_instruction *before = ins; + midgard_instruction *before = ins; - unsigned temp = make_compiler_temp(ctx); - unsigned idx = (23 - SSA_REG_FROM_FIXED(ins->src[i])) * 4; - assert(idx < ctx->info->push.count); + unsigned temp = make_compiler_temp(ctx); + unsigned idx = (23 - SSA_REG_FROM_FIXED(ins->src[i])) * 4; + assert(idx < ctx->info->push.count); - ctx->ubo_mask |= BITSET_BIT(ctx->info->push.words[idx].ubo); + ctx->ubo_mask |= BITSET_BIT(ctx->info->push.words[idx].ubo); - midgard_instruction ld = { - .type = TAG_LOAD_STORE_4, - .mask = 0xF, - .dest = temp, - .dest_type = ins->src_types[i], - .src = { ~0, ~0, ~0, ~0 }, - .swizzle = SWIZZLE_IDENTITY_4, - .op = midgard_op_ld_ubo_128, - .load_store = { - .index_reg = REGISTER_LDST_ZERO, - }, - .constants.u32[0] = ctx->info->push.words[idx].offset, - }; + midgard_instruction ld = { + .type = TAG_LOAD_STORE_4, + .mask = 0xF, + .dest = temp, + .dest_type = ins->src_types[i], + .src = {~0, ~0, ~0, ~0}, + .swizzle = SWIZZLE_IDENTITY_4, + .op = midgard_op_ld_ubo_128, + .load_store = + { + .index_reg = REGISTER_LDST_ZERO, + }, + .constants.u32[0] = ctx->info->push.words[idx].offset, + }; - midgard_pack_ubo_index_imm(&ld.load_store, - ctx->info->push.words[idx].ubo); + midgard_pack_ubo_index_imm(&ld.load_store, + ctx->info->push.words[idx].ubo); - mir_insert_instruction_before_scheduled(ctx, block, before, ld); + mir_insert_instruction_before_scheduled(ctx, block, before, ld); - mir_rewrite_index_src_single(ins, ins->src[i], temp); - } - } - } + mir_rewrite_index_src_single(ins, ins->src[i], temp); + } + } + } - ctx->info->push.count = MIN2(ctx->info->push.count, new_cutoff * 4); + ctx->info->push.count = MIN2(ctx->info->push.count, new_cutoff * 4); } /* Run register allocation in a loop, spilling until we succeed */ @@ -1201,56 +1210,57 @@ mir_demote_uniforms(compiler_context *ctx, unsigned new_cutoff) void mir_ra(compiler_context *ctx) { - struct lcra_state *l = NULL; - bool spilled = false; - int iter_count = 1000; /* max iterations */ + struct lcra_state *l = NULL; + bool spilled = false; + int iter_count = 1000; /* max iterations */ - /* Number of 128-bit slots in memory we've spilled into */ - unsigned spill_count = DIV_ROUND_UP(ctx->info->tls_size, 16); + /* Number of 128-bit slots in memory we've spilled into */ + unsigned spill_count = DIV_ROUND_UP(ctx->info->tls_size, 16); + mir_create_pipeline_registers(ctx); - mir_create_pipeline_registers(ctx); + do { + if (spilled) { + signed spill_node = mir_choose_spill_node(ctx, l); + unsigned uniforms = ctx->info->push.count / 4; - do { - if (spilled) { - signed spill_node = mir_choose_spill_node(ctx, l); - unsigned uniforms = ctx->info->push.count / 4; + /* It's a lot cheaper to demote uniforms to get more + * work registers than to spill to TLS. 
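/* The arithmetic in mir_demote_uniforms above implies the trade-off this
 * branch is about to exploit: only pushed (vec4) uniforms beyond the first 8
 * eat into the 16-entry work-register budget, so demoting them back to UBO
 * loads frees work registers without touching TLS. A standalone restatement
 * of that formula (hypothetical helper name): */

static unsigned
work_registers_available(unsigned pushed_vec4_uniforms)
{
   /* Mirrors "16 - MAX2(uniforms - 8, 0)" from mir_demote_uniforms:
    * 8 uniforms -> 16 work regs, 12 -> 12, 16 -> 8 */
   unsigned over = pushed_vec4_uniforms > 8 ? pushed_vec4_uniforms - 8 : 0;
   return 16 - over;
}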
*/ + if (l->spill_class == REG_CLASS_WORK && uniforms > 8) { + mir_demote_uniforms(ctx, MAX2(uniforms - 4, 8)); + } else if (spill_node == -1) { + fprintf(stderr, "ERROR: Failed to choose spill node\n"); + lcra_free(l); + return; + } else { + mir_spill_register(ctx, spill_node, l->spill_class, &spill_count); + } + } - /* It's a lot cheaper to demote uniforms to get more - * work registers than to spill to TLS. */ - if (l->spill_class == REG_CLASS_WORK && uniforms > 8) { - mir_demote_uniforms(ctx, MAX2(uniforms - 4, 8)); - } else if (spill_node == -1) { - fprintf(stderr, "ERROR: Failed to choose spill node\n"); - lcra_free(l); - return; - } else { - mir_spill_register(ctx, spill_node, l->spill_class, &spill_count); - } - } + mir_squeeze_index(ctx); + mir_invalidate_liveness(ctx); - mir_squeeze_index(ctx); - mir_invalidate_liveness(ctx); + if (l) { + lcra_free(l); + l = NULL; + } - if (l) { - lcra_free(l); - l = NULL; - } + l = allocate_registers(ctx, &spilled); + } while (spilled && ((iter_count--) > 0)); - l = allocate_registers(ctx, &spilled); - } while(spilled && ((iter_count--) > 0)); + if (iter_count <= 0) { + fprintf( + stderr, + "panfrost: Gave up allocating registers, rendering will be incomplete\n"); + assert(0); + } - if (iter_count <= 0) { - fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n"); - assert(0); - } + /* Report spilling information. spill_count is in 128-bit slots (vec4 x + * fp32), but tls_size is in bytes, so multiply by 16 */ - /* Report spilling information. spill_count is in 128-bit slots (vec4 x - * fp32), but tls_size is in bytes, so multiply by 16 */ + ctx->info->tls_size = spill_count * 16; - ctx->info->tls_size = spill_count * 16; + install_registers(ctx, l); - install_registers(ctx, l); - - lcra_free(l); + lcra_free(l); } diff --git a/src/panfrost/midgard/midgard_ra_pipeline.c b/src/panfrost/midgard/midgard_ra_pipeline.c index 6f21ee1a699..3b0d07f2966 100644 --- a/src/panfrost/midgard/midgard_ra_pipeline.c +++ b/src/panfrost/midgard/midgard_ra_pipeline.c @@ -39,106 +39,108 @@ */ static bool -mir_pipeline_ins( - compiler_context *ctx, - midgard_block *block, - midgard_bundle *bundle, unsigned i, - unsigned pipeline_count) +mir_pipeline_ins(compiler_context *ctx, midgard_block *block, + midgard_bundle *bundle, unsigned i, unsigned pipeline_count) { - midgard_instruction *ins = bundle->instructions[i]; + midgard_instruction *ins = bundle->instructions[i]; - /* Our goal is to create a pipeline register. Pipeline registers are - * created at the start of the bundle and are destroyed at the end. So - * we conservatively require: - * - * 1. Each component read in the second stage is written in the first stage. - * 2. The index is not live after the bundle. - * 3. We're not a special index (writeout, conditionals, ..) - * - * Rationale: #1 ensures that there is no need to go before the - * creation of the bundle, so the pipeline register can exist. #2 is - * since the pipeline register will be destroyed at the end. This - * ensures that nothing will try to read/write the pipeline register - * once it is not live, and that there's no need to go earlier. */ + /* Our goal is to create a pipeline register. Pipeline registers are + * created at the start of the bundle and are destroyed at the end. So + * we conservatively require: + * + * 1. Each component read in the second stage is written in the first stage. + * 2. The index is not live after the bundle. + * 3. We're not a special index (writeout, conditionals, ..) 
+ * + * Rationale: #1 ensures that there is no need to go before the + * creation of the bundle, so the pipeline register can exist. #2 is + * since the pipeline register will be destroyed at the end. This + * ensures that nothing will try to read/write the pipeline register + * once it is not live, and that there's no need to go earlier. */ - unsigned node = ins->dest; - unsigned read_mask = 0; + unsigned node = ins->dest; + unsigned read_mask = 0; - if (node >= SSA_FIXED_MINIMUM) - return false; + if (node >= SSA_FIXED_MINIMUM) + return false; - if (node == ctx->blend_src1) - return false; + if (node == ctx->blend_src1) + return false; - /* Analyze the bundle for a per-byte read mask */ + /* Analyze the bundle for a per-byte read mask */ - for (unsigned j = 0; j < bundle->instruction_count; ++j) { - midgard_instruction *q = bundle->instructions[j]; + for (unsigned j = 0; j < bundle->instruction_count; ++j) { + midgard_instruction *q = bundle->instructions[j]; - /* The fragment colour can't be pipelined (well, it is - * pipelined in r0, but this is a delicate dance with - * scheduling and RA, not for us to worry about) */ + /* The fragment colour can't be pipelined (well, it is + * pipelined in r0, but this is a delicate dance with + * scheduling and RA, not for us to worry about) */ - if (q->compact_branch && q->writeout && mir_has_arg(q, node)) - return false; + if (q->compact_branch && q->writeout && mir_has_arg(q, node)) + return false; - if (q->unit < UNIT_VADD) continue; - read_mask |= mir_bytemask_of_read_components(q, node); - } + if (q->unit < UNIT_VADD) + continue; + read_mask |= mir_bytemask_of_read_components(q, node); + } - /* Now check what's written in the beginning stage */ - for (unsigned j = 0; j < bundle->instruction_count; ++j) { - midgard_instruction *q = bundle->instructions[j]; - if (q->unit >= UNIT_VADD) break; - if (q->dest != node) continue; + /* Now check what's written in the beginning stage */ + for (unsigned j = 0; j < bundle->instruction_count; ++j) { + midgard_instruction *q = bundle->instructions[j]; + if (q->unit >= UNIT_VADD) + break; + if (q->dest != node) + continue; - /* Remove the written mask from the read requirements */ - read_mask &= ~mir_bytemask(q); - } + /* Remove the written mask from the read requirements */ + read_mask &= ~mir_bytemask(q); + } - /* Check for leftovers */ - if (read_mask) - return false; + /* Check for leftovers */ + if (read_mask) + return false; - /* We want to know if we live after this bundle, so check if - * we're live after the last instruction of the bundle */ + /* We want to know if we live after this bundle, so check if + * we're live after the last instruction of the bundle */ - midgard_instruction *end = bundle->instructions[ - bundle->instruction_count - 1]; + midgard_instruction *end = + bundle->instructions[bundle->instruction_count - 1]; - if (mir_is_live_after(ctx, block, end, ins->dest)) - return false; + if (mir_is_live_after(ctx, block, end, ins->dest)) + return false; - /* We're only live in this bundle -- pipeline! */ - unsigned preg = SSA_FIXED_REGISTER(24 + pipeline_count); + /* We're only live in this bundle -- pipeline! 
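/* Condition #1 above reduces to a byte-mask subtraction: every byte the
 * second pipeline stage reads of the candidate node must have been written by
 * the first stage inside the same bundle, since a pipeline register cannot
 * carry values in from outside. A toy restatement (hypothetical helper): */

#include <stdbool.h>

static bool
covered_by_first_stage(unsigned second_stage_read_mask,
                       unsigned first_stage_write_mask)
{
   /* Leftover bytes would have to come from before the bundle */
   return (second_stage_read_mask & ~first_stage_write_mask) == 0;
}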
*/ + unsigned preg = SSA_FIXED_REGISTER(24 + pipeline_count); - for (unsigned j = 0; j < bundle->instruction_count; ++j) { - midgard_instruction *q = bundle->instructions[j]; + for (unsigned j = 0; j < bundle->instruction_count; ++j) { + midgard_instruction *q = bundle->instructions[j]; - if (q->unit >= UNIT_VADD) - mir_rewrite_index_src_single(q, node, preg); - else - mir_rewrite_index_dst_single(q, node, preg); - } + if (q->unit >= UNIT_VADD) + mir_rewrite_index_src_single(q, node, preg); + else + mir_rewrite_index_dst_single(q, node, preg); + } - return true; + return true; } void mir_create_pipeline_registers(compiler_context *ctx) { - mir_invalidate_liveness(ctx); + mir_invalidate_liveness(ctx); - mir_foreach_block(ctx, _block) { - midgard_block *block = (midgard_block *) _block; + mir_foreach_block(ctx, _block) { + midgard_block *block = (midgard_block *)_block; - mir_foreach_bundle_in_block(block, bundle) { - if (!mir_is_alu_bundle(bundle)) continue; - if (bundle->instruction_count < 2) continue; + mir_foreach_bundle_in_block(block, bundle) { + if (!mir_is_alu_bundle(bundle)) + continue; + if (bundle->instruction_count < 2) + continue; - /* Only first 2 instructions could pipeline */ - bool succ = mir_pipeline_ins(ctx, block, bundle, 0, 0); - mir_pipeline_ins(ctx, block, bundle, 1, succ); - } - } + /* Only first 2 instructions could pipeline */ + bool succ = mir_pipeline_ins(ctx, block, bundle, 0, 0); + mir_pipeline_ins(ctx, block, bundle, 1, succ); + } + } } diff --git a/src/panfrost/midgard/midgard_schedule.c b/src/panfrost/midgard/midgard_schedule.c index 078c30fb54d..ad10fbbc94d 100644 --- a/src/panfrost/midgard/midgard_schedule.c +++ b/src/panfrost/midgard/midgard_schedule.c @@ -22,12 +22,12 @@ * SOFTWARE. */ +#include "util/half_float.h" +#include "util/u_math.h" +#include "util/u_memory.h" #include "compiler.h" #include "midgard_ops.h" #include "midgard_quirks.h" -#include "util/u_memory.h" -#include "util/u_math.h" -#include "util/half_float.h" /* Scheduling for Midgard is complicated, to say the least. 
ALU instructions * must be grouped into VLIW bundles according to following model: @@ -63,148 +63,159 @@ #define BYTE_COUNT 16 static void -add_dependency(struct util_dynarray *table, unsigned index, uint16_t mask, midgard_instruction **instructions, unsigned child) +add_dependency(struct util_dynarray *table, unsigned index, uint16_t mask, + midgard_instruction **instructions, unsigned child) { - for (unsigned i = 0; i < BYTE_COUNT; ++i) { - if (!(mask & (1 << i))) - continue; + for (unsigned i = 0; i < BYTE_COUNT; ++i) { + if (!(mask & (1 << i))) + continue; - struct util_dynarray *parents = &table[(BYTE_COUNT * index) + i]; + struct util_dynarray *parents = &table[(BYTE_COUNT * index) + i]; - util_dynarray_foreach(parents, unsigned, parent) { - BITSET_WORD *dependents = instructions[*parent]->dependents; + util_dynarray_foreach(parents, unsigned, parent) { + BITSET_WORD *dependents = instructions[*parent]->dependents; - /* Already have the dependency */ - if (BITSET_TEST(dependents, child)) - continue; + /* Already have the dependency */ + if (BITSET_TEST(dependents, child)) + continue; - BITSET_SET(dependents, child); - instructions[child]->nr_dependencies++; - } - } + BITSET_SET(dependents, child); + instructions[child]->nr_dependencies++; + } + } } static void -mark_access(struct util_dynarray *table, unsigned index, uint16_t mask, unsigned parent) +mark_access(struct util_dynarray *table, unsigned index, uint16_t mask, + unsigned parent) { - for (unsigned i = 0; i < BYTE_COUNT; ++i) { - if (!(mask & (1 << i))) - continue; + for (unsigned i = 0; i < BYTE_COUNT; ++i) { + if (!(mask & (1 << i))) + continue; - util_dynarray_append(&table[(BYTE_COUNT * index) + i], unsigned, parent); - } + util_dynarray_append(&table[(BYTE_COUNT * index) + i], unsigned, parent); + } } static void -mir_create_dependency_graph(midgard_instruction **instructions, unsigned count, unsigned node_count) +mir_create_dependency_graph(midgard_instruction **instructions, unsigned count, + unsigned node_count) { - size_t sz = node_count * BYTE_COUNT; + size_t sz = node_count * BYTE_COUNT; - struct util_dynarray *last_read = calloc(sizeof(struct util_dynarray), sz); - struct util_dynarray *last_write = calloc(sizeof(struct util_dynarray), sz); + struct util_dynarray *last_read = calloc(sizeof(struct util_dynarray), sz); + struct util_dynarray *last_write = calloc(sizeof(struct util_dynarray), sz); - for (unsigned i = 0; i < sz; ++i) { - util_dynarray_init(&last_read[i], NULL); - util_dynarray_init(&last_write[i], NULL); - } + for (unsigned i = 0; i < sz; ++i) { + util_dynarray_init(&last_read[i], NULL); + util_dynarray_init(&last_write[i], NULL); + } - /* Initialize dependency graph */ - for (unsigned i = 0; i < count; ++i) { - instructions[i]->dependents = - calloc(BITSET_WORDS(count), sizeof(BITSET_WORD)); + /* Initialize dependency graph */ + for (unsigned i = 0; i < count; ++i) { + instructions[i]->dependents = + calloc(BITSET_WORDS(count), sizeof(BITSET_WORD)); - instructions[i]->nr_dependencies = 0; - } + instructions[i]->nr_dependencies = 0; + } - unsigned prev_ldst[3] = {~0, ~0, ~0}; + unsigned prev_ldst[3] = {~0, ~0, ~0}; - /* Populate dependency graph */ - for (signed i = count - 1; i >= 0; --i) { - if (instructions[i]->compact_branch) - continue; + /* Populate dependency graph */ + for (signed i = count - 1; i >= 0; --i) { + if (instructions[i]->compact_branch) + continue; - unsigned dest = instructions[i]->dest; - unsigned mask = mir_bytemask(instructions[i]); + unsigned dest = instructions[i]->dest; + 
unsigned mask = mir_bytemask(instructions[i]); - mir_foreach_src((*instructions), s) { - unsigned src = instructions[i]->src[s]; + mir_foreach_src((*instructions), s) { + unsigned src = instructions[i]->src[s]; - if (src < node_count) { - unsigned readmask = mir_bytemask_of_read_components(instructions[i], src); - add_dependency(last_write, src, readmask, instructions, i); - } - } + if (src < node_count) { + unsigned readmask = + mir_bytemask_of_read_components(instructions[i], src); + add_dependency(last_write, src, readmask, instructions, i); + } + } - /* Create a list of dependencies for each type of load/store - * instruction to prevent reordering. */ - if (instructions[i]->type == TAG_LOAD_STORE_4 && - load_store_opcode_props[instructions[i]->op].props & LDST_ADDRESS) { + /* Create a list of dependencies for each type of load/store + * instruction to prevent reordering. */ + if (instructions[i]->type == TAG_LOAD_STORE_4 && + load_store_opcode_props[instructions[i]->op].props & LDST_ADDRESS) { - unsigned type = instructions[i]->load_store.arg_reg | - instructions[i]->load_store.arg_comp; + unsigned type = instructions[i]->load_store.arg_reg | + instructions[i]->load_store.arg_comp; - unsigned idx; - switch (type) { - case LDST_SHARED: idx = 0; break; - case LDST_SCRATCH: idx = 1; break; - default: idx = 2; break; - } + unsigned idx; + switch (type) { + case LDST_SHARED: + idx = 0; + break; + case LDST_SCRATCH: + idx = 1; + break; + default: + idx = 2; + break; + } - unsigned prev = prev_ldst[idx]; + unsigned prev = prev_ldst[idx]; - if (prev != ~0) { - BITSET_WORD *dependents = instructions[prev]->dependents; + if (prev != ~0) { + BITSET_WORD *dependents = instructions[prev]->dependents; - /* Already have the dependency */ - if (BITSET_TEST(dependents, i)) - continue; + /* Already have the dependency */ + if (BITSET_TEST(dependents, i)) + continue; - BITSET_SET(dependents, i); - instructions[i]->nr_dependencies++; - } + BITSET_SET(dependents, i); + instructions[i]->nr_dependencies++; + } - prev_ldst[idx] = i; - } + prev_ldst[idx] = i; + } - if (dest < node_count) { - add_dependency(last_read, dest, mask, instructions, i); - add_dependency(last_write, dest, mask, instructions, i); - mark_access(last_write, dest, mask, i); - } + if (dest < node_count) { + add_dependency(last_read, dest, mask, instructions, i); + add_dependency(last_write, dest, mask, instructions, i); + mark_access(last_write, dest, mask, i); + } - mir_foreach_src((*instructions), s) { - unsigned src = instructions[i]->src[s]; + mir_foreach_src((*instructions), s) { + unsigned src = instructions[i]->src[s]; - if (src < node_count) { - unsigned readmask = mir_bytemask_of_read_components(instructions[i], src); - mark_access(last_read, src, readmask, i); - } - } - } + if (src < node_count) { + unsigned readmask = + mir_bytemask_of_read_components(instructions[i], src); + mark_access(last_read, src, readmask, i); + } + } + } - /* If there is a branch, all instructions depend on it, as interblock - * execution must be purely in-order */ + /* If there is a branch, all instructions depend on it, as interblock + * execution must be purely in-order */ - if (instructions[count - 1]->compact_branch) { - BITSET_WORD *dependents = instructions[count - 1]->dependents; + if (instructions[count - 1]->compact_branch) { + BITSET_WORD *dependents = instructions[count - 1]->dependents; - for (signed i = count - 2; i >= 0; --i) { - if (BITSET_TEST(dependents, i)) - continue; + for (signed i = count - 2; i >= 0; --i) { + if 
(BITSET_TEST(dependents, i)) + continue; - BITSET_SET(dependents, i); - instructions[i]->nr_dependencies++; - } - } + BITSET_SET(dependents, i); + instructions[i]->nr_dependencies++; + } + } - /* Free the intermediate structures */ - for (unsigned i = 0; i < sz; ++i) { - util_dynarray_fini(&last_read[i]); - util_dynarray_fini(&last_write[i]); - } + /* Free the intermediate structures */ + for (unsigned i = 0; i < sz; ++i) { + util_dynarray_fini(&last_read[i]); + util_dynarray_fini(&last_write[i]); + } - free(last_read); - free(last_write); + free(last_read); + free(last_write); } /* Does the mask cover more than a scalar? */ @@ -212,14 +223,14 @@ mir_create_dependency_graph(midgard_instruction **instructions, unsigned count, static bool is_single_component_mask(unsigned mask) { - int components = 0; + int components = 0; - for (int c = 0; c < 8; ++c) { - if (mask & (1 << c)) - components++; - } + for (int c = 0; c < 8; ++c) { + if (mask & (1 << c)) + components++; + } - return components == 1; + return components == 1; } /* Helpers for scheudling */ @@ -227,29 +238,30 @@ is_single_component_mask(unsigned mask) static bool mir_is_scalar(midgard_instruction *ains) { - /* Do we try to use it as a vector op? */ - if (!is_single_component_mask(ains->mask)) - return false; + /* Do we try to use it as a vector op? */ + if (!is_single_component_mask(ains->mask)) + return false; - /* Otherwise, check mode hazards */ - bool could_scalar = true; - unsigned szd = nir_alu_type_get_type_size(ains->dest_type); - unsigned sz0 = nir_alu_type_get_type_size(ains->src_types[0]); - unsigned sz1 = nir_alu_type_get_type_size(ains->src_types[1]); + /* Otherwise, check mode hazards */ + bool could_scalar = true; + unsigned szd = nir_alu_type_get_type_size(ains->dest_type); + unsigned sz0 = nir_alu_type_get_type_size(ains->src_types[0]); + unsigned sz1 = nir_alu_type_get_type_size(ains->src_types[1]); - /* Only 16/32-bit can run on a scalar unit */ - could_scalar &= (szd == 16) || (szd == 32); + /* Only 16/32-bit can run on a scalar unit */ + could_scalar &= (szd == 16) || (szd == 32); - if (ains->src[0] != ~0) - could_scalar &= (sz0 == 16) || (sz0 == 32); + if (ains->src[0] != ~0) + could_scalar &= (sz0 == 16) || (sz0 == 32); - if (ains->src[1] != ~0) - could_scalar &= (sz1 == 16) || (sz1 == 32); + if (ains->src[1] != ~0) + could_scalar &= (sz1 == 16) || (sz1 == 32); - if (midgard_is_integer_out_op(ains->op) && ains->outmod != midgard_outmod_keeplo) - return false; + if (midgard_is_integer_out_op(ains->op) && + ains->outmod != midgard_outmod_keeplo) + return false; - return could_scalar; + return could_scalar; } /* How many bytes does this ALU instruction add to the bundle? 
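/* Aside on is_single_component_mask() above: counting bits and comparing
 * against 1 is the same as asking whether the mask is a nonzero power of
 * two, so an equivalent formulation (sketch only) would be: */

#include <stdbool.h>

static bool
is_single_component_mask_alt(unsigned mask)
{
   /* Exactly one bit set <=> nonzero, and clearing the lowest set bit
    * leaves nothing */
   return mask != 0 && (mask & (mask - 1)) == 0;
}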
*/ @@ -257,14 +269,14 @@ mir_is_scalar(midgard_instruction *ains) static unsigned bytes_for_instruction(midgard_instruction *ains) { - if (ains->unit & UNITS_ANY_VECTOR) - return sizeof(midgard_reg_info) + sizeof(midgard_vector_alu); - else if (ains->unit == ALU_ENAB_BRANCH) - return sizeof(midgard_branch_extended); - else if (ains->compact_branch) - return sizeof(uint16_t); - else - return sizeof(midgard_reg_info) + sizeof(midgard_scalar_alu); + if (ains->unit & UNITS_ANY_VECTOR) + return sizeof(midgard_reg_info) + sizeof(midgard_vector_alu); + else if (ains->unit == ALU_ENAB_BRANCH) + return sizeof(midgard_branch_extended); + else if (ains->compact_branch) + return sizeof(uint16_t); + else + return sizeof(midgard_reg_info) + sizeof(midgard_scalar_alu); } /* We would like to flatten the linked list of midgard_instructions in a bundle @@ -273,32 +285,33 @@ bytes_for_instruction(midgard_instruction *ains) static midgard_instruction ** flatten_mir(midgard_block *block, unsigned *len) { - *len = list_length(&block->base.instructions); + *len = list_length(&block->base.instructions); - if (!(*len)) - return NULL; + if (!(*len)) + return NULL; - midgard_instruction **instructions = - calloc(sizeof(midgard_instruction *), *len); + midgard_instruction **instructions = + calloc(sizeof(midgard_instruction *), *len); - unsigned i = 0; + unsigned i = 0; - mir_foreach_instr_in_block(block, ins) - instructions[i++] = ins; + mir_foreach_instr_in_block(block, ins) + instructions[i++] = ins; - return instructions; + return instructions; } /* The worklist is the set of instructions that can be scheduled now; that is, * the set of instructions with no remaining dependencies */ static void -mir_initialize_worklist(BITSET_WORD *worklist, midgard_instruction **instructions, unsigned count) +mir_initialize_worklist(BITSET_WORD *worklist, + midgard_instruction **instructions, unsigned count) { - for (unsigned i = 0; i < count; ++i) { - if (instructions[i]->nr_dependencies == 0) - BITSET_SET(worklist, i); - } + for (unsigned i = 0; i < count; ++i) { + if (instructions[i]->nr_dependencies == 0) + BITSET_SET(worklist, i); + } } /* Update the worklist after an instruction terminates. Remove its edges from @@ -306,37 +319,37 @@ mir_initialize_worklist(BITSET_WORD *worklist, midgard_instruction **instruction * worklist */ static void -mir_update_worklist( - BITSET_WORD *worklist, unsigned count, - midgard_instruction **instructions, midgard_instruction *done) +mir_update_worklist(BITSET_WORD *worklist, unsigned count, + midgard_instruction **instructions, + midgard_instruction *done) { - /* Sanity check: if no instruction terminated, there is nothing to do. - * If the instruction that terminated had dependencies, that makes no - * sense and means we messed up the worklist. Finally, as the purpose - * of this routine is to update dependents, we abort early if there are - * no dependents defined. */ + /* Sanity check: if no instruction terminated, there is nothing to do. + * If the instruction that terminated had dependencies, that makes no + * sense and means we messed up the worklist. Finally, as the purpose + * of this routine is to update dependents, we abort early if there are + * no dependents defined. */ - if (!done) - return; + if (!done) + return; - assert(done->nr_dependencies == 0); + assert(done->nr_dependencies == 0); - if (!done->dependents) - return; + if (!done->dependents) + return; - /* We have an instruction with dependents. 
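/* The loop that follows is the classic ready-set maintenance of list
 * scheduling: retiring an instruction decrements each dependent's remaining
 * dependency count, and whatever drops to zero becomes schedulable. A toy
 * version without the BITSET machinery (hypothetical types): */

#include <stdbool.h>

struct toy_node {
   unsigned nr_dependencies; /* unscheduled predecessors remaining */
   bool ready;               /* i.e. member of the worklist        */
};

static void
retire(struct toy_node *nodes, const unsigned *dependents, unsigned n_dependents)
{
   for (unsigned i = 0; i < n_dependents; ++i) {
      struct toy_node *d = &nodes[dependents[i]];

      if (--d->nr_dependencies == 0)
         d->ready = true; /* eligible for the next pick */
   }
}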
Iterate each dependent to - * remove one dependency (`done`), adding dependents to the worklist - * where possible. */ + /* We have an instruction with dependents. Iterate each dependent to + * remove one dependency (`done`), adding dependents to the worklist + * where possible. */ - unsigned i; - BITSET_FOREACH_SET(i, done->dependents, count) { - assert(instructions[i]->nr_dependencies); + unsigned i; + BITSET_FOREACH_SET(i, done->dependents, count) { + assert(instructions[i]->nr_dependencies); - if (!(--instructions[i]->nr_dependencies)) - BITSET_SET(worklist, i); - } + if (!(--instructions[i]->nr_dependencies)) + BITSET_SET(worklist, i); + } - free(done->dependents); + free(done->dependents); } /* While scheduling, we need to choose instructions satisfying certain @@ -345,184 +358,181 @@ mir_update_worklist( * given predicate. */ struct midgard_predicate { - /* TAG or ~0 for dont-care */ - unsigned tag; + /* TAG or ~0 for dont-care */ + unsigned tag; - /* True if we want to pop off the chosen instruction */ - bool destructive; + /* True if we want to pop off the chosen instruction */ + bool destructive; - /* For ALU, choose only this unit */ - unsigned unit; + /* For ALU, choose only this unit */ + unsigned unit; - /* State for bundle constants. constants is the actual constants - * for the bundle. constant_count is the number of bytes (up to - * 16) currently in use for constants. When picking in destructive - * mode, the constants array will be updated, and the instruction - * will be adjusted to index into the constants array */ + /* State for bundle constants. constants is the actual constants + * for the bundle. constant_count is the number of bytes (up to + * 16) currently in use for constants. When picking in destructive + * mode, the constants array will be updated, and the instruction + * will be adjusted to index into the constants array */ - midgard_constants *constants; - unsigned constant_mask; + midgard_constants *constants; + unsigned constant_mask; - /* Exclude this destination (if not ~0) */ - unsigned exclude; + /* Exclude this destination (if not ~0) */ + unsigned exclude; - /* Don't schedule instructions consuming conditionals (since we already - * scheduled one). Excludes conditional branches and csel */ - bool no_cond; + /* Don't schedule instructions consuming conditionals (since we already + * scheduled one). Excludes conditional branches and csel */ + bool no_cond; - /* Require (or reject) a minimal mask and (if nonzero) given - * destination. Used for writeout optimizations */ + /* Require (or reject) a minimal mask and (if nonzero) given + * destination. Used for writeout optimizations */ - unsigned mask; - unsigned no_mask; - unsigned dest; + unsigned mask; + unsigned no_mask; + unsigned dest; - /* Whether to not-care/only/never schedule imov/fmov instructions This - * allows non-move instructions to get priority on each unit */ - unsigned move_mode; + /* Whether to not-care/only/never schedule imov/fmov instructions This + * allows non-move instructions to get priority on each unit */ + unsigned move_mode; - /* For load/store: how many pipeline registers are in use? The two - * scheduled instructions cannot use more than the 256-bits of pipeline - * space available or RA will fail (as it would run out of pipeline - * registers and fail to spill without breaking the schedule) */ + /* For load/store: how many pipeline registers are in use? 
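/* In concrete terms the budget tracked by this field is two 128-bit pipeline
 * slots per load/store bundle; a candidate is rejected once the pair would
 * exceed that. A sketch of the accounting (hypothetical helpers, mirroring
 * mir_pipeline_count further down): */

#include <stdbool.h>

#define LDST_PIPELINE_SLOTS 2 /* 2 x 128 bits of pipeline space */

static unsigned
bytes_to_pipeline_slots(unsigned bytecount)
{
   return (bytecount + 15) / 16; /* round up to 128-bit slots */
}

static bool
fits_pipeline_budget(unsigned slots_in_use, unsigned candidate_bytes)
{
   return slots_in_use + bytes_to_pipeline_slots(candidate_bytes) <=
          LDST_PIPELINE_SLOTS;
}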
The two + * scheduled instructions cannot use more than the 256-bits of pipeline + * space available or RA will fail (as it would run out of pipeline + * registers and fail to spill without breaking the schedule) */ - unsigned pipeline_count; + unsigned pipeline_count; - /* For load/store: is a ST_VARY.a32 instruction scheduled into the - * bundle? is a non-ST_VARY.a32 instruction scheduled? Potential - * hardware issue, unknown cause. - */ - bool any_st_vary_a32, any_non_st_vary_a32; + /* For load/store: is a ST_VARY.a32 instruction scheduled into the + * bundle? is a non-ST_VARY.a32 instruction scheduled? Potential + * hardware issue, unknown cause. + */ + bool any_st_vary_a32, any_non_st_vary_a32; }; static bool mir_adjust_constant(midgard_instruction *ins, unsigned src, - unsigned *bundle_constant_mask, - unsigned *comp_mapping, - uint8_t *bundle_constants, - bool upper) + unsigned *bundle_constant_mask, unsigned *comp_mapping, + uint8_t *bundle_constants, bool upper) { - unsigned type_size = nir_alu_type_get_type_size(ins->src_types[src]) / 8; - unsigned type_shift = util_logbase2(type_size); - unsigned max_comp = mir_components_for_type(ins->src_types[src]); - unsigned comp_mask = mir_from_bytemask(mir_round_bytemask_up( - mir_bytemask_of_read_components_index(ins, src), - type_size * 8), - type_size * 8); - unsigned type_mask = (1 << type_size) - 1; + unsigned type_size = nir_alu_type_get_type_size(ins->src_types[src]) / 8; + unsigned type_shift = util_logbase2(type_size); + unsigned max_comp = mir_components_for_type(ins->src_types[src]); + unsigned comp_mask = mir_from_bytemask( + mir_round_bytemask_up(mir_bytemask_of_read_components_index(ins, src), + type_size * 8), + type_size * 8); + unsigned type_mask = (1 << type_size) - 1; - /* Upper only makes sense for 16-bit */ - if (type_size != 16 && upper) - return false; + /* Upper only makes sense for 16-bit */ + if (type_size != 16 && upper) + return false; - /* For 16-bit, we need to stay on either upper or lower halves to avoid - * disrupting the swizzle */ - unsigned start = upper ? 8 : 0; - unsigned length = (type_size == 2) ? 8 : 16; + /* For 16-bit, we need to stay on either upper or lower halves to avoid + * disrupting the swizzle */ + unsigned start = upper ? 8 : 0; + unsigned length = (type_size == 2) ? 
8 : 16; - for (unsigned comp = 0; comp < max_comp; comp++) { - if (!(comp_mask & (1 << comp))) - continue; + for (unsigned comp = 0; comp < max_comp; comp++) { + if (!(comp_mask & (1 << comp))) + continue; - uint8_t *constantp = ins->constants.u8 + (type_size * comp); - unsigned best_reuse_bytes = 0; - signed best_place = -1; - unsigned i, j; + uint8_t *constantp = ins->constants.u8 + (type_size * comp); + unsigned best_reuse_bytes = 0; + signed best_place = -1; + unsigned i, j; - for (i = start; i < (start + length); i += type_size) { - unsigned reuse_bytes = 0; + for (i = start; i < (start + length); i += type_size) { + unsigned reuse_bytes = 0; - for (j = 0; j < type_size; j++) { - if (!(*bundle_constant_mask & (1 << (i + j)))) - continue; - if (constantp[j] != bundle_constants[i + j]) - break; - if ((i + j) > (start + length)) - break; + for (j = 0; j < type_size; j++) { + if (!(*bundle_constant_mask & (1 << (i + j)))) + continue; + if (constantp[j] != bundle_constants[i + j]) + break; + if ((i + j) > (start + length)) + break; - reuse_bytes++; - } + reuse_bytes++; + } - /* Select the place where existing bytes can be - * reused so we leave empty slots to others - */ - if (j == type_size && - (reuse_bytes > best_reuse_bytes || best_place < 0)) { - best_reuse_bytes = reuse_bytes; - best_place = i; - break; - } - } + /* Select the place where existing bytes can be + * reused so we leave empty slots to others + */ + if (j == type_size && + (reuse_bytes > best_reuse_bytes || best_place < 0)) { + best_reuse_bytes = reuse_bytes; + best_place = i; + break; + } + } - /* This component couldn't fit in the remaining constant slot, - * no need check the remaining components, bail out now - */ - if (best_place < 0) - return false; + /* This component couldn't fit in the remaining constant slot, + * no need check the remaining components, bail out now + */ + if (best_place < 0) + return false; - memcpy(&bundle_constants[i], constantp, type_size); - *bundle_constant_mask |= type_mask << best_place; - comp_mapping[comp] = best_place >> type_shift; - } + memcpy(&bundle_constants[i], constantp, type_size); + *bundle_constant_mask |= type_mask << best_place; + comp_mapping[comp] = best_place >> type_shift; + } - return true; + return true; } /* For an instruction that can fit, adjust it to fit and update the constants * array, in destructive mode. Returns whether the fitting was successful. */ static bool -mir_adjust_constants(midgard_instruction *ins, - struct midgard_predicate *pred, - bool destructive) +mir_adjust_constants(midgard_instruction *ins, struct midgard_predicate *pred, + bool destructive) { - /* No constant, nothing to adjust */ - if (!ins->has_constants) - return true; + /* No constant, nothing to adjust */ + if (!ins->has_constants) + return true; - unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT); - unsigned bundle_constant_mask = pred->constant_mask; - unsigned comp_mapping[2][16] = { }; - uint8_t bundle_constants[16]; + unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + unsigned bundle_constant_mask = pred->constant_mask; + unsigned comp_mapping[2][16] = {}; + uint8_t bundle_constants[16]; - memcpy(bundle_constants, pred->constants, 16); + memcpy(bundle_constants, pred->constants, 16); - /* Let's try to find a place for each active component of the constant - * register. 
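/* A stripped-down model of what mir_adjust_constant() above is doing: the
 * bundle owns a single 16-byte embedded-constant slot, and each value is
 * placed at a position whose already-occupied bytes are either free or
 * happen to match, so instructions can share constants. The sketch below
 * handles one 32-bit value, takes the first fit rather than maximising byte
 * reuse, and ignores the 16-bit halves and swizzle rewrites (hypothetical
 * helper, not the compiler's API): */

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* Returns the 32-bit component (0..3) the value landed in, or -1 if the
 * slot is too full and the instruction must wait for another bundle. */
static int
place_constant_u32(uint8_t slot[16], uint16_t *byte_mask, uint32_t value)
{
   uint8_t bytes[4];
   memcpy(bytes, &value, 4);

   for (unsigned pos = 0; pos < 16; pos += 4) {
      bool ok = true;

      for (unsigned j = 0; j < 4; ++j) {
         bool occupied = *byte_mask & (1u << (pos + j));

         if (occupied && slot[pos + j] != bytes[j]) {
            ok = false; /* clashes with a constant already packed here */
            break;
         }
      }

      if (!ok)
         continue;

      memcpy(&slot[pos], bytes, 4);
      *byte_mask |= (uint16_t)(0xFu << pos);
      return (int)(pos / 4);
   }

   return -1;
}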
- */ - for (unsigned src = 0; src < 2; ++src) { - if (ins->src[src] != SSA_FIXED_REGISTER(REGISTER_CONSTANT)) - continue; + /* Let's try to find a place for each active component of the constant + * register. + */ + for (unsigned src = 0; src < 2; ++src) { + if (ins->src[src] != SSA_FIXED_REGISTER(REGISTER_CONSTANT)) + continue; - /* First, try lower half (or whole for !16) */ - if (mir_adjust_constant(ins, src, &bundle_constant_mask, - comp_mapping[src], bundle_constants, false)) - continue; + /* First, try lower half (or whole for !16) */ + if (mir_adjust_constant(ins, src, &bundle_constant_mask, + comp_mapping[src], bundle_constants, false)) + continue; - /* Next, try upper half */ - if (mir_adjust_constant(ins, src, &bundle_constant_mask, - comp_mapping[src], bundle_constants, true)) - continue; + /* Next, try upper half */ + if (mir_adjust_constant(ins, src, &bundle_constant_mask, + comp_mapping[src], bundle_constants, true)) + continue; - /* Otherwise bail */ - return false; - } + /* Otherwise bail */ + return false; + } - /* If non-destructive, we're done */ - if (!destructive) - return true; + /* If non-destructive, we're done */ + if (!destructive) + return true; - /* Otherwise update the constant_mask and constant values */ - pred->constant_mask = bundle_constant_mask; - memcpy(pred->constants, bundle_constants, 16); + /* Otherwise update the constant_mask and constant values */ + pred->constant_mask = bundle_constant_mask; + memcpy(pred->constants, bundle_constants, 16); - /* Use comp_mapping as a swizzle */ - mir_foreach_src(ins, s) { - if (ins->src[s] == r_constant) - mir_compose_swizzle(ins->swizzle[s], comp_mapping[s], ins->swizzle[s]); - } + /* Use comp_mapping as a swizzle */ + mir_foreach_src(ins, s) { + if (ins->src[s] == r_constant) + mir_compose_swizzle(ins->swizzle[s], comp_mapping[s], ins->swizzle[s]); + } - return true; + return true; } /* Conservative estimate of the pipeline registers required for load/store */ @@ -530,27 +540,28 @@ mir_adjust_constants(midgard_instruction *ins, static unsigned mir_pipeline_count(midgard_instruction *ins) { - unsigned bytecount = 0; + unsigned bytecount = 0; - mir_foreach_src(ins, i) { - /* Skip empty source */ - if (ins->src[i] == ~0) continue; + mir_foreach_src(ins, i) { + /* Skip empty source */ + if (ins->src[i] == ~0) + continue; - if (i == 0) { - /* First source is a vector, worst-case the mask */ - unsigned bytemask = mir_bytemask_of_read_components_index(ins, i); - unsigned max = util_logbase2(bytemask) + 1; - bytecount += max; - } else { - /* Sources 1 on are scalars */ - bytecount += 4; - } - } + if (i == 0) { + /* First source is a vector, worst-case the mask */ + unsigned bytemask = mir_bytemask_of_read_components_index(ins, i); + unsigned max = util_logbase2(bytemask) + 1; + bytecount += max; + } else { + /* Sources 1 on are scalars */ + bytecount += 4; + } + } - unsigned dwords = DIV_ROUND_UP(bytecount, 16); - assert(dwords <= 2); + unsigned dwords = DIV_ROUND_UP(bytecount, 16); + assert(dwords <= 2); - return dwords; + return dwords; } /* Matches FADD x, x with modifiers compatible. 
Since x + x = x * 2, for @@ -559,56 +570,56 @@ mir_pipeline_count(midgard_instruction *ins) static bool mir_is_add_2(midgard_instruction *ins) { - if (ins->op != midgard_alu_op_fadd) - return false; + if (ins->op != midgard_alu_op_fadd) + return false; - if (ins->src[0] != ins->src[1]) - return false; + if (ins->src[0] != ins->src[1]) + return false; - if (ins->src_types[0] != ins->src_types[1]) - return false; + if (ins->src_types[0] != ins->src_types[1]) + return false; - for (unsigned i = 0; i < MIR_VEC_COMPONENTS; ++i) { - if (ins->swizzle[0][i] != ins->swizzle[1][i]) - return false; - } + for (unsigned i = 0; i < MIR_VEC_COMPONENTS; ++i) { + if (ins->swizzle[0][i] != ins->swizzle[1][i]) + return false; + } - if (ins->src_abs[0] != ins->src_abs[1]) - return false; + if (ins->src_abs[0] != ins->src_abs[1]) + return false; - if (ins->src_neg[0] != ins->src_neg[1]) - return false; + if (ins->src_neg[0] != ins->src_neg[1]) + return false; - return true; + return true; } static void mir_adjust_unit(midgard_instruction *ins, unsigned unit) { - /* FADD x, x = FMUL x, #2 */ - if (mir_is_add_2(ins) && (unit & (UNITS_MUL | UNIT_VLUT))) { - ins->op = midgard_alu_op_fmul; + /* FADD x, x = FMUL x, #2 */ + if (mir_is_add_2(ins) && (unit & (UNITS_MUL | UNIT_VLUT))) { + ins->op = midgard_alu_op_fmul; - ins->src[1] = ~0; - ins->src_abs[1] = false; - ins->src_neg[1] = false; + ins->src[1] = ~0; + ins->src_abs[1] = false; + ins->src_neg[1] = false; - ins->has_inline_constant = true; - ins->inline_constant = _mesa_float_to_half(2.0); - } + ins->has_inline_constant = true; + ins->inline_constant = _mesa_float_to_half(2.0); + } } static unsigned mir_has_unit(midgard_instruction *ins, unsigned unit) { - if (alu_opcode_props[ins->op].props & unit) - return true; + if (alu_opcode_props[ins->op].props & unit) + return true; - /* FADD x, x can run on any adder or any multiplier */ - if (mir_is_add_2(ins)) - return true; + /* FADD x, x can run on any adder or any multiplier */ + if (mir_is_add_2(ins)) + return true; - return false; + return false; } /* Net change in liveness if an instruction were scheduled. Loosely based on @@ -617,265 +628,265 @@ mir_has_unit(midgard_instruction *ins, unsigned unit) static int mir_live_effect(uint16_t *liveness, midgard_instruction *ins, bool destructive) { - /* TODO: what if dest is used multiple times? */ - int free_live = 0; + /* TODO: what if dest is used multiple times? 
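/* The computation that follows scores a candidate by its net effect on
 * register pressure, with liveness tracked per byte as the scheduler works
 * upward through the block: bytes of the destination that the write retires
 * count against source bytes that newly become live. A toy single-source
 * version (assumes the GCC/Clang popcount builtin; names are hypothetical): */

static int
live_effect(const unsigned *liveness, unsigned dest, unsigned dest_bytes,
            unsigned src, unsigned src_bytes)
{
   int freed = __builtin_popcount(liveness[dest] & dest_bytes);
   int added = __builtin_popcount(src_bytes & ~liveness[src]);

   return added - freed; /* lower is better for pressure */
}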
*/ + int free_live = 0; - if (ins->dest < SSA_FIXED_MINIMUM) { - unsigned bytemask = mir_bytemask(ins); - bytemask = util_next_power_of_two(bytemask + 1) - 1; - free_live += util_bitcount(liveness[ins->dest] & bytemask); + if (ins->dest < SSA_FIXED_MINIMUM) { + unsigned bytemask = mir_bytemask(ins); + bytemask = util_next_power_of_two(bytemask + 1) - 1; + free_live += util_bitcount(liveness[ins->dest] & bytemask); - if (destructive) - liveness[ins->dest] &= ~bytemask; - } + if (destructive) + liveness[ins->dest] &= ~bytemask; + } - int new_live = 0; + int new_live = 0; - mir_foreach_src(ins, s) { - unsigned S = ins->src[s]; + mir_foreach_src(ins, s) { + unsigned S = ins->src[s]; - bool dupe = false; + bool dupe = false; - for (unsigned q = 0; q < s; ++q) - dupe |= (ins->src[q] == S); + for (unsigned q = 0; q < s; ++q) + dupe |= (ins->src[q] == S); - if (dupe) - continue; + if (dupe) + continue; - if (S < SSA_FIXED_MINIMUM) { - unsigned bytemask = mir_bytemask_of_read_components(ins, S); - bytemask = util_next_power_of_two(bytemask + 1) - 1; + if (S < SSA_FIXED_MINIMUM) { + unsigned bytemask = mir_bytemask_of_read_components(ins, S); + bytemask = util_next_power_of_two(bytemask + 1) - 1; - /* Count only the new components */ - new_live += util_bitcount(bytemask & ~(liveness[S])); + /* Count only the new components */ + new_live += util_bitcount(bytemask & ~(liveness[S])); - if (destructive) - liveness[S] |= bytemask; - } - } + if (destructive) + liveness[S] |= bytemask; + } + } - return new_live - free_live; + return new_live - free_live; } static midgard_instruction * -mir_choose_instruction( - midgard_instruction **instructions, - uint16_t *liveness, - BITSET_WORD *worklist, unsigned count, - struct midgard_predicate *predicate) +mir_choose_instruction(midgard_instruction **instructions, uint16_t *liveness, + BITSET_WORD *worklist, unsigned count, + struct midgard_predicate *predicate) { - /* Parse the predicate */ - unsigned tag = predicate->tag; - unsigned unit = predicate->unit; - bool scalar = (unit != ~0) && (unit & UNITS_SCALAR); - bool no_cond = predicate->no_cond; + /* Parse the predicate */ + unsigned tag = predicate->tag; + unsigned unit = predicate->unit; + bool scalar = (unit != ~0) && (unit & UNITS_SCALAR); + bool no_cond = predicate->no_cond; - unsigned mask = predicate->mask; - unsigned dest = predicate->dest; - bool needs_dest = mask & 0xF; + unsigned mask = predicate->mask; + unsigned dest = predicate->dest; + bool needs_dest = mask & 0xF; - /* Iterate to find the best instruction satisfying the predicate */ - unsigned i; + /* Iterate to find the best instruction satisfying the predicate */ + unsigned i; - signed best_index = -1; - signed best_effect = INT_MAX; - bool best_conditional = false; + signed best_index = -1; + signed best_effect = INT_MAX; + bool best_conditional = false; - /* Enforce a simple metric limiting distance to keep down register - * pressure. TOOD: replace with liveness tracking for much better - * results */ + /* Enforce a simple metric limiting distance to keep down register + * pressure. 
TOOD: replace with liveness tracking for much better + * results */ - unsigned max_active = 0; - unsigned max_distance = 36; + unsigned max_active = 0; + unsigned max_distance = 36; #ifndef NDEBUG - /* Force in-order scheduling */ - if (midgard_debug & MIDGARD_DBG_INORDER) - max_distance = 1; + /* Force in-order scheduling */ + if (midgard_debug & MIDGARD_DBG_INORDER) + max_distance = 1; #endif - BITSET_FOREACH_SET(i, worklist, count) { - max_active = MAX2(max_active, i); - } + BITSET_FOREACH_SET(i, worklist, count) { + max_active = MAX2(max_active, i); + } - BITSET_FOREACH_SET(i, worklist, count) { - if ((max_active - i) >= max_distance) - continue; + BITSET_FOREACH_SET(i, worklist, count) { + if ((max_active - i) >= max_distance) + continue; - if (tag != ~0 && instructions[i]->type != tag) - continue; + if (tag != ~0 && instructions[i]->type != tag) + continue; - bool alu = (instructions[i]->type == TAG_ALU_4); - bool ldst = (instructions[i]->type == TAG_LOAD_STORE_4); + bool alu = (instructions[i]->type == TAG_ALU_4); + bool ldst = (instructions[i]->type == TAG_LOAD_STORE_4); - bool branch = alu && (unit == ALU_ENAB_BR_COMPACT); - bool is_move = alu && - (instructions[i]->op == midgard_alu_op_imov || - instructions[i]->op == midgard_alu_op_fmov); + bool branch = alu && (unit == ALU_ENAB_BR_COMPACT); + bool is_move = alu && (instructions[i]->op == midgard_alu_op_imov || + instructions[i]->op == midgard_alu_op_fmov); - if (predicate->exclude != ~0 && instructions[i]->dest == predicate->exclude) - continue; + if (predicate->exclude != ~0 && + instructions[i]->dest == predicate->exclude) + continue; - if (alu && !branch && unit != ~0 && !(mir_has_unit(instructions[i], unit))) - continue; + if (alu && !branch && unit != ~0 && + !(mir_has_unit(instructions[i], unit))) + continue; - /* 0: don't care, 1: no moves, 2: only moves */ - if (predicate->move_mode && ((predicate->move_mode - 1) != is_move)) - continue; + /* 0: don't care, 1: no moves, 2: only moves */ + if (predicate->move_mode && ((predicate->move_mode - 1) != is_move)) + continue; - if (branch && !instructions[i]->compact_branch) - continue; + if (branch && !instructions[i]->compact_branch) + continue; - if (alu && scalar && !mir_is_scalar(instructions[i])) - continue; + if (alu && scalar && !mir_is_scalar(instructions[i])) + continue; - if (alu && predicate->constants && !mir_adjust_constants(instructions[i], predicate, false)) - continue; + if (alu && predicate->constants && + !mir_adjust_constants(instructions[i], predicate, false)) + continue; - if (needs_dest && instructions[i]->dest != dest) - continue; + if (needs_dest && instructions[i]->dest != dest) + continue; - if (mask && ((~instructions[i]->mask) & mask)) - continue; + if (mask && ((~instructions[i]->mask) & mask)) + continue; - if (instructions[i]->mask & predicate->no_mask) - continue; + if (instructions[i]->mask & predicate->no_mask) + continue; - if (ldst && mir_pipeline_count(instructions[i]) + predicate->pipeline_count > 2) - continue; + if (ldst && + mir_pipeline_count(instructions[i]) + predicate->pipeline_count > 2) + continue; - bool st_vary_a32 = (instructions[i]->op == midgard_op_st_vary_32); + bool st_vary_a32 = (instructions[i]->op == midgard_op_st_vary_32); - if (ldst && predicate->any_non_st_vary_a32 && st_vary_a32) - continue; + if (ldst && predicate->any_non_st_vary_a32 && st_vary_a32) + continue; - if (ldst && predicate->any_st_vary_a32 && !st_vary_a32) - continue; + if (ldst && predicate->any_st_vary_a32 && !st_vary_a32) + continue; - bool 
conditional = alu && !branch && OP_IS_CSEL(instructions[i]->op); - conditional |= (branch && instructions[i]->branch.conditional); + bool conditional = alu && !branch && OP_IS_CSEL(instructions[i]->op); + conditional |= (branch && instructions[i]->branch.conditional); - if (conditional && no_cond) - continue; + if (conditional && no_cond) + continue; - int effect = mir_live_effect(liveness, instructions[i], false); + int effect = mir_live_effect(liveness, instructions[i], false); - if (effect > best_effect) - continue; + if (effect > best_effect) + continue; - if (effect == best_effect && (signed) i < best_index) - continue; + if (effect == best_effect && (signed)i < best_index) + continue; - best_effect = effect; - best_index = i; - best_conditional = conditional; - } + best_effect = effect; + best_index = i; + best_conditional = conditional; + } - /* Did we find anything? */ + /* Did we find anything? */ - if (best_index < 0) - return NULL; + if (best_index < 0) + return NULL; - /* If we found something, remove it from the worklist */ - assert(best_index < count); - midgard_instruction *I = instructions[best_index]; + /* If we found something, remove it from the worklist */ + assert(best_index < count); + midgard_instruction *I = instructions[best_index]; - if (predicate->destructive) { - BITSET_CLEAR(worklist, best_index); + if (predicate->destructive) { + BITSET_CLEAR(worklist, best_index); - if (I->type == TAG_ALU_4) - mir_adjust_constants(instructions[best_index], predicate, true); + if (I->type == TAG_ALU_4) + mir_adjust_constants(instructions[best_index], predicate, true); - if (I->type == TAG_LOAD_STORE_4) { - predicate->pipeline_count += mir_pipeline_count(instructions[best_index]); + if (I->type == TAG_LOAD_STORE_4) { + predicate->pipeline_count += + mir_pipeline_count(instructions[best_index]); - if (instructions[best_index]->op == midgard_op_st_vary_32) - predicate->any_st_vary_a32 = true; - else - predicate->any_non_st_vary_a32 = true; - } + if (instructions[best_index]->op == midgard_op_st_vary_32) + predicate->any_st_vary_a32 = true; + else + predicate->any_non_st_vary_a32 = true; + } - if (I->type == TAG_ALU_4) - mir_adjust_unit(instructions[best_index], unit); + if (I->type == TAG_ALU_4) + mir_adjust_unit(instructions[best_index], unit); - /* Once we schedule a conditional, we can't again */ - predicate->no_cond |= best_conditional; - mir_live_effect(liveness, instructions[best_index], true); - } + /* Once we schedule a conditional, we can't again */ + predicate->no_cond |= best_conditional; + mir_live_effect(liveness, instructions[best_index], true); + } - return I; + return I; } /* Still, we don't choose instructions in a vacuum. We need a way to choose the * best bundle type (ALU, load/store, texture). Nondestructive. */ static unsigned -mir_choose_bundle( - midgard_instruction **instructions, - uint16_t *liveness, - BITSET_WORD *worklist, unsigned count, - unsigned num_ldst) +mir_choose_bundle(midgard_instruction **instructions, uint16_t *liveness, + BITSET_WORD *worklist, unsigned count, unsigned num_ldst) { - /* At the moment, our algorithm is very simple - use the bundle of the - * best instruction, regardless of what else could be scheduled - * alongside it. This is not optimal but it works okay for in-order */ + /* At the moment, our algorithm is very simple - use the bundle of the + * best instruction, regardless of what else could be scheduled + * alongside it. 
This is not optimal but it works okay for in-order */ - struct midgard_predicate predicate = { - .tag = ~0, - .unit = ~0, - .destructive = false, - .exclude = ~0, - }; + struct midgard_predicate predicate = { + .tag = ~0, + .unit = ~0, + .destructive = false, + .exclude = ~0, + }; - midgard_instruction *chosen = mir_choose_instruction(instructions, liveness, worklist, count, &predicate); + midgard_instruction *chosen = mir_choose_instruction( + instructions, liveness, worklist, count, &predicate); - if (chosen && chosen->type == TAG_LOAD_STORE_4 && !(num_ldst % 2)) { - /* Try to schedule load/store ops in pairs */ + if (chosen && chosen->type == TAG_LOAD_STORE_4 && !(num_ldst % 2)) { + /* Try to schedule load/store ops in pairs */ - predicate.exclude = chosen->dest; - predicate.tag = TAG_LOAD_STORE_4; + predicate.exclude = chosen->dest; + predicate.tag = TAG_LOAD_STORE_4; - chosen = mir_choose_instruction(instructions, liveness, worklist, count, &predicate); - if (chosen) - return TAG_LOAD_STORE_4; + chosen = mir_choose_instruction(instructions, liveness, worklist, count, + &predicate); + if (chosen) + return TAG_LOAD_STORE_4; - predicate.tag = ~0; + predicate.tag = ~0; - chosen = mir_choose_instruction(instructions, liveness, worklist, count, &predicate); - assert(chosen == NULL || chosen->type != TAG_LOAD_STORE_4); + chosen = mir_choose_instruction(instructions, liveness, worklist, count, + &predicate); + assert(chosen == NULL || chosen->type != TAG_LOAD_STORE_4); - if (chosen) - return chosen->type; - else - return TAG_LOAD_STORE_4; - } + if (chosen) + return chosen->type; + else + return TAG_LOAD_STORE_4; + } - if (chosen) - return chosen->type; - else - return ~0; + if (chosen) + return chosen->type; + else + return ~0; } /* We want to choose an ALU instruction filling a given unit */ static void -mir_choose_alu(midgard_instruction **slot, - midgard_instruction **instructions, - uint16_t *liveness, - BITSET_WORD *worklist, unsigned len, - struct midgard_predicate *predicate, - unsigned unit) +mir_choose_alu(midgard_instruction **slot, midgard_instruction **instructions, + uint16_t *liveness, BITSET_WORD *worklist, unsigned len, + struct midgard_predicate *predicate, unsigned unit) { - /* Did we already schedule to this slot? */ - if ((*slot) != NULL) - return; + /* Did we already schedule to this slot? 
*/ + if ((*slot) != NULL) + return; - /* Try to schedule something, if not */ - predicate->unit = unit; - *slot = mir_choose_instruction(instructions, liveness, worklist, len, predicate); + /* Try to schedule something, if not */ + predicate->unit = unit; + *slot = + mir_choose_instruction(instructions, liveness, worklist, len, predicate); - /* Store unit upon scheduling */ - if (*slot && !((*slot)->compact_branch)) - (*slot)->unit = unit; + /* Store unit upon scheduling */ + if (*slot && !((*slot)->compact_branch)) + (*slot)->unit = unit; } /* When we are scheduling a branch/csel, we need the consumed condition in the @@ -893,54 +904,51 @@ mir_choose_alu(midgard_instruction **slot, */ static unsigned -mir_comparison_mobile( - compiler_context *ctx, - midgard_instruction **instructions, - struct midgard_predicate *predicate, - unsigned count, - unsigned cond) +mir_comparison_mobile(compiler_context *ctx, midgard_instruction **instructions, + struct midgard_predicate *predicate, unsigned count, + unsigned cond) { - if (!mir_single_use(ctx, cond)) - return ~0; + if (!mir_single_use(ctx, cond)) + return ~0; - unsigned ret = ~0; + unsigned ret = ~0; - for (unsigned i = 0; i < count; ++i) { - if (instructions[i]->dest != cond) - continue; + for (unsigned i = 0; i < count; ++i) { + if (instructions[i]->dest != cond) + continue; - /* Must fit in an ALU bundle */ - if (instructions[i]->type != TAG_ALU_4) - return ~0; + /* Must fit in an ALU bundle */ + if (instructions[i]->type != TAG_ALU_4) + return ~0; - /* If it would itself require a condition, that's recursive */ - if (OP_IS_CSEL(instructions[i]->op)) - return ~0; + /* If it would itself require a condition, that's recursive */ + if (OP_IS_CSEL(instructions[i]->op)) + return ~0; - /* We'll need to rewrite to .w but that doesn't work for vector - * ops that don't replicate (ball/bany), so bail there */ + /* We'll need to rewrite to .w but that doesn't work for vector + * ops that don't replicate (ball/bany), so bail there */ - if (GET_CHANNEL_COUNT(alu_opcode_props[instructions[i]->op].props)) - return ~0; + if (GET_CHANNEL_COUNT(alu_opcode_props[instructions[i]->op].props)) + return ~0; - /* Ensure it will fit with constants */ + /* Ensure it will fit with constants */ - if (!mir_adjust_constants(instructions[i], predicate, false)) - return ~0; + if (!mir_adjust_constants(instructions[i], predicate, false)) + return ~0; - /* Ensure it is written only once */ + /* Ensure it is written only once */ - if (ret != ~0) - return ~0; - else - ret = i; - } + if (ret != ~0) + return ~0; + else + ret = i; + } - /* Inject constants now that we are sure we want to */ - if (ret != ~0) - mir_adjust_constants(instructions[ret], predicate, true); + /* Inject constants now that we are sure we want to */ + if (ret != ~0) + mir_adjust_constants(instructions[ret], predicate, true); - return ret; + return ret; } /* Using the information about the moveable conditional itself, we either pop @@ -948,33 +956,33 @@ mir_comparison_mobile( * artificially schedule instead as a fallback */ static midgard_instruction * -mir_schedule_comparison( - compiler_context *ctx, - midgard_instruction **instructions, - struct midgard_predicate *predicate, - BITSET_WORD *worklist, unsigned count, - unsigned cond, bool vector, unsigned *swizzle, - midgard_instruction *user) +mir_schedule_comparison(compiler_context *ctx, + midgard_instruction **instructions, + struct midgard_predicate *predicate, + BITSET_WORD *worklist, unsigned count, unsigned cond, + bool vector, unsigned *swizzle, + 
midgard_instruction *user) { - /* TODO: swizzle when scheduling */ - unsigned comp_i = - (!vector && (swizzle[0] == 0)) ? - mir_comparison_mobile(ctx, instructions, predicate, count, cond) : ~0; + /* TODO: swizzle when scheduling */ + unsigned comp_i = + (!vector && (swizzle[0] == 0)) + ? mir_comparison_mobile(ctx, instructions, predicate, count, cond) + : ~0; - /* If we can, schedule the condition immediately */ - if ((comp_i != ~0) && BITSET_TEST(worklist, comp_i)) { - assert(comp_i < count); - BITSET_CLEAR(worklist, comp_i); - return instructions[comp_i]; - } + /* If we can, schedule the condition immediately */ + if ((comp_i != ~0) && BITSET_TEST(worklist, comp_i)) { + assert(comp_i < count); + BITSET_CLEAR(worklist, comp_i); + return instructions[comp_i]; + } - /* Otherwise, we insert a move */ + /* Otherwise, we insert a move */ - midgard_instruction mov = v_mov(cond, cond); - mov.mask = vector ? 0xF : 0x1; - memcpy(mov.swizzle[1], swizzle, sizeof(mov.swizzle[1])); + midgard_instruction mov = v_mov(cond, cond); + mov.mask = vector ? 0xF : 0x1; + memcpy(mov.swizzle[1], swizzle, sizeof(mov.swizzle[1])); - return mir_insert_instruction_before(ctx, user, mov); + return mir_insert_instruction_before(ctx, user, mov); } /* Most generally, we need instructions writing to r31 in the appropriate @@ -982,625 +990,632 @@ mir_schedule_comparison( static midgard_instruction * mir_schedule_condition(compiler_context *ctx, - struct midgard_predicate *predicate, - BITSET_WORD *worklist, unsigned count, - midgard_instruction **instructions, - midgard_instruction *last) + struct midgard_predicate *predicate, + BITSET_WORD *worklist, unsigned count, + midgard_instruction **instructions, + midgard_instruction *last) { - /* For a branch, the condition is the only argument; for csel, third */ - bool branch = last->compact_branch; - unsigned condition_index = branch ? 0 : 2; + /* For a branch, the condition is the only argument; for csel, third */ + bool branch = last->compact_branch; + unsigned condition_index = branch ? 0 : 2; - /* csel_v is vector; otherwise, conditions are scalar */ - bool vector = !branch && OP_IS_CSEL_V(last->op); + /* csel_v is vector; otherwise, conditions are scalar */ + bool vector = !branch && OP_IS_CSEL_V(last->op); - /* Grab the conditional instruction */ + /* Grab the conditional instruction */ - midgard_instruction *cond = mir_schedule_comparison( - ctx, instructions, predicate, worklist, count, last->src[condition_index], - vector, last->swizzle[condition_index], last); + midgard_instruction *cond = mir_schedule_comparison( + ctx, instructions, predicate, worklist, count, last->src[condition_index], + vector, last->swizzle[condition_index], last); - /* We have exclusive reign over this (possibly move) conditional - * instruction. We can rewrite into a pipeline conditional register */ + /* We have exclusive reign over this (possibly move) conditional + * instruction. 
We can rewrite into a pipeline conditional register */ - predicate->exclude = cond->dest; - cond->dest = SSA_FIXED_REGISTER(31); + predicate->exclude = cond->dest; + cond->dest = SSA_FIXED_REGISTER(31); - if (!vector) { - cond->mask = (1 << COMPONENT_W); + if (!vector) { + cond->mask = (1 << COMPONENT_W); - mir_foreach_src(cond, s) { - if (cond->src[s] == ~0) - continue; + mir_foreach_src(cond, s) { + if (cond->src[s] == ~0) + continue; - for (unsigned q = 0; q < 4; ++q) - cond->swizzle[s][q + COMPONENT_W] = cond->swizzle[s][q]; - } - } + for (unsigned q = 0; q < 4; ++q) + cond->swizzle[s][q + COMPONENT_W] = cond->swizzle[s][q]; + } + } - /* Schedule the unit: csel is always in the latter pipeline, so a csel - * condition must be in the former pipeline stage (vmul/sadd), - * depending on scalar/vector of the instruction itself. A branch must - * be written from the latter pipeline stage and a branch condition is - * always scalar, so it is always in smul (exception: ball/bany, which - * will be vadd) */ + /* Schedule the unit: csel is always in the latter pipeline, so a csel + * condition must be in the former pipeline stage (vmul/sadd), + * depending on scalar/vector of the instruction itself. A branch must + * be written from the latter pipeline stage and a branch condition is + * always scalar, so it is always in smul (exception: ball/bany, which + * will be vadd) */ - if (branch) - cond->unit = UNIT_SMUL; - else - cond->unit = vector ? UNIT_VMUL : UNIT_SADD; + if (branch) + cond->unit = UNIT_SMUL; + else + cond->unit = vector ? UNIT_VMUL : UNIT_SADD; - return cond; + return cond; } /* Schedules a single bundle of the given type */ static midgard_bundle -mir_schedule_texture( - midgard_instruction **instructions, - uint16_t *liveness, - BITSET_WORD *worklist, unsigned len, - bool is_vertex) +mir_schedule_texture(midgard_instruction **instructions, uint16_t *liveness, + BITSET_WORD *worklist, unsigned len, bool is_vertex) { - struct midgard_predicate predicate = { - .tag = TAG_TEXTURE_4, - .destructive = true, - .exclude = ~0, - }; + struct midgard_predicate predicate = { + .tag = TAG_TEXTURE_4, + .destructive = true, + .exclude = ~0, + }; - midgard_instruction *ins = - mir_choose_instruction(instructions, liveness, worklist, len, &predicate); + midgard_instruction *ins = + mir_choose_instruction(instructions, liveness, worklist, len, &predicate); - mir_update_worklist(worklist, len, instructions, ins); + mir_update_worklist(worklist, len, instructions, ins); - struct midgard_bundle out = { - .tag = ins->op == midgard_tex_op_barrier ? - TAG_TEXTURE_4_BARRIER : - (ins->op == midgard_tex_op_fetch) || is_vertex ? - TAG_TEXTURE_4_VTX : TAG_TEXTURE_4, - .instruction_count = 1, - .instructions = { ins }, - }; + struct midgard_bundle out = { + .tag = ins->op == midgard_tex_op_barrier ? TAG_TEXTURE_4_BARRIER + : (ins->op == midgard_tex_op_fetch) || is_vertex + ? 
TAG_TEXTURE_4_VTX + : TAG_TEXTURE_4, + .instruction_count = 1, + .instructions = {ins}, + }; - return out; + return out; } static midgard_bundle -mir_schedule_ldst( - midgard_instruction **instructions, - uint16_t *liveness, - BITSET_WORD *worklist, unsigned len, - unsigned *num_ldst) +mir_schedule_ldst(midgard_instruction **instructions, uint16_t *liveness, + BITSET_WORD *worklist, unsigned len, unsigned *num_ldst) { - struct midgard_predicate predicate = { - .tag = TAG_LOAD_STORE_4, - .destructive = true, - .exclude = ~0, - }; + struct midgard_predicate predicate = { + .tag = TAG_LOAD_STORE_4, + .destructive = true, + .exclude = ~0, + }; - /* Try to pick two load/store ops. Second not gauranteed to exist */ + /* Try to pick two load/store ops. Second not gauranteed to exist */ - midgard_instruction *ins = - mir_choose_instruction(instructions, liveness, worklist, len, &predicate); + midgard_instruction *ins = + mir_choose_instruction(instructions, liveness, worklist, len, &predicate); - midgard_instruction *pair = - mir_choose_instruction(instructions, liveness, worklist, len, &predicate); + midgard_instruction *pair = + mir_choose_instruction(instructions, liveness, worklist, len, &predicate); - assert(ins != NULL); + assert(ins != NULL); - struct midgard_bundle out = { - .tag = TAG_LOAD_STORE_4, - .instruction_count = pair ? 2 : 1, - .instructions = { ins, pair }, - }; + struct midgard_bundle out = { + .tag = TAG_LOAD_STORE_4, + .instruction_count = pair ? 2 : 1, + .instructions = {ins, pair}, + }; - *num_ldst -= out.instruction_count; + *num_ldst -= out.instruction_count; - /* We have to update the worklist atomically, since the two - * instructions run concurrently (TODO: verify it's not pipelined) */ + /* We have to update the worklist atomically, since the two + * instructions run concurrently (TODO: verify it's not pipelined) */ - mir_update_worklist(worklist, len, instructions, ins); - mir_update_worklist(worklist, len, instructions, pair); + mir_update_worklist(worklist, len, instructions, ins); + mir_update_worklist(worklist, len, instructions, pair); - return out; + return out; } static void -mir_schedule_zs_write( - compiler_context *ctx, - struct midgard_predicate *predicate, - midgard_instruction **instructions, - uint16_t *liveness, - BITSET_WORD *worklist, unsigned len, - midgard_instruction *branch, - midgard_instruction **smul, - midgard_instruction **vadd, - midgard_instruction **vlut, - bool stencil) +mir_schedule_zs_write(compiler_context *ctx, + struct midgard_predicate *predicate, + midgard_instruction **instructions, uint16_t *liveness, + BITSET_WORD *worklist, unsigned len, + midgard_instruction *branch, midgard_instruction **smul, + midgard_instruction **vadd, midgard_instruction **vlut, + bool stencil) { - bool success = false; - unsigned idx = stencil ? 3 : 2; - unsigned src = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(1) : branch->src[idx]; + bool success = false; + unsigned idx = stencil ? 3 : 2; + unsigned src = + (branch->src[0] == ~0) ? 
SSA_FIXED_REGISTER(1) : branch->src[idx]; - predicate->dest = src; - predicate->mask = 0x1; + predicate->dest = src; + predicate->mask = 0x1; - midgard_instruction **units[] = { smul, vadd, vlut }; - unsigned unit_names[] = { UNIT_SMUL, UNIT_VADD, UNIT_VLUT }; + midgard_instruction **units[] = {smul, vadd, vlut}; + unsigned unit_names[] = {UNIT_SMUL, UNIT_VADD, UNIT_VLUT}; - for (unsigned i = 0; i < 3; ++i) { - if (*(units[i])) - continue; + for (unsigned i = 0; i < 3; ++i) { + if (*(units[i])) + continue; - predicate->unit = unit_names[i]; - midgard_instruction *ins = - mir_choose_instruction(instructions, liveness, worklist, len, predicate); + predicate->unit = unit_names[i]; + midgard_instruction *ins = mir_choose_instruction( + instructions, liveness, worklist, len, predicate); - if (ins) { - ins->unit = unit_names[i]; - *(units[i]) = ins; - success |= true; - break; - } - } + if (ins) { + ins->unit = unit_names[i]; + *(units[i]) = ins; + success |= true; + break; + } + } - predicate->dest = predicate->mask = 0; + predicate->dest = predicate->mask = 0; - if (success) - return; + if (success) + return; - midgard_instruction *mov = ralloc(ctx, midgard_instruction); - *mov = v_mov(src, make_compiler_temp(ctx)); - mov->mask = 0x1; + midgard_instruction *mov = ralloc(ctx, midgard_instruction); + *mov = v_mov(src, make_compiler_temp(ctx)); + mov->mask = 0x1; - branch->src[idx] = mov->dest; + branch->src[idx] = mov->dest; - if (stencil) { - unsigned swizzle = (branch->src[0] == ~0) ? COMPONENT_Y : COMPONENT_X; + if (stencil) { + unsigned swizzle = (branch->src[0] == ~0) ? COMPONENT_Y : COMPONENT_X; - for (unsigned c = 0; c < 16; ++c) - mov->swizzle[1][c] = swizzle; - } + for (unsigned c = 0; c < 16; ++c) + mov->swizzle[1][c] = swizzle; + } - for (unsigned i = 0; i < 3; ++i) { - if (!(*(units[i]))) { - *(units[i]) = mov; - mov->unit = unit_names[i]; - return; - } - } + for (unsigned i = 0; i < 3; ++i) { + if (!(*(units[i]))) { + *(units[i]) = mov; + mov->unit = unit_names[i]; + return; + } + } - unreachable("Could not schedule Z/S move to any unit"); + unreachable("Could not schedule Z/S move to any unit"); } static midgard_bundle -mir_schedule_alu( - compiler_context *ctx, - midgard_instruction **instructions, - uint16_t *liveness, - BITSET_WORD *worklist, unsigned len) +mir_schedule_alu(compiler_context *ctx, midgard_instruction **instructions, + uint16_t *liveness, BITSET_WORD *worklist, unsigned len) { - struct midgard_bundle bundle = {}; + struct midgard_bundle bundle = {}; - unsigned bytes_emitted = sizeof(bundle.control); + unsigned bytes_emitted = sizeof(bundle.control); - struct midgard_predicate predicate = { - .tag = TAG_ALU_4, - .destructive = true, - .exclude = ~0, - .constants = &bundle.constants, - }; + struct midgard_predicate predicate = { + .tag = TAG_ALU_4, + .destructive = true, + .exclude = ~0, + .constants = &bundle.constants, + }; - midgard_instruction *vmul = NULL; - midgard_instruction *vadd = NULL; - midgard_instruction *vlut = NULL; - midgard_instruction *smul = NULL; - midgard_instruction *sadd = NULL; - midgard_instruction *branch = NULL; + midgard_instruction *vmul = NULL; + midgard_instruction *vadd = NULL; + midgard_instruction *vlut = NULL; + midgard_instruction *smul = NULL; + midgard_instruction *sadd = NULL; + midgard_instruction *branch = NULL; - mir_choose_alu(&branch, instructions, liveness, worklist, len, &predicate, ALU_ENAB_BR_COMPACT); - mir_update_worklist(worklist, len, instructions, branch); - unsigned writeout = branch ? 
branch->writeout : 0; + mir_choose_alu(&branch, instructions, liveness, worklist, len, &predicate, + ALU_ENAB_BR_COMPACT); + mir_update_worklist(worklist, len, instructions, branch); + unsigned writeout = branch ? branch->writeout : 0; - if (branch && branch->branch.conditional) { - midgard_instruction *cond = mir_schedule_condition(ctx, &predicate, worklist, len, instructions, branch); + if (branch && branch->branch.conditional) { + midgard_instruction *cond = mir_schedule_condition( + ctx, &predicate, worklist, len, instructions, branch); - if (cond->unit == UNIT_VADD) - vadd = cond; - else if (cond->unit == UNIT_SMUL) - smul = cond; - else - unreachable("Bad condition"); - } + if (cond->unit == UNIT_VADD) + vadd = cond; + else if (cond->unit == UNIT_SMUL) + smul = cond; + else + unreachable("Bad condition"); + } - /* If we have a render target reference, schedule a move for it. Since - * this will be in sadd, we boost this to prevent scheduling csel into - * smul */ + /* If we have a render target reference, schedule a move for it. Since + * this will be in sadd, we boost this to prevent scheduling csel into + * smul */ - if (writeout && (branch->constants.u32[0] || ctx->inputs->is_blend)) { - sadd = ralloc(ctx, midgard_instruction); - *sadd = v_mov(~0, make_compiler_temp(ctx)); - sadd->unit = UNIT_SADD; - sadd->mask = 0x1; - sadd->has_inline_constant = true; - sadd->inline_constant = branch->constants.u32[0]; - branch->src[1] = sadd->dest; - branch->src_types[1] = sadd->dest_type; - } + if (writeout && (branch->constants.u32[0] || ctx->inputs->is_blend)) { + sadd = ralloc(ctx, midgard_instruction); + *sadd = v_mov(~0, make_compiler_temp(ctx)); + sadd->unit = UNIT_SADD; + sadd->mask = 0x1; + sadd->has_inline_constant = true; + sadd->inline_constant = branch->constants.u32[0]; + branch->src[1] = sadd->dest; + branch->src_types[1] = sadd->dest_type; + } - if (writeout) { - /* Propagate up */ - bundle.last_writeout = branch->last_writeout; + if (writeout) { + /* Propagate up */ + bundle.last_writeout = branch->last_writeout; - /* Mask off any conditionals. - * This prevents csel and csel_v being scheduled into smul - * since we might not have room for a conditional in vmul/sadd. - * This is important because both writeout and csel have same-bundle - * requirements on their dependencies. */ - predicate.no_cond = true; - } + /* Mask off any conditionals. + * This prevents csel and csel_v being scheduled into smul + * since we might not have room for a conditional in vmul/sadd. + * This is important because both writeout and csel have same-bundle + * requirements on their dependencies. 
*/ + predicate.no_cond = true; + } - /* Set r1.w to the return address so we can return from blend shaders */ - if (writeout) { - vadd = ralloc(ctx, midgard_instruction); - *vadd = v_mov(~0, make_compiler_temp(ctx)); + /* Set r1.w to the return address so we can return from blend shaders */ + if (writeout) { + vadd = ralloc(ctx, midgard_instruction); + *vadd = v_mov(~0, make_compiler_temp(ctx)); - if (!ctx->inputs->is_blend) { - vadd->op = midgard_alu_op_iadd; - vadd->src[0] = SSA_FIXED_REGISTER(31); - vadd->src_types[0] = nir_type_uint32; + if (!ctx->inputs->is_blend) { + vadd->op = midgard_alu_op_iadd; + vadd->src[0] = SSA_FIXED_REGISTER(31); + vadd->src_types[0] = nir_type_uint32; - for (unsigned c = 0; c < 16; ++c) - vadd->swizzle[0][c] = COMPONENT_X; + for (unsigned c = 0; c < 16; ++c) + vadd->swizzle[0][c] = COMPONENT_X; - vadd->has_inline_constant = true; - vadd->inline_constant = 0; - } else { - vadd->src[1] = SSA_FIXED_REGISTER(1); - vadd->src_types[0] = nir_type_uint32; + vadd->has_inline_constant = true; + vadd->inline_constant = 0; + } else { + vadd->src[1] = SSA_FIXED_REGISTER(1); + vadd->src_types[0] = nir_type_uint32; - for (unsigned c = 0; c < 16; ++c) - vadd->swizzle[1][c] = COMPONENT_W; - } + for (unsigned c = 0; c < 16; ++c) + vadd->swizzle[1][c] = COMPONENT_W; + } - vadd->unit = UNIT_VADD; - vadd->mask = 0x1; - branch->dest = vadd->dest; - branch->dest_type = vadd->dest_type; - } + vadd->unit = UNIT_VADD; + vadd->mask = 0x1; + branch->dest = vadd->dest; + branch->dest_type = vadd->dest_type; + } - if (writeout & PAN_WRITEOUT_Z) - mir_schedule_zs_write(ctx, &predicate, instructions, liveness, worklist, len, branch, &smul, &vadd, &vlut, false); + if (writeout & PAN_WRITEOUT_Z) + mir_schedule_zs_write(ctx, &predicate, instructions, liveness, worklist, + len, branch, &smul, &vadd, &vlut, false); - if (writeout & PAN_WRITEOUT_S) - mir_schedule_zs_write(ctx, &predicate, instructions, liveness, worklist, len, branch, &smul, &vadd, &vlut, true); + if (writeout & PAN_WRITEOUT_S) + mir_schedule_zs_write(ctx, &predicate, instructions, liveness, worklist, + len, branch, &smul, &vadd, &vlut, true); - mir_choose_alu(&smul, instructions, liveness, worklist, len, &predicate, UNIT_SMUL); + mir_choose_alu(&smul, instructions, liveness, worklist, len, &predicate, + UNIT_SMUL); - for (unsigned mode = 1; mode < 3; ++mode) { - predicate.move_mode = mode; - predicate.no_mask = writeout ? (1 << 3) : 0; - mir_choose_alu(&vlut, instructions, liveness, worklist, len, &predicate, UNIT_VLUT); - predicate.no_mask = 0; - mir_choose_alu(&vadd, instructions, liveness, worklist, len, &predicate, UNIT_VADD); - } + for (unsigned mode = 1; mode < 3; ++mode) { + predicate.move_mode = mode; + predicate.no_mask = writeout ? 
(1 << 3) : 0; + mir_choose_alu(&vlut, instructions, liveness, worklist, len, &predicate, + UNIT_VLUT); + predicate.no_mask = 0; + mir_choose_alu(&vadd, instructions, liveness, worklist, len, &predicate, + UNIT_VADD); + } - /* Reset */ - predicate.move_mode = 0; + /* Reset */ + predicate.move_mode = 0; - mir_update_worklist(worklist, len, instructions, vlut); - mir_update_worklist(worklist, len, instructions, vadd); - mir_update_worklist(worklist, len, instructions, smul); + mir_update_worklist(worklist, len, instructions, vlut); + mir_update_worklist(worklist, len, instructions, vadd); + mir_update_worklist(worklist, len, instructions, smul); - bool vadd_csel = vadd && OP_IS_CSEL(vadd->op); - bool smul_csel = smul && OP_IS_CSEL(smul->op); + bool vadd_csel = vadd && OP_IS_CSEL(vadd->op); + bool smul_csel = smul && OP_IS_CSEL(smul->op); - if (vadd_csel || smul_csel) { - midgard_instruction *ins = vadd_csel ? vadd : smul; - midgard_instruction *cond = mir_schedule_condition(ctx, &predicate, worklist, len, instructions, ins); + if (vadd_csel || smul_csel) { + midgard_instruction *ins = vadd_csel ? vadd : smul; + midgard_instruction *cond = mir_schedule_condition( + ctx, &predicate, worklist, len, instructions, ins); - if (cond->unit == UNIT_VMUL) - vmul = cond; - else if (cond->unit == UNIT_SADD) - sadd = cond; - else - unreachable("Bad condition"); - } + if (cond->unit == UNIT_VMUL) + vmul = cond; + else if (cond->unit == UNIT_SADD) + sadd = cond; + else + unreachable("Bad condition"); + } - /* Stage 2, let's schedule sadd before vmul for writeout */ - mir_choose_alu(&sadd, instructions, liveness, worklist, len, &predicate, UNIT_SADD); + /* Stage 2, let's schedule sadd before vmul for writeout */ + mir_choose_alu(&sadd, instructions, liveness, worklist, len, &predicate, + UNIT_SADD); - /* Check if writeout reads its own register */ + /* Check if writeout reads its own register */ - if (writeout) { - midgard_instruction *stages[] = { sadd, vadd, smul, vlut }; - unsigned src = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(0) : branch->src[0]; - unsigned writeout_mask = 0x0; - bool bad_writeout = false; + if (writeout) { + midgard_instruction *stages[] = {sadd, vadd, smul, vlut}; + unsigned src = + (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(0) : branch->src[0]; + unsigned writeout_mask = 0x0; + bool bad_writeout = false; - for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) { - if (!stages[i]) - continue; + for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) { + if (!stages[i]) + continue; - if (stages[i]->dest != src) - continue; + if (stages[i]->dest != src) + continue; - writeout_mask |= stages[i]->mask; - bad_writeout |= mir_has_arg(stages[i], branch->src[0]); - } + writeout_mask |= stages[i]->mask; + bad_writeout |= mir_has_arg(stages[i], branch->src[0]); + } - /* It's possible we'll be able to schedule something into vmul - * to fill r0. Let's peak into the future, trying to schedule - * vmul specially that way. */ + /* It's possible we'll be able to schedule something into vmul + * to fill r0. Let's peak into the future, trying to schedule + * vmul specially that way. 
*/ - unsigned full_mask = 0xF; + unsigned full_mask = 0xF; - if (!bad_writeout && writeout_mask != full_mask) { - predicate.unit = UNIT_VMUL; - predicate.dest = src; - predicate.mask = writeout_mask ^ full_mask; + if (!bad_writeout && writeout_mask != full_mask) { + predicate.unit = UNIT_VMUL; + predicate.dest = src; + predicate.mask = writeout_mask ^ full_mask; - struct midgard_instruction *peaked = - mir_choose_instruction(instructions, liveness, worklist, len, &predicate); + struct midgard_instruction *peaked = mir_choose_instruction( + instructions, liveness, worklist, len, &predicate); - if (peaked) { - vmul = peaked; - vmul->unit = UNIT_VMUL; - writeout_mask |= predicate.mask; - assert(writeout_mask == full_mask); - } + if (peaked) { + vmul = peaked; + vmul->unit = UNIT_VMUL; + writeout_mask |= predicate.mask; + assert(writeout_mask == full_mask); + } - /* Cleanup */ - predicate.dest = predicate.mask = 0; - } + /* Cleanup */ + predicate.dest = predicate.mask = 0; + } - /* Finally, add a move if necessary */ - if (bad_writeout || writeout_mask != full_mask) { - unsigned temp = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(0) : make_compiler_temp(ctx); + /* Finally, add a move if necessary */ + if (bad_writeout || writeout_mask != full_mask) { + unsigned temp = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(0) + : make_compiler_temp(ctx); - vmul = ralloc(ctx, midgard_instruction); - *vmul = v_mov(src, temp); - vmul->unit = UNIT_VMUL; - vmul->mask = full_mask ^ writeout_mask; + vmul = ralloc(ctx, midgard_instruction); + *vmul = v_mov(src, temp); + vmul->unit = UNIT_VMUL; + vmul->mask = full_mask ^ writeout_mask; - /* Rewrite to use our temp */ + /* Rewrite to use our temp */ - for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) { - if (stages[i]) { - mir_rewrite_index_dst_single(stages[i], src, temp); - mir_rewrite_index_src_single(stages[i], src, temp); - } - } + for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) { + if (stages[i]) { + mir_rewrite_index_dst_single(stages[i], src, temp); + mir_rewrite_index_src_single(stages[i], src, temp); + } + } - mir_rewrite_index_src_single(branch, src, temp); - } - } + mir_rewrite_index_src_single(branch, src, temp); + } + } - mir_choose_alu(&vmul, instructions, liveness, worklist, len, &predicate, UNIT_VMUL); + mir_choose_alu(&vmul, instructions, liveness, worklist, len, &predicate, + UNIT_VMUL); - mir_update_worklist(worklist, len, instructions, vmul); - mir_update_worklist(worklist, len, instructions, sadd); + mir_update_worklist(worklist, len, instructions, vmul); + mir_update_worklist(worklist, len, instructions, sadd); - bundle.has_embedded_constants = predicate.constant_mask != 0; + bundle.has_embedded_constants = predicate.constant_mask != 0; - unsigned padding = 0; + unsigned padding = 0; - /* Now that we have finished scheduling, build up the bundle */ - midgard_instruction *stages[] = { vmul, sadd, vadd, smul, vlut, branch }; + /* Now that we have finished scheduling, build up the bundle */ + midgard_instruction *stages[] = {vmul, sadd, vadd, smul, vlut, branch}; - for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) { - if (stages[i]) { - bundle.control |= stages[i]->unit; - bytes_emitted += bytes_for_instruction(stages[i]); - bundle.instructions[bundle.instruction_count++] = stages[i]; + for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) { + if (stages[i]) { + bundle.control |= stages[i]->unit; + bytes_emitted += bytes_for_instruction(stages[i]); + bundle.instructions[bundle.instruction_count++] = stages[i]; - /* If we branch, we can't spill to 
TLS since the store - * instruction will never get executed. We could try to - * break the bundle but this is probably easier for - * now. */ + /* If we branch, we can't spill to TLS since the store + * instruction will never get executed. We could try to + * break the bundle but this is probably easier for + * now. */ - if (branch) - stages[i]->no_spill |= (1 << REG_CLASS_WORK); - } - } + if (branch) + stages[i]->no_spill |= (1 << REG_CLASS_WORK); + } + } - /* Pad ALU op to nearest word */ + /* Pad ALU op to nearest word */ - if (bytes_emitted & 15) { - padding = 16 - (bytes_emitted & 15); - bytes_emitted += padding; - } + if (bytes_emitted & 15) { + padding = 16 - (bytes_emitted & 15); + bytes_emitted += padding; + } - /* Constants must always be quadwords */ - if (bundle.has_embedded_constants) - bytes_emitted += 16; + /* Constants must always be quadwords */ + if (bundle.has_embedded_constants) + bytes_emitted += 16; - /* Size ALU instruction for tag */ - bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1; + /* Size ALU instruction for tag */ + bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1; - bool tilebuf_wait = branch && branch->compact_branch && - branch->branch.target_type == TARGET_TILEBUF_WAIT; + bool tilebuf_wait = branch && branch->compact_branch && + branch->branch.target_type == TARGET_TILEBUF_WAIT; - /* MRT capable GPUs use a special writeout procedure */ - if ((writeout || tilebuf_wait) && !(ctx->quirks & MIDGARD_NO_UPPER_ALU)) - bundle.tag += 4; + /* MRT capable GPUs use a special writeout procedure */ + if ((writeout || tilebuf_wait) && !(ctx->quirks & MIDGARD_NO_UPPER_ALU)) + bundle.tag += 4; - bundle.padding = padding; - bundle.control |= bundle.tag; + bundle.padding = padding; + bundle.control |= bundle.tag; - return bundle; + return bundle; } /* Schedule a single block by iterating its instruction to create bundles. * While we go, tally about the bundle sizes to compute the block size. */ - static void schedule_block(compiler_context *ctx, midgard_block *block) { - /* Copy list to dynamic array */ - unsigned len = 0; - midgard_instruction **instructions = flatten_mir(block, &len); + /* Copy list to dynamic array */ + unsigned len = 0; + midgard_instruction **instructions = flatten_mir(block, &len); - if (!len) - return; + if (!len) + return; - /* Calculate dependencies and initial worklist */ - unsigned node_count = ctx->temp_count + 1; - mir_create_dependency_graph(instructions, len, node_count); + /* Calculate dependencies and initial worklist */ + unsigned node_count = ctx->temp_count + 1; + mir_create_dependency_graph(instructions, len, node_count); - /* Allocate the worklist */ - size_t sz = BITSET_WORDS(len) * sizeof(BITSET_WORD); - BITSET_WORD *worklist = calloc(sz, 1); - uint16_t *liveness = calloc(node_count, 2); - mir_initialize_worklist(worklist, instructions, len); + /* Allocate the worklist */ + size_t sz = BITSET_WORDS(len) * sizeof(BITSET_WORD); + BITSET_WORD *worklist = calloc(sz, 1); + uint16_t *liveness = calloc(node_count, 2); + mir_initialize_worklist(worklist, instructions, len); - /* Count the number of load/store instructions so we know when it's - * worth trying to schedule them in pairs. */ - unsigned num_ldst = 0; - for (unsigned i = 0; i < len; ++i) { - if (instructions[i]->type == TAG_LOAD_STORE_4) - ++num_ldst; - } + /* Count the number of load/store instructions so we know when it's + * worth trying to schedule them in pairs. 
*/ + unsigned num_ldst = 0; + for (unsigned i = 0; i < len; ++i) { + if (instructions[i]->type == TAG_LOAD_STORE_4) + ++num_ldst; + } - struct util_dynarray bundles; - util_dynarray_init(&bundles, NULL); + struct util_dynarray bundles; + util_dynarray_init(&bundles, NULL); - block->quadword_count = 0; + block->quadword_count = 0; - for (;;) { - unsigned tag = mir_choose_bundle(instructions, liveness, worklist, len, num_ldst); - midgard_bundle bundle; + for (;;) { + unsigned tag = + mir_choose_bundle(instructions, liveness, worklist, len, num_ldst); + midgard_bundle bundle; - if (tag == TAG_TEXTURE_4) - bundle = mir_schedule_texture(instructions, liveness, worklist, len, ctx->stage != MESA_SHADER_FRAGMENT); - else if (tag == TAG_LOAD_STORE_4) - bundle = mir_schedule_ldst(instructions, liveness, worklist, len, &num_ldst); - else if (tag == TAG_ALU_4) - bundle = mir_schedule_alu(ctx, instructions, liveness, worklist, len); - else - break; + if (tag == TAG_TEXTURE_4) + bundle = mir_schedule_texture(instructions, liveness, worklist, len, + ctx->stage != MESA_SHADER_FRAGMENT); + else if (tag == TAG_LOAD_STORE_4) + bundle = + mir_schedule_ldst(instructions, liveness, worklist, len, &num_ldst); + else if (tag == TAG_ALU_4) + bundle = mir_schedule_alu(ctx, instructions, liveness, worklist, len); + else + break; - for (unsigned i = 0; i < bundle.instruction_count; ++i) - bundle.instructions[i]->bundle_id = - ctx->quadword_count + block->quadword_count; + for (unsigned i = 0; i < bundle.instruction_count; ++i) + bundle.instructions[i]->bundle_id = + ctx->quadword_count + block->quadword_count; - util_dynarray_append(&bundles, midgard_bundle, bundle); - block->quadword_count += midgard_tag_props[bundle.tag].size; - } + util_dynarray_append(&bundles, midgard_bundle, bundle); + block->quadword_count += midgard_tag_props[bundle.tag].size; + } - assert(num_ldst == 0); + assert(num_ldst == 0); - /* We emitted bundles backwards; copy into the block in reverse-order */ + /* We emitted bundles backwards; copy into the block in reverse-order */ - util_dynarray_init(&block->bundles, block); - util_dynarray_foreach_reverse(&bundles, midgard_bundle, bundle) { - util_dynarray_append(&block->bundles, midgard_bundle, *bundle); - } - util_dynarray_fini(&bundles); + util_dynarray_init(&block->bundles, block); + util_dynarray_foreach_reverse(&bundles, midgard_bundle, bundle) + { + util_dynarray_append(&block->bundles, midgard_bundle, *bundle); + } + util_dynarray_fini(&bundles); - block->scheduled = true; - ctx->quadword_count += block->quadword_count; + block->scheduled = true; + ctx->quadword_count += block->quadword_count; - /* Reorder instructions to match bundled. First remove existing - * instructions and then recreate the list */ + /* Reorder instructions to match bundled. 
First remove existing + * instructions and then recreate the list */ - mir_foreach_instr_in_block_safe(block, ins) { - list_del(&ins->link); - } + mir_foreach_instr_in_block_safe(block, ins) { + list_del(&ins->link); + } - mir_foreach_instr_in_block_scheduled_rev(block, ins) { - list_add(&ins->link, &block->base.instructions); - } + mir_foreach_instr_in_block_scheduled_rev(block, ins) { + list_add(&ins->link, &block->base.instructions); + } - free(instructions); /* Allocated by flatten_mir() */ - free(worklist); - free(liveness); + free(instructions); /* Allocated by flatten_mir() */ + free(worklist); + free(liveness); } /* Insert moves to ensure we can register allocate load/store registers */ static void mir_lower_ldst(compiler_context *ctx) { - mir_foreach_instr_global_safe(ctx, I) { - if (I->type != TAG_LOAD_STORE_4) continue; + mir_foreach_instr_global_safe(ctx, I) { + if (I->type != TAG_LOAD_STORE_4) + continue; - mir_foreach_src(I, s) { - if (s == 0) continue; - if (I->src[s] == ~0) continue; - if (I->swizzle[s][0] == 0) continue; + mir_foreach_src(I, s) { + if (s == 0) + continue; + if (I->src[s] == ~0) + continue; + if (I->swizzle[s][0] == 0) + continue; - unsigned temp = make_compiler_temp(ctx); - midgard_instruction mov = v_mov(I->src[s], temp); - mov.mask = 0x1; - mov.dest_type = I->src_types[s]; - for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; ++c) - mov.swizzle[1][c] = I->swizzle[s][0]; + unsigned temp = make_compiler_temp(ctx); + midgard_instruction mov = v_mov(I->src[s], temp); + mov.mask = 0x1; + mov.dest_type = I->src_types[s]; + for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; ++c) + mov.swizzle[1][c] = I->swizzle[s][0]; - mir_insert_instruction_before(ctx, I, mov); - I->src[s] = mov.dest; - I->swizzle[s][0] = 0; - } - } + mir_insert_instruction_before(ctx, I, mov); + I->src[s] = mov.dest; + I->swizzle[s][0] = 0; + } + } } /* Insert moves to ensure we can register allocate blend writeout */ static void mir_lower_blend_input(compiler_context *ctx) { - mir_foreach_block(ctx, _blk) { - midgard_block *blk = (midgard_block *) _blk; + mir_foreach_block(ctx, _blk) { + midgard_block *blk = (midgard_block *)_blk; - if (list_is_empty(&blk->base.instructions)) - continue; + if (list_is_empty(&blk->base.instructions)) + continue; - midgard_instruction *I = mir_last_in_block(blk); + midgard_instruction *I = mir_last_in_block(blk); - if (!I || I->type != TAG_ALU_4 || !I->writeout) - continue; + if (!I || I->type != TAG_ALU_4 || !I->writeout) + continue; - mir_foreach_src(I, s) { - unsigned src = I->src[s]; + mir_foreach_src(I, s) { + unsigned src = I->src[s]; - if (src >= ctx->temp_count) - continue; + if (src >= ctx->temp_count) + continue; - if (!_blk->live_out[src]) - continue; + if (!_blk->live_out[src]) + continue; - unsigned temp = make_compiler_temp(ctx); - midgard_instruction mov = v_mov(src, temp); - mov.mask = 0xF; - mov.dest_type = nir_type_uint32; - mir_insert_instruction_before(ctx, I, mov); - I->src[s] = mov.dest; - } - } + unsigned temp = make_compiler_temp(ctx); + midgard_instruction mov = v_mov(src, temp); + mov.mask = 0xF; + mov.dest_type = nir_type_uint32; + mir_insert_instruction_before(ctx, I, mov); + I->src[s] = mov.dest; + } + } } void midgard_schedule_program(compiler_context *ctx) { - mir_lower_ldst(ctx); - midgard_promote_uniforms(ctx); + mir_lower_ldst(ctx); + midgard_promote_uniforms(ctx); - /* Must be lowered right before scheduling */ - mir_squeeze_index(ctx); - mir_lower_special_reads(ctx); + /* Must be lowered right before scheduling */ + 
mir_squeeze_index(ctx); + mir_lower_special_reads(ctx); - if (ctx->stage == MESA_SHADER_FRAGMENT) { - mir_invalidate_liveness(ctx); - mir_compute_liveness(ctx); - mir_lower_blend_input(ctx); - } + if (ctx->stage == MESA_SHADER_FRAGMENT) { + mir_invalidate_liveness(ctx); + mir_compute_liveness(ctx); + mir_lower_blend_input(ctx); + } - mir_squeeze_index(ctx); + mir_squeeze_index(ctx); - /* Lowering can introduce some dead moves */ + /* Lowering can introduce some dead moves */ - mir_foreach_block(ctx, _block) { - midgard_block *block = (midgard_block *) _block; - midgard_opt_dead_move_eliminate(ctx, block); - schedule_block(ctx, block); - } + mir_foreach_block(ctx, _block) { + midgard_block *block = (midgard_block *)_block; + midgard_opt_dead_move_eliminate(ctx, block); + schedule_block(ctx, block); + } } diff --git a/src/panfrost/midgard/mir.c b/src/panfrost/midgard/mir.c index a4ea28fcdc8..07cbfc89236 100644 --- a/src/panfrost/midgard/mir.c +++ b/src/panfrost/midgard/mir.c @@ -25,86 +25,93 @@ #include "compiler.h" #include "midgard_ops.h" -void mir_rewrite_index_src_single(midgard_instruction *ins, unsigned old, unsigned new) +void +mir_rewrite_index_src_single(midgard_instruction *ins, unsigned old, + unsigned new) { - mir_foreach_src(ins, i) { - if (ins->src[i] == old) - ins->src[i] = new; - } + mir_foreach_src(ins, i) { + if (ins->src[i] == old) + ins->src[i] = new; + } } -void mir_rewrite_index_dst_single(midgard_instruction *ins, unsigned old, unsigned new) +void +mir_rewrite_index_dst_single(midgard_instruction *ins, unsigned old, + unsigned new) { - if (ins->dest == old) - ins->dest = new; + if (ins->dest == old) + ins->dest = new; } static void -mir_rewrite_index_src_single_swizzle(midgard_instruction *ins, unsigned old, unsigned new, unsigned *swizzle) +mir_rewrite_index_src_single_swizzle(midgard_instruction *ins, unsigned old, + unsigned new, unsigned *swizzle) { - for (unsigned i = 0; i < ARRAY_SIZE(ins->src); ++i) { - if (ins->src[i] != old) continue; + for (unsigned i = 0; i < ARRAY_SIZE(ins->src); ++i) { + if (ins->src[i] != old) + continue; - ins->src[i] = new; - mir_compose_swizzle(ins->swizzle[i], swizzle, ins->swizzle[i]); - } + ins->src[i] = new; + mir_compose_swizzle(ins->swizzle[i], swizzle, ins->swizzle[i]); + } } void mir_rewrite_index_src(compiler_context *ctx, unsigned old, unsigned new) { - mir_foreach_instr_global(ctx, ins) { - mir_rewrite_index_src_single(ins, old, new); - } + mir_foreach_instr_global(ctx, ins) { + mir_rewrite_index_src_single(ins, old, new); + } } void -mir_rewrite_index_src_swizzle(compiler_context *ctx, unsigned old, unsigned new, unsigned *swizzle) +mir_rewrite_index_src_swizzle(compiler_context *ctx, unsigned old, unsigned new, + unsigned *swizzle) { - mir_foreach_instr_global(ctx, ins) { - mir_rewrite_index_src_single_swizzle(ins, old, new, swizzle); - } + mir_foreach_instr_global(ctx, ins) { + mir_rewrite_index_src_single_swizzle(ins, old, new, swizzle); + } } void mir_rewrite_index_dst(compiler_context *ctx, unsigned old, unsigned new) { - mir_foreach_instr_global(ctx, ins) { - mir_rewrite_index_dst_single(ins, old, new); - } + mir_foreach_instr_global(ctx, ins) { + mir_rewrite_index_dst_single(ins, old, new); + } - /* Implicitly written before the shader */ - if (ctx->blend_input == old) - ctx->blend_input = new; + /* Implicitly written before the shader */ + if (ctx->blend_input == old) + ctx->blend_input = new; - if (ctx->blend_src1 == old) - ctx->blend_src1 = new; + if (ctx->blend_src1 == old) + ctx->blend_src1 = new; } void 
mir_rewrite_index(compiler_context *ctx, unsigned old, unsigned new) { - mir_rewrite_index_src(ctx, old, new); - mir_rewrite_index_dst(ctx, old, new); + mir_rewrite_index_src(ctx, old, new); + mir_rewrite_index_dst(ctx, old, new); } unsigned mir_use_count(compiler_context *ctx, unsigned value) { - unsigned used_count = 0; + unsigned used_count = 0; - mir_foreach_instr_global(ctx, ins) { - if (mir_has_arg(ins, value)) - ++used_count; - } + mir_foreach_instr_global(ctx, ins) { + if (mir_has_arg(ins, value)) + ++used_count; + } - if (ctx->blend_input == value) - ++used_count; + if (ctx->blend_input == value) + ++used_count; - if (ctx->blend_src1 == value) - ++used_count; + if (ctx->blend_src1 == value) + ++used_count; - return used_count; + return used_count; } /* Checks if a value is used only once (or totally dead), which is an important @@ -113,50 +120,56 @@ mir_use_count(compiler_context *ctx, unsigned value) bool mir_single_use(compiler_context *ctx, unsigned value) { - /* We can replicate constants in places so who cares */ - if (value == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) - return true; + /* We can replicate constants in places so who cares */ + if (value == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) + return true; - return mir_use_count(ctx, value) <= 1; + return mir_use_count(ctx, value) <= 1; } bool mir_nontrivial_mod(midgard_instruction *ins, unsigned i, bool check_swizzle) { - bool is_int = midgard_is_integer_op(ins->op); + bool is_int = midgard_is_integer_op(ins->op); - if (is_int) { - if (ins->src_shift[i]) return true; - } else { - if (ins->src_neg[i]) return true; - if (ins->src_abs[i]) return true; - } + if (is_int) { + if (ins->src_shift[i]) + return true; + } else { + if (ins->src_neg[i]) + return true; + if (ins->src_abs[i]) + return true; + } - if (ins->dest_type != ins->src_types[i]) return true; + if (ins->dest_type != ins->src_types[i]) + return true; - if (check_swizzle) { - for (unsigned c = 0; c < 16; ++c) { - if (!(ins->mask & (1 << c))) continue; - if (ins->swizzle[i][c] != c) return true; - } - } + if (check_swizzle) { + for (unsigned c = 0; c < 16; ++c) { + if (!(ins->mask & (1 << c))) + continue; + if (ins->swizzle[i][c] != c) + return true; + } + } - return false; + return false; } bool mir_nontrivial_outmod(midgard_instruction *ins) { - bool is_int = midgard_is_integer_op(ins->op); - unsigned mod = ins->outmod; + bool is_int = midgard_is_integer_op(ins->op); + unsigned mod = ins->outmod; - if (ins->dest_type != ins->src_types[1]) - return true; + if (ins->dest_type != ins->src_types[1]) + return true; - if (is_int) - return mod != midgard_outmod_keeplo; - else - return mod != midgard_outmod_none; + if (is_int) + return mod != midgard_outmod_keeplo; + else + return mod != midgard_outmod_none; } /* 128 / sz = exp2(log2(128 / sz)) @@ -168,32 +181,32 @@ mir_nontrivial_outmod(midgard_instruction *ins) static unsigned mir_components_for_bits(unsigned bits) { - return 1 << (7 - util_logbase2(bits)); + return 1 << (7 - util_logbase2(bits)); } unsigned mir_components_for_type(nir_alu_type T) { - unsigned sz = nir_alu_type_get_type_size(T); - return mir_components_for_bits(sz); + unsigned sz = nir_alu_type_get_type_size(T); + return mir_components_for_bits(sz); } uint16_t mir_from_bytemask(uint16_t bytemask, unsigned bits) { - unsigned value = 0; - unsigned count = bits / 8; + unsigned value = 0; + unsigned count = bits / 8; - for (unsigned c = 0, d = 0; c < 16; c += count, ++d) { - bool a = (bytemask & (1 << c)) != 0; + for (unsigned c = 0, d = 0; c < 16; c += count, 
++d) { + bool a = (bytemask & (1 << c)) != 0; - for (unsigned q = c; q < count; ++q) - assert(((bytemask & (1 << q)) != 0) == a); + for (unsigned q = c; q < count; ++q) + assert(((bytemask & (1 << q)) != 0) == a); - value |= (a << d); - } + value |= (a << d); + } - return value; + return value; } /* Rounds up a bytemask to fill a given component count. Iterate each @@ -202,18 +215,18 @@ mir_from_bytemask(uint16_t bytemask, unsigned bits) uint16_t mir_round_bytemask_up(uint16_t mask, unsigned bits) { - unsigned bytes = bits / 8; - unsigned maxmask = mask_of(bytes); - unsigned channels = mir_components_for_bits(bits); + unsigned bytes = bits / 8; + unsigned maxmask = mask_of(bytes); + unsigned channels = mir_components_for_bits(bits); - for (unsigned c = 0; c < channels; ++c) { - unsigned submask = maxmask << (c * bytes); + for (unsigned c = 0; c < channels; ++c) { + unsigned submask = maxmask << (c * bytes); - if (mask & submask) - mask |= submask; - } + if (mask & submask) + mask |= submask; + } - return mask; + return mask; } /* Grabs the per-byte mask of an instruction (as opposed to per-component) */ @@ -221,15 +234,15 @@ mir_round_bytemask_up(uint16_t mask, unsigned bits) uint16_t mir_bytemask(midgard_instruction *ins) { - unsigned type_size = nir_alu_type_get_type_size(ins->dest_type); - return pan_to_bytemask(type_size, ins->mask); + unsigned type_size = nir_alu_type_get_type_size(ins->dest_type); + return pan_to_bytemask(type_size, ins->mask); } void mir_set_bytemask(midgard_instruction *ins, uint16_t bytemask) { - unsigned type_size = nir_alu_type_get_type_size(ins->dest_type); - ins->mask = mir_from_bytemask(bytemask, type_size); + unsigned type_size = nir_alu_type_get_type_size(ins->dest_type); + ins->mask = mir_from_bytemask(bytemask, type_size); } /* @@ -240,24 +253,24 @@ mir_set_bytemask(midgard_instruction *ins, uint16_t bytemask) signed mir_upper_override(midgard_instruction *ins, unsigned inst_size) { - unsigned type_size = nir_alu_type_get_type_size(ins->dest_type); + unsigned type_size = nir_alu_type_get_type_size(ins->dest_type); - /* If the sizes are the same, there's nothing to override */ - if (type_size == inst_size) - return -1; + /* If the sizes are the same, there's nothing to override */ + if (type_size == inst_size) + return -1; - /* There are 16 bytes per vector, so there are (16/bytes) - * components per vector. So the magic half is half of - * (16/bytes), which simplifies to 8/bytes = 8 / (bits / 8) = 64 / bits - * */ + /* There are 16 bytes per vector, so there are (16/bytes) + * components per vector. So the magic half is half of + * (16/bytes), which simplifies to 8/bytes = 8 / (bits / 8) = 64 / bits + * */ - unsigned threshold = mir_components_for_bits(type_size) >> 1; + unsigned threshold = mir_components_for_bits(type_size) >> 1; - /* How many components did we shift over? */ - unsigned zeroes = __builtin_ctz(ins->mask); + /* How many components did we shift over? */ + unsigned zeroes = __builtin_ctz(ins->mask); - /* Did we hit the threshold? */ - return (zeroes >= threshold) ? threshold : 0; + /* Did we hit the threshold? */ + return (zeroes >= threshold) ? 
threshold : 0; } /* Creates a mask of the components of a node read by an instruction, by @@ -269,60 +282,64 @@ mir_upper_override(midgard_instruction *ins, unsigned inst_size) */ static uint16_t -mir_bytemask_of_read_components_single(unsigned *swizzle, unsigned inmask, unsigned bits) +mir_bytemask_of_read_components_single(unsigned *swizzle, unsigned inmask, + unsigned bits) { - unsigned cmask = 0; + unsigned cmask = 0; - for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) { - if (!(inmask & (1 << c))) continue; - cmask |= (1 << swizzle[c]); - } + for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) { + if (!(inmask & (1 << c))) + continue; + cmask |= (1 << swizzle[c]); + } - return pan_to_bytemask(bits, cmask); + return pan_to_bytemask(bits, cmask); } uint16_t mir_bytemask_of_read_components_index(midgard_instruction *ins, unsigned i) { - /* Conditional branches read one 32-bit component = 4 bytes (TODO: multi branch??) */ - if (ins->compact_branch && ins->branch.conditional && (i == 0)) - return 0xF; + /* Conditional branches read one 32-bit component = 4 bytes (TODO: multi + * branch??) */ + if (ins->compact_branch && ins->branch.conditional && (i == 0)) + return 0xF; - /* ALU ops act componentwise so we need to pay attention to - * their mask. Texture/ldst does not so we don't clamp source - * readmasks based on the writemask */ - unsigned qmask = ~0; + /* ALU ops act componentwise so we need to pay attention to + * their mask. Texture/ldst does not so we don't clamp source + * readmasks based on the writemask */ + unsigned qmask = ~0; - /* Handle dot products and things */ - if (ins->type == TAG_ALU_4 && !ins->compact_branch) { - unsigned props = alu_opcode_props[ins->op].props; + /* Handle dot products and things */ + if (ins->type == TAG_ALU_4 && !ins->compact_branch) { + unsigned props = alu_opcode_props[ins->op].props; - unsigned channel_override = GET_CHANNEL_COUNT(props); + unsigned channel_override = GET_CHANNEL_COUNT(props); - if (channel_override) - qmask = mask_of(channel_override); - else - qmask = ins->mask; - } + if (channel_override) + qmask = mask_of(channel_override); + else + qmask = ins->mask; + } - return mir_bytemask_of_read_components_single(ins->swizzle[i], qmask, - nir_alu_type_get_type_size(ins->src_types[i])); + return mir_bytemask_of_read_components_single( + ins->swizzle[i], qmask, nir_alu_type_get_type_size(ins->src_types[i])); } uint16_t mir_bytemask_of_read_components(midgard_instruction *ins, unsigned node) { - uint16_t mask = 0; + uint16_t mask = 0; - if (node == ~0) - return 0; + if (node == ~0) + return 0; - mir_foreach_src(ins, i) { - if (ins->src[i] != node) continue; - mask |= mir_bytemask_of_read_components_index(ins, i); - } + mir_foreach_src(ins, i) { + if (ins->src[i] != node) + continue; + mask |= mir_bytemask_of_read_components_index(ins, i); + } - return mask; + return mask; } /* Register allocation occurs after instruction scheduling, which is fine until @@ -335,94 +352,97 @@ mir_bytemask_of_read_components(midgard_instruction *ins, unsigned node) static midgard_bundle mir_bundle_for_op(compiler_context *ctx, midgard_instruction ins) { - midgard_instruction *u = mir_upload_ins(ctx, ins); + midgard_instruction *u = mir_upload_ins(ctx, ins); - midgard_bundle bundle = { - .tag = ins.type, - .instruction_count = 1, - .instructions = { u }, - }; + midgard_bundle bundle = { + .tag = ins.type, + .instruction_count = 1, + .instructions = {u}, + }; - if (bundle.tag == TAG_ALU_4) { - assert(OP_IS_MOVE(u->op)); - u->unit = UNIT_VMUL; + if (bundle.tag == 
TAG_ALU_4) { + assert(OP_IS_MOVE(u->op)); + u->unit = UNIT_VMUL; - size_t bytes_emitted = sizeof(uint32_t) + sizeof(midgard_reg_info) + sizeof(midgard_vector_alu); - bundle.padding = ~(bytes_emitted - 1) & 0xF; - bundle.control = ins.type | u->unit; - } + size_t bytes_emitted = sizeof(uint32_t) + sizeof(midgard_reg_info) + + sizeof(midgard_vector_alu); + bundle.padding = ~(bytes_emitted - 1) & 0xF; + bundle.control = ins.type | u->unit; + } - return bundle; + return bundle; } static unsigned mir_bundle_idx_for_ins(midgard_instruction *tag, midgard_block *block) { - midgard_bundle *bundles = - (midgard_bundle *) block->bundles.data; + midgard_bundle *bundles = (midgard_bundle *)block->bundles.data; - size_t count = (block->bundles.size / sizeof(midgard_bundle)); + size_t count = (block->bundles.size / sizeof(midgard_bundle)); - for (unsigned i = 0; i < count; ++i) { - for (unsigned j = 0; j < bundles[i].instruction_count; ++j) { - if (bundles[i].instructions[j] == tag) - return i; - } - } + for (unsigned i = 0; i < count; ++i) { + for (unsigned j = 0; j < bundles[i].instruction_count; ++j) { + if (bundles[i].instructions[j] == tag) + return i; + } + } - mir_print_instruction(tag); - unreachable("Instruction not scheduled in block"); + mir_print_instruction(tag); + unreachable("Instruction not scheduled in block"); } midgard_instruction * -mir_insert_instruction_before_scheduled( - compiler_context *ctx, - midgard_block *block, - midgard_instruction *tag, - midgard_instruction ins) +mir_insert_instruction_before_scheduled(compiler_context *ctx, + midgard_block *block, + midgard_instruction *tag, + midgard_instruction ins) { - unsigned before = mir_bundle_idx_for_ins(tag, block); - size_t count = util_dynarray_num_elements(&block->bundles, midgard_bundle); - UNUSED void *unused = util_dynarray_grow(&block->bundles, midgard_bundle, 1); + unsigned before = mir_bundle_idx_for_ins(tag, block); + size_t count = util_dynarray_num_elements(&block->bundles, midgard_bundle); + UNUSED void *unused = util_dynarray_grow(&block->bundles, midgard_bundle, 1); - midgard_bundle *bundles = (midgard_bundle *) block->bundles.data; - memmove(bundles + before + 1, bundles + before, (count - before) * sizeof(midgard_bundle)); - midgard_bundle *before_bundle = bundles + before + 1; + midgard_bundle *bundles = (midgard_bundle *)block->bundles.data; + memmove(bundles + before + 1, bundles + before, + (count - before) * sizeof(midgard_bundle)); + midgard_bundle *before_bundle = bundles + before + 1; - midgard_bundle new = mir_bundle_for_op(ctx, ins); - memcpy(bundles + before, &new, sizeof(new)); + midgard_bundle new = mir_bundle_for_op(ctx, ins); + memcpy(bundles + before, &new, sizeof(new)); - list_addtail(&new.instructions[0]->link, &before_bundle->instructions[0]->link); - block->quadword_count += midgard_tag_props[new.tag].size; + list_addtail(&new.instructions[0]->link, + &before_bundle->instructions[0]->link); + block->quadword_count += midgard_tag_props[new.tag].size; - return new.instructions[0]; + return new.instructions[0]; } midgard_instruction * -mir_insert_instruction_after_scheduled( - compiler_context *ctx, - midgard_block *block, - midgard_instruction *tag, - midgard_instruction ins) +mir_insert_instruction_after_scheduled(compiler_context *ctx, + midgard_block *block, + midgard_instruction *tag, + midgard_instruction ins) { - /* We need to grow the bundles array to add our new bundle */ - size_t count = util_dynarray_num_elements(&block->bundles, midgard_bundle); - UNUSED void *unused = 
util_dynarray_grow(&block->bundles, midgard_bundle, 1); + /* We need to grow the bundles array to add our new bundle */ + size_t count = util_dynarray_num_elements(&block->bundles, midgard_bundle); + UNUSED void *unused = util_dynarray_grow(&block->bundles, midgard_bundle, 1); - /* Find the bundle that we want to insert after */ - unsigned after = mir_bundle_idx_for_ins(tag, block); + /* Find the bundle that we want to insert after */ + unsigned after = mir_bundle_idx_for_ins(tag, block); - /* All the bundles after that one, we move ahead by one */ - midgard_bundle *bundles = (midgard_bundle *) block->bundles.data; - memmove(bundles + after + 2, bundles + after + 1, (count - after - 1) * sizeof(midgard_bundle)); - midgard_bundle *after_bundle = bundles + after; + /* All the bundles after that one, we move ahead by one */ + midgard_bundle *bundles = (midgard_bundle *)block->bundles.data; + memmove(bundles + after + 2, bundles + after + 1, + (count - after - 1) * sizeof(midgard_bundle)); + midgard_bundle *after_bundle = bundles + after; - midgard_bundle new = mir_bundle_for_op(ctx, ins); - memcpy(bundles + after + 1, &new, sizeof(new)); - list_add(&new.instructions[0]->link, &after_bundle->instructions[after_bundle->instruction_count - 1]->link); - block->quadword_count += midgard_tag_props[new.tag].size; + midgard_bundle new = mir_bundle_for_op(ctx, ins); + memcpy(bundles + after + 1, &new, sizeof(new)); + list_add( + &new.instructions[0]->link, + &after_bundle->instructions[after_bundle->instruction_count - 1]->link); + block->quadword_count += midgard_tag_props[new.tag].size; - return new.instructions[0]; + return new.instructions[0]; } /* Flip the first-two arguments of a (binary) op. Currently ALU @@ -431,32 +451,32 @@ mir_insert_instruction_after_scheduled( void mir_flip(midgard_instruction *ins) { - unsigned temp = ins->src[0]; - ins->src[0] = ins->src[1]; - ins->src[1] = temp; + unsigned temp = ins->src[0]; + ins->src[0] = ins->src[1]; + ins->src[1] = temp; - assert(ins->type == TAG_ALU_4); + assert(ins->type == TAG_ALU_4); - temp = ins->src_types[0]; - ins->src_types[0] = ins->src_types[1]; - ins->src_types[1] = temp; + temp = ins->src_types[0]; + ins->src_types[0] = ins->src_types[1]; + ins->src_types[1] = temp; - temp = ins->src_abs[0]; - ins->src_abs[0] = ins->src_abs[1]; - ins->src_abs[1] = temp; + temp = ins->src_abs[0]; + ins->src_abs[0] = ins->src_abs[1]; + ins->src_abs[1] = temp; - temp = ins->src_neg[0]; - ins->src_neg[0] = ins->src_neg[1]; - ins->src_neg[1] = temp; + temp = ins->src_neg[0]; + ins->src_neg[0] = ins->src_neg[1]; + ins->src_neg[1] = temp; - temp = ins->src_invert[0]; - ins->src_invert[0] = ins->src_invert[1]; - ins->src_invert[1] = temp; + temp = ins->src_invert[0]; + ins->src_invert[0] = ins->src_invert[1]; + ins->src_invert[1] = temp; - unsigned temp_swizzle[16]; - memcpy(temp_swizzle, ins->swizzle[0], sizeof(ins->swizzle[0])); - memcpy(ins->swizzle[0], ins->swizzle[1], sizeof(ins->swizzle[0])); - memcpy(ins->swizzle[1], temp_swizzle, sizeof(ins->swizzle[0])); + unsigned temp_swizzle[16]; + memcpy(temp_swizzle, ins->swizzle[0], sizeof(ins->swizzle[0])); + memcpy(ins->swizzle[0], ins->swizzle[1], sizeof(ins->swizzle[0])); + memcpy(ins->swizzle[1], temp_swizzle, sizeof(ins->swizzle[0])); } /* Before squashing, calculate ctx->temp_count just by observing the MIR */ @@ -464,15 +484,15 @@ mir_flip(midgard_instruction *ins) void mir_compute_temp_count(compiler_context *ctx) { - if (ctx->temp_count) - return; + if (ctx->temp_count) + return; - unsigned max_dest = 
0; + unsigned max_dest = 0; - mir_foreach_instr_global(ctx, ins) { - if (ins->dest < SSA_FIXED_MINIMUM) - max_dest = MAX2(max_dest, ins->dest + 1); - } + mir_foreach_instr_global(ctx, ins) { + if (ins->dest < SSA_FIXED_MINIMUM) + max_dest = MAX2(max_dest, ins->dest + 1); + } - ctx->temp_count = max_dest; + ctx->temp_count = max_dest; } diff --git a/src/panfrost/midgard/mir_promote_uniforms.c b/src/panfrost/midgard/mir_promote_uniforms.c index a6396749f48..c4909994712 100644 --- a/src/panfrost/midgard/mir_promote_uniforms.c +++ b/src/panfrost/midgard/mir_promote_uniforms.c @@ -24,9 +24,9 @@ * Alyssa Rosenzweig */ -#include "compiler.h" #include "util/u_math.h" #include "util/u_memory.h" +#include "compiler.h" /* This pass promotes reads from UBOs to register-mapped uniforms. This saves * both instructions and work register pressure, but it reduces the work @@ -39,17 +39,14 @@ static bool mir_is_ubo(midgard_instruction *ins) { - return (ins->type == TAG_LOAD_STORE_4) && - (OP_IS_UBO_READ(ins->op)); + return (ins->type == TAG_LOAD_STORE_4) && (OP_IS_UBO_READ(ins->op)); } static bool mir_is_direct_aligned_ubo(midgard_instruction *ins) { - return mir_is_ubo(ins) && - !(ins->constants.u32[0] & 0xF) && - (ins->src[1] == ~0) && - (ins->src[2] == ~0); + return mir_is_ubo(ins) && !(ins->constants.u32[0] & 0xF) && + (ins->src[1] == ~0) && (ins->src[2] == ~0); } /* Represents use data for a single UBO */ @@ -57,38 +54,39 @@ mir_is_direct_aligned_ubo(midgard_instruction *ins) #define MAX_UBO_QWORDS (65536 / 16) struct mir_ubo_block { - BITSET_DECLARE(uses, MAX_UBO_QWORDS); - BITSET_DECLARE(pushed, MAX_UBO_QWORDS); + BITSET_DECLARE(uses, MAX_UBO_QWORDS); + BITSET_DECLARE(pushed, MAX_UBO_QWORDS); }; struct mir_ubo_analysis { - /* Per block analysis */ - unsigned nr_blocks; - struct mir_ubo_block *blocks; + /* Per block analysis */ + unsigned nr_blocks; + struct mir_ubo_block *blocks; }; static struct mir_ubo_analysis mir_analyze_ranges(compiler_context *ctx) { - struct mir_ubo_analysis res = { - .nr_blocks = ctx->nir->info.num_ubos + 1, - }; + struct mir_ubo_analysis res = { + .nr_blocks = ctx->nir->info.num_ubos + 1, + }; - res.blocks = calloc(res.nr_blocks, sizeof(struct mir_ubo_block)); + res.blocks = calloc(res.nr_blocks, sizeof(struct mir_ubo_block)); - mir_foreach_instr_global(ctx, ins) { - if (!mir_is_direct_aligned_ubo(ins)) continue; + mir_foreach_instr_global(ctx, ins) { + if (!mir_is_direct_aligned_ubo(ins)) + continue; - unsigned ubo = midgard_unpack_ubo_index_imm(ins->load_store); - unsigned offset = ins->constants.u32[0] / 16; + unsigned ubo = midgard_unpack_ubo_index_imm(ins->load_store); + unsigned offset = ins->constants.u32[0] / 16; - assert(ubo < res.nr_blocks); + assert(ubo < res.nr_blocks); - if (offset < MAX_UBO_QWORDS) - BITSET_SET(res.blocks[ubo].uses, offset); - } + if (offset < MAX_UBO_QWORDS) + BITSET_SET(res.blocks[ubo].uses, offset); + } - return res; + return res; } /* Select UBO words to push. A sophisticated implementation would consider the @@ -96,32 +94,33 @@ mir_analyze_ranges(compiler_context *ctx) * sophisticated. Select from the last UBO first to prioritize sysvals. 
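The strategy described in this comment is a greedy walk: every used vec4 (16-byte) slot costs four 32-bit push words, and UBOs are visited from the highest index downwards so the sysval UBO wins if the push budget runs out, which is what the mir_pick_ubo() just below does over its per-UBO bitsets. A minimal standalone sketch of that packing, using toy stand-in types (sketch_push, sketch_word, a plain 32-slot use mask per UBO) rather than the driver's panfrost_ubo_push and mir_ubo_block structures:

#include <stdint.h>
#include <stdio.h>

#define SKETCH_MAX_PUSH 128 /* stand-in for the real PAN_MAX_PUSH budget */

struct sketch_word {
   unsigned ubo;
   unsigned offset; /* byte offset of the 32-bit word within the UBO */
};

struct sketch_push {
   unsigned count;
   struct sketch_word words[SKETCH_MAX_PUSH];
};

/* uses[ubo] is a toy 32-slot bitmask of used vec4s for that UBO. */
static void
sketch_pick_ubo(struct sketch_push *push, const uint32_t *uses,
                unsigned nr_ubos, unsigned max_words)
{
   /* Last UBO first, so sysvals get pushed before user uniforms. */
   for (int ubo = (int)nr_ubos - 1; ubo >= 0; --ubo) {
      for (unsigned vec4 = 0; vec4 < 32; ++vec4) {
         if (!(uses[ubo] & (1u << vec4)))
            continue;

         /* Each vec4 expands to four words; stop at the budget. */
         if (push->count + 4 > max_words)
            return;

         for (unsigned w = 0; w < 4; ++w) {
            push->words[push->count].ubo = ubo;
            push->words[push->count].offset = vec4 * 16 + w * 4;
            push->count++;
         }
      }
   }
}

int
main(void)
{
   uint32_t uses[2] = {0x5, 0x2}; /* UBO 0: slots 0 and 2; UBO 1: slot 1 */
   struct sketch_push push = {0};

   sketch_pick_ubo(&push, uses, 2, SKETCH_MAX_PUSH);

   for (unsigned i = 0; i < push.count; ++i)
      printf("word %u -> ubo %u, byte offset %u\n", i, push.words[i].ubo,
             push.words[i].offset);

   return 0;
}

With this input, the three used vec4s expand to twelve push words, and the UBO 1 slot lands first because of the reversed walk.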
*/ static void -mir_pick_ubo(struct panfrost_ubo_push *push, struct mir_ubo_analysis *analysis, unsigned max_qwords) +mir_pick_ubo(struct panfrost_ubo_push *push, struct mir_ubo_analysis *analysis, + unsigned max_qwords) { - unsigned max_words = MIN2(PAN_MAX_PUSH, max_qwords * 4); + unsigned max_words = MIN2(PAN_MAX_PUSH, max_qwords * 4); - for (signed ubo = analysis->nr_blocks - 1; ubo >= 0; --ubo) { - struct mir_ubo_block *block = &analysis->blocks[ubo]; + for (signed ubo = analysis->nr_blocks - 1; ubo >= 0; --ubo) { + struct mir_ubo_block *block = &analysis->blocks[ubo]; - unsigned vec4; - BITSET_FOREACH_SET(vec4, block->uses, MAX_UBO_QWORDS) { - /* Don't push more than possible */ - if (push->count > max_words - 4) - return; + unsigned vec4; + BITSET_FOREACH_SET(vec4, block->uses, MAX_UBO_QWORDS) { + /* Don't push more than possible */ + if (push->count > max_words - 4) + return; - for (unsigned offs = 0; offs < 4; ++offs) { - struct panfrost_ubo_word word = { - .ubo = ubo, - .offset = (vec4 * 16) + (offs * 4), - }; + for (unsigned offs = 0; offs < 4; ++offs) { + struct panfrost_ubo_word word = { + .ubo = ubo, + .offset = (vec4 * 16) + (offs * 4), + }; - push->words[push->count++] = word; - } + push->words[push->count++] = word; + } - /* Mark it as pushed so we can rewrite */ - BITSET_SET(block->pushed, vec4); - } - } + /* Mark it as pushed so we can rewrite */ + BITSET_SET(block->pushed, vec4); + } + } } #if 0 @@ -154,80 +153,81 @@ mir_dump_ubo_analysis(struct mir_ubo_analysis *res) static unsigned mir_promoteable_uniform_count(struct mir_ubo_analysis *analysis) { - unsigned count = 0; + unsigned count = 0; - for (unsigned i = 0; i < analysis->nr_blocks; ++i) { - BITSET_WORD *uses = analysis->blocks[i].uses; + for (unsigned i = 0; i < analysis->nr_blocks; ++i) { + BITSET_WORD *uses = analysis->blocks[i].uses; - for (unsigned w = 0; w < BITSET_WORDS(MAX_UBO_QWORDS); ++w) - count += util_bitcount(uses[w]); - } + for (unsigned w = 0; w < BITSET_WORDS(MAX_UBO_QWORDS); ++w) + count += util_bitcount(uses[w]); + } - return count; + return count; } static unsigned mir_count_live(uint16_t *live, unsigned temp_count) { - unsigned count = 0; + unsigned count = 0; - for (unsigned i = 0; i < temp_count; ++i) - count += util_bitcount(live[i]); + for (unsigned i = 0; i < temp_count; ++i) + count += util_bitcount(live[i]); - return count; + return count; } static unsigned mir_estimate_pressure(compiler_context *ctx) { - mir_invalidate_liveness(ctx); - mir_compute_liveness(ctx); + mir_invalidate_liveness(ctx); + mir_compute_liveness(ctx); - unsigned max_live = 0; + unsigned max_live = 0; - mir_foreach_block(ctx, _block) { - midgard_block *block = (midgard_block *) _block; - uint16_t *live = mem_dup(block->base.live_out, ctx->temp_count * sizeof(uint16_t)); + mir_foreach_block(ctx, _block) { + midgard_block *block = (midgard_block *)_block; + uint16_t *live = + mem_dup(block->base.live_out, ctx->temp_count * sizeof(uint16_t)); - mir_foreach_instr_in_block_rev(block, ins) { - unsigned count = mir_count_live(live, ctx->temp_count); - max_live = MAX2(max_live, count); - mir_liveness_ins_update(live, ins, ctx->temp_count); - } + mir_foreach_instr_in_block_rev(block, ins) { + unsigned count = mir_count_live(live, ctx->temp_count); + max_live = MAX2(max_live, count); + mir_liveness_ins_update(live, ins, ctx->temp_count); + } - free(live); - } + free(live); + } - return DIV_ROUND_UP(max_live, 16); + return DIV_ROUND_UP(max_live, 16); } static unsigned mir_work_heuristic(compiler_context *ctx, struct 
mir_ubo_analysis *analysis) { - unsigned uniform_count = mir_promoteable_uniform_count(analysis); + unsigned uniform_count = mir_promoteable_uniform_count(analysis); - /* If there are 8 or fewer uniforms, it doesn't matter what we do, so - * allow as many work registers as needed */ + /* If there are 8 or fewer uniforms, it doesn't matter what we do, so + * allow as many work registers as needed */ - if (uniform_count <= 8) - return 16; + if (uniform_count <= 8) + return 16; - /* Otherwise, estimate the register pressure */ + /* Otherwise, estimate the register pressure */ - unsigned pressure = mir_estimate_pressure(ctx); + unsigned pressure = mir_estimate_pressure(ctx); - /* Prioritize not spilling above all else. The relation between the - * pressure estimate and the actual register pressure is a little - * murkier than we might like (due to scheduling, pipeline registers, - * failure to pack vector registers, load/store registers, texture - * registers...), hence why this is a heuristic parameter */ + /* Prioritize not spilling above all else. The relation between the + * pressure estimate and the actual register pressure is a little + * murkier than we might like (due to scheduling, pipeline registers, + * failure to pack vector registers, load/store registers, texture + * registers...), hence why this is a heuristic parameter */ - if (pressure > 6) - return 16; + if (pressure > 6) + return 16; - /* If there's no chance of spilling, prioritize UBOs and thread count */ + /* If there's no chance of spilling, prioritize UBOs and thread count */ - return 8; + return 8; } /* Bitset of indices that will be used as a special register -- inputs to a @@ -237,111 +237,113 @@ mir_work_heuristic(compiler_context *ctx, struct mir_ubo_analysis *analysis) static BITSET_WORD * mir_special_indices(compiler_context *ctx) { - mir_compute_temp_count(ctx); - BITSET_WORD *bset = calloc(BITSET_WORDS(ctx->temp_count), sizeof(BITSET_WORD)); + mir_compute_temp_count(ctx); + BITSET_WORD *bset = + calloc(BITSET_WORDS(ctx->temp_count), sizeof(BITSET_WORD)); - mir_foreach_instr_global(ctx, ins) { - /* Look for special instructions */ - bool is_ldst = ins->type == TAG_LOAD_STORE_4; - bool is_tex = ins->type == TAG_TEXTURE_4; - bool is_writeout = ins->compact_branch && ins->writeout; + mir_foreach_instr_global(ctx, ins) { + /* Look for special instructions */ + bool is_ldst = ins->type == TAG_LOAD_STORE_4; + bool is_tex = ins->type == TAG_TEXTURE_4; + bool is_writeout = ins->compact_branch && ins->writeout; - if (!(is_ldst || is_tex || is_writeout)) - continue; + if (!(is_ldst || is_tex || is_writeout)) + continue; - /* Anything read by a special instruction is itself special */ - mir_foreach_src(ins, i) { - unsigned idx = ins->src[i]; + /* Anything read by a special instruction is itself special */ + mir_foreach_src(ins, i) { + unsigned idx = ins->src[i]; - if (idx < ctx->temp_count) - BITSET_SET(bset, idx); - } - } + if (idx < ctx->temp_count) + BITSET_SET(bset, idx); + } + } - return bset; + return bset; } void midgard_promote_uniforms(compiler_context *ctx) { - if (ctx->inputs->no_ubo_to_push) { - /* If nothing is pushed, all UBOs need to be uploaded - * conventionally */ - ctx->ubo_mask = ~0; - return; - } + if (ctx->inputs->no_ubo_to_push) { + /* If nothing is pushed, all UBOs need to be uploaded + * conventionally */ + ctx->ubo_mask = ~0; + return; + } - struct mir_ubo_analysis analysis = mir_analyze_ranges(ctx); + struct mir_ubo_analysis analysis = mir_analyze_ranges(ctx); - unsigned work_count = 
mir_work_heuristic(ctx, &analysis); - unsigned promoted_count = 24 - work_count; + unsigned work_count = mir_work_heuristic(ctx, &analysis); + unsigned promoted_count = 24 - work_count; - /* Ensure we are 16 byte aligned to avoid underallocations */ - mir_pick_ubo(&ctx->info->push, &analysis, promoted_count); - ctx->info->push.count = ALIGN_POT(ctx->info->push.count, 4); + /* Ensure we are 16 byte aligned to avoid underallocations */ + mir_pick_ubo(&ctx->info->push, &analysis, promoted_count); + ctx->info->push.count = ALIGN_POT(ctx->info->push.count, 4); - /* First, figure out special indices a priori so we don't recompute a lot */ - BITSET_WORD *special = mir_special_indices(ctx); + /* First, figure out special indices a priori so we don't recompute a lot */ + BITSET_WORD *special = mir_special_indices(ctx); - ctx->ubo_mask = 0; + ctx->ubo_mask = 0; - mir_foreach_instr_global_safe(ctx, ins) { - if (!mir_is_ubo(ins)) continue; + mir_foreach_instr_global_safe(ctx, ins) { + if (!mir_is_ubo(ins)) + continue; - unsigned ubo = midgard_unpack_ubo_index_imm(ins->load_store); - unsigned qword = ins->constants.u32[0] / 16; + unsigned ubo = midgard_unpack_ubo_index_imm(ins->load_store); + unsigned qword = ins->constants.u32[0] / 16; - if (!mir_is_direct_aligned_ubo(ins)) { - if (ins->src[1] == ~0) - ctx->ubo_mask |= BITSET_BIT(ubo); - else - ctx->ubo_mask = ~0; + if (!mir_is_direct_aligned_ubo(ins)) { + if (ins->src[1] == ~0) + ctx->ubo_mask |= BITSET_BIT(ubo); + else + ctx->ubo_mask = ~0; - continue; - } + continue; + } - /* Check if we decided to push this */ - assert(ubo < analysis.nr_blocks); - if (!BITSET_TEST(analysis.blocks[ubo].pushed, qword)) { - ctx->ubo_mask |= BITSET_BIT(ubo); - continue; - } + /* Check if we decided to push this */ + assert(ubo < analysis.nr_blocks); + if (!BITSET_TEST(analysis.blocks[ubo].pushed, qword)) { + ctx->ubo_mask |= BITSET_BIT(ubo); + continue; + } - /* Find where we pushed to, TODO: unaligned pushes to pack */ - unsigned base = pan_lookup_pushed_ubo(&ctx->info->push, ubo, qword * 16); - assert((base & 0x3) == 0); + /* Find where we pushed to, TODO: unaligned pushes to pack */ + unsigned base = pan_lookup_pushed_ubo(&ctx->info->push, ubo, qword * 16); + assert((base & 0x3) == 0); - unsigned address = base / 4; - unsigned uniform_reg = 23 - address; + unsigned address = base / 4; + unsigned uniform_reg = 23 - address; - /* Should've taken into account when pushing */ - assert(address < promoted_count); - unsigned promoted = SSA_FIXED_REGISTER(uniform_reg); + /* Should've taken into account when pushing */ + assert(address < promoted_count); + unsigned promoted = SSA_FIXED_REGISTER(uniform_reg); - /* We do need the move for safety for a non-SSA dest, or if - * we're being fed into a special class */ + /* We do need the move for safety for a non-SSA dest, or if + * we're being fed into a special class */ - bool needs_move = ins->dest & PAN_IS_REG || ins->dest == ctx->blend_src1; + bool needs_move = ins->dest & PAN_IS_REG || ins->dest == ctx->blend_src1; - if (ins->dest < ctx->temp_count) - needs_move |= BITSET_TEST(special, ins->dest); + if (ins->dest < ctx->temp_count) + needs_move |= BITSET_TEST(special, ins->dest); - if (needs_move) { - unsigned type_size = nir_alu_type_get_type_size(ins->dest_type); - midgard_instruction mov = v_mov(promoted, ins->dest); - mov.dest_type = nir_type_uint | type_size; - mov.src_types[1] = mov.dest_type; + if (needs_move) { + unsigned type_size = nir_alu_type_get_type_size(ins->dest_type); + midgard_instruction mov = 
v_mov(promoted, ins->dest); + mov.dest_type = nir_type_uint | type_size; + mov.src_types[1] = mov.dest_type; - uint16_t rounded = mir_round_bytemask_up(mir_bytemask(ins), type_size); - mir_set_bytemask(&mov, rounded); - mir_insert_instruction_before(ctx, ins, mov); - } else { - mir_rewrite_index_src(ctx, ins->dest, promoted); - } + uint16_t rounded = mir_round_bytemask_up(mir_bytemask(ins), type_size); + mir_set_bytemask(&mov, rounded); + mir_insert_instruction_before(ctx, ins, mov); + } else { + mir_rewrite_index_src(ctx, ins->dest, promoted); + } - mir_remove_instruction(ins); - } + mir_remove_instruction(ins); + } - free(special); - free(analysis.blocks); + free(special); + free(analysis.blocks); } diff --git a/src/panfrost/midgard/mir_squeeze.c b/src/panfrost/midgard/mir_squeeze.c index aa230f836db..6eae34ef108 100644 --- a/src/panfrost/midgard/mir_squeeze.c +++ b/src/panfrost/midgard/mir_squeeze.c @@ -31,25 +31,23 @@ static unsigned find_or_allocate_temp(compiler_context *ctx, struct hash_table_u64 *map, - unsigned hash) + unsigned hash) { - if (hash >= SSA_FIXED_MINIMUM) - return hash; + if (hash >= SSA_FIXED_MINIMUM) + return hash; - unsigned temp = (uintptr_t) _mesa_hash_table_u64_search( - map, hash + 1); + unsigned temp = (uintptr_t)_mesa_hash_table_u64_search(map, hash + 1); - if (temp) - return temp - 1; + if (temp) + return temp - 1; - /* If no temp is find, allocate one */ - temp = ctx->temp_count++; - ctx->max_hash = MAX2(ctx->max_hash, hash); + /* If no temp is find, allocate one */ + temp = ctx->temp_count++; + ctx->max_hash = MAX2(ctx->max_hash, hash); - _mesa_hash_table_u64_insert(map, - hash + 1, (void *) ((uintptr_t) temp + 1)); + _mesa_hash_table_u64_insert(map, hash + 1, (void *)((uintptr_t)temp + 1)); - return temp; + return temp; } /* Reassigns numbering to get rid of gaps in the indices and to prioritize @@ -58,30 +56,30 @@ find_or_allocate_temp(compiler_context *ctx, struct hash_table_u64 *map, void mir_squeeze_index(compiler_context *ctx) { - struct hash_table_u64 *map = _mesa_hash_table_u64_create(NULL); + struct hash_table_u64 *map = _mesa_hash_table_u64_create(NULL); - /* Reset */ - ctx->temp_count = 0; + /* Reset */ + ctx->temp_count = 0; - /* We need to prioritize texture registers on older GPUs so we don't - * fail RA trying to assign to work registers r0/r1 when a work - * register is already there */ + /* We need to prioritize texture registers on older GPUs so we don't + * fail RA trying to assign to work registers r0/r1 when a work + * register is already there */ - mir_foreach_instr_global(ctx, ins) { - if (ins->type == TAG_TEXTURE_4) - ins->dest = find_or_allocate_temp(ctx, map, ins->dest); - } + mir_foreach_instr_global(ctx, ins) { + if (ins->type == TAG_TEXTURE_4) + ins->dest = find_or_allocate_temp(ctx, map, ins->dest); + } - mir_foreach_instr_global(ctx, ins) { - if (ins->type != TAG_TEXTURE_4) - ins->dest = find_or_allocate_temp(ctx, map, ins->dest); + mir_foreach_instr_global(ctx, ins) { + if (ins->type != TAG_TEXTURE_4) + ins->dest = find_or_allocate_temp(ctx, map, ins->dest); - for (unsigned i = 0; i < ARRAY_SIZE(ins->src); ++i) - ins->src[i] = find_or_allocate_temp(ctx, map, ins->src[i]); - } + for (unsigned i = 0; i < ARRAY_SIZE(ins->src); ++i) + ins->src[i] = find_or_allocate_temp(ctx, map, ins->src[i]); + } - ctx->blend_input = find_or_allocate_temp(ctx, map, ctx->blend_input); - ctx->blend_src1 = find_or_allocate_temp(ctx, map, ctx->blend_src1); + ctx->blend_input = find_or_allocate_temp(ctx, map, ctx->blend_input); + ctx->blend_src1 = 
find_or_allocate_temp(ctx, map, ctx->blend_src1); - _mesa_hash_table_u64_destroy(map); + _mesa_hash_table_u64_destroy(map); } diff --git a/src/panfrost/midgard/nir_fuse_io_16.c b/src/panfrost/midgard/nir_fuse_io_16.c index b97129f1c33..f4b052ea438 100644 --- a/src/panfrost/midgard/nir_fuse_io_16.c +++ b/src/panfrost/midgard/nir_fuse_io_16.c @@ -50,22 +50,24 @@ nir_fuse_io_16(nir_shader *shader) bool progress = false; nir_foreach_function(function, shader) { - if (!function->impl) continue; + if (!function->impl) + continue; nir_builder b; nir_builder_init(&b, function->impl); nir_foreach_block(block, function->impl) { nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) continue; + if (instr->type != nir_instr_type_intrinsic) + continue; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); if (intr->intrinsic != nir_intrinsic_load_interpolated_input) - continue; + continue; if (nir_dest_bit_size(intr->dest) != 32) - continue; + continue; /* We swizzle at a 32-bit level so need a multiple of 2. We could * do a bit better and handle even components though */ @@ -101,8 +103,8 @@ nir_fuse_io_16(nir_shader *shader) } } - nir_metadata_preserve(function->impl, nir_metadata_block_index | nir_metadata_dominance); - + nir_metadata_preserve(function->impl, + nir_metadata_block_index | nir_metadata_dominance); } return progress; diff --git a/src/panfrost/perf/pan_perf.c b/src/panfrost/perf/pan_perf.c index c543d7f0dbb..c35d0f87c02 100644 --- a/src/panfrost/perf/pan_perf.c +++ b/src/panfrost/perf/pan_perf.c @@ -23,12 +23,12 @@ #include "pan_perf.h" -#include -#include #include +#include +#include #define PAN_COUNTERS_PER_CATEGORY 64 -#define PAN_SHADER_CORE_INDEX 3 +#define PAN_SHADER_CORE_INDEX 3 uint32_t panfrost_perf_counter_read(const struct panfrost_perf_counter *counter, @@ -53,12 +53,12 @@ panfrost_perf_counter_read(const struct panfrost_perf_counter *counter, static const struct panfrost_perf_config * panfrost_lookup_counters(const char *name) { - for (unsigned i = 0; i < ARRAY_SIZE(panfrost_perf_configs); ++i) { - if (strcmp(panfrost_perf_configs[i]->name, name) == 0) - return panfrost_perf_configs[i]; - } + for (unsigned i = 0; i < ARRAY_SIZE(panfrost_perf_configs); ++i) { + if (strcmp(panfrost_perf_configs[i]->name, name) == 0) + return panfrost_perf_configs[i]; + } - return NULL; + return NULL; } void @@ -67,12 +67,12 @@ panfrost_perf_init(struct panfrost_perf *perf, struct panfrost_device *dev) perf->dev = dev; if (dev->model == NULL) - unreachable("Invalid GPU ID"); + unreachable("Invalid GPU ID"); perf->cfg = panfrost_lookup_counters(dev->model->performance_counters); if (perf->cfg == NULL) - unreachable("Performance counters missing!"); + unreachable("Performance counters missing!"); // Generally counter blocks are laid out in the following order: // Job manager, tiler, one or more L2 caches, and one or more shader cores. 
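The layout comment above suggests that the dump buffer behind counter_values is a flat array of 32-bit values arranged in 64-entry blocks (PAN_COUNTERS_PER_CATEGORY), one block per hardware unit, in the order job manager, tiler, L2 cache(s), shader core(s). A hedged sketch of reading one counter under that assumption; category_block, counter_index and n_instances are illustrative parameters for this sketch, not fields of the driver's panfrost_perf_counter:

#include <stdint.h>

#define SKETCH_COUNTERS_PER_BLOCK 64 /* mirrors PAN_COUNTERS_PER_CATEGORY */

/* category_block: index of the first 64-entry block for the category,
 * counter_index: position of the counter inside a block,
 * n_instances: how many consecutive blocks the category occupies
 * (e.g. one per shader core). */
static uint32_t
sketch_read_counter(const uint32_t *dump, unsigned category_block,
                    unsigned counter_index, unsigned n_instances)
{
   uint32_t total = 0;

   /* Sum the per-instance blocks so multi-core categories report a single
    * GPU-wide value. */
   for (unsigned i = 0; i < n_instances; ++i)
      total += dump[(category_block + i) * SKETCH_COUNTERS_PER_BLOCK +
                    counter_index];

   return total;
}

After a dump, a call along the lines of sketch_read_counter(perf->counter_values, shader_core_block, 7, core_count) would aggregate one shader-core counter across cores; the real panfrost_perf_counter_read() takes the counter description and the perf context instead, as its prototype above shows.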
@@ -92,7 +92,8 @@ static int panfrost_perf_query(struct panfrost_perf *perf, uint32_t enable) { struct drm_panfrost_perfcnt_enable perfcnt_enable = {enable, 0}; - return drmIoctl(perf->dev->fd, DRM_IOCTL_PANFROST_PERFCNT_ENABLE, &perfcnt_enable); + return drmIoctl(perf->dev->fd, DRM_IOCTL_PANFROST_PERFCNT_ENABLE, + &perfcnt_enable); } int @@ -110,7 +111,10 @@ panfrost_perf_disable(struct panfrost_perf *perf) int panfrost_perf_dump(struct panfrost_perf *perf) { - // Dump performance counter values to the memory buffer pointed to by counter_values - struct drm_panfrost_perfcnt_dump perfcnt_dump = {(uint64_t)(uintptr_t)perf->counter_values}; - return drmIoctl(perf->dev->fd, DRM_IOCTL_PANFROST_PERFCNT_DUMP, &perfcnt_dump); + // Dump performance counter values to the memory buffer pointed to by + // counter_values + struct drm_panfrost_perfcnt_dump perfcnt_dump = { + (uint64_t)(uintptr_t)perf->counter_values}; + return drmIoctl(perf->dev->fd, DRM_IOCTL_PANFROST_PERFCNT_DUMP, + &perfcnt_dump); } diff --git a/src/panfrost/perf/pan_perf.h b/src/panfrost/perf/pan_perf.h index 592dce71cea..4a40f2cc393 100644 --- a/src/panfrost/perf/pan_perf.h +++ b/src/panfrost/perf/pan_perf.h @@ -31,7 +31,7 @@ extern "C" { #endif #define PAN_PERF_MAX_CATEGORIES 4 -#define PAN_PERF_MAX_COUNTERS 64 +#define PAN_PERF_MAX_COUNTERS 64 struct panfrost_device; struct panfrost_perf_category; @@ -85,7 +85,7 @@ struct panfrost_perf_config { struct panfrost_perf { struct panfrost_device *dev; - const struct panfrost_perf_config* cfg; + const struct panfrost_perf_config *cfg; // Memory where to dump counter values uint32_t *counter_values; @@ -95,21 +95,17 @@ struct panfrost_perf { unsigned category_offset[PAN_PERF_MAX_CATEGORIES]; }; -uint32_t -panfrost_perf_counter_read(const struct panfrost_perf_counter *counter, - const struct panfrost_perf *perf); +uint32_t panfrost_perf_counter_read(const struct panfrost_perf_counter *counter, + const struct panfrost_perf *perf); -void -panfrost_perf_init(struct panfrost_perf *perf, struct panfrost_device *dev); +void panfrost_perf_init(struct panfrost_perf *perf, + struct panfrost_device *dev); -int -panfrost_perf_enable(struct panfrost_perf *perf); +int panfrost_perf_enable(struct panfrost_perf *perf); -int -panfrost_perf_disable(struct panfrost_perf *perf); +int panfrost_perf_disable(struct panfrost_perf *perf); -int -panfrost_perf_dump(struct panfrost_perf *perf); +int panfrost_perf_dump(struct panfrost_perf *perf); #if defined(__cplusplus) } // extern "C" diff --git a/src/panfrost/perf/quick.c b/src/panfrost/perf/quick.c index 286c7263f72..56513322306 100644 --- a/src/panfrost/perf/quick.c +++ b/src/panfrost/perf/quick.c @@ -2,51 +2,55 @@ #include #include "pan_perf.h" -int main(void) { - int fd = drmOpenWithType("panfrost", NULL, DRM_NODE_RENDER); +int +main(void) +{ + int fd = drmOpenWithType("panfrost", NULL, DRM_NODE_RENDER); - if (fd < 0) { - fprintf(stderr, "No panfrost device\n"); - exit(1); - } + if (fd < 0) { + fprintf(stderr, "No panfrost device\n"); + exit(1); + } - void *ctx = ralloc_context(NULL); - struct panfrost_perf *perf = rzalloc(ctx, struct panfrost_perf); + void *ctx = ralloc_context(NULL); + struct panfrost_perf *perf = rzalloc(ctx, struct panfrost_perf); - struct panfrost_device dev = {}; - panfrost_open_device(ctx, fd, &dev); + struct panfrost_device dev = {}; + panfrost_open_device(ctx, fd, &dev); - panfrost_perf_init(perf, &dev); - int ret = panfrost_perf_enable(perf); - - if (ret < 0) { - fprintf(stderr, "failed to enable counters (%d)\n", ret); - 
fprintf(stderr, "try `# echo Y > /sys/module/panfrost/parameters/unstable_ioctls`\n"); + panfrost_perf_init(perf, &dev); + int ret = panfrost_perf_enable(perf); - exit(1); - } + if (ret < 0) { + fprintf(stderr, "failed to enable counters (%d)\n", ret); + fprintf( + stderr, + "try `# echo Y > /sys/module/panfrost/parameters/unstable_ioctls`\n"); - sleep(1); + exit(1); + } - panfrost_perf_dump(perf); + sleep(1); - for (unsigned i = 0; i < perf->cfg->n_categories; ++i) { - const struct panfrost_perf_category *cat = &perf->cfg->categories[i]; - printf("%s\n", cat->name); + panfrost_perf_dump(perf); - for (unsigned j = 0; j < cat->n_counters; ++j) { - const struct panfrost_perf_counter *ctr = &cat->counters[j]; - uint32_t val = panfrost_perf_counter_read(ctr, perf); - printf("%s (%s): %u\n", ctr->name, ctr->symbol_name, val); - } + for (unsigned i = 0; i < perf->cfg->n_categories; ++i) { + const struct panfrost_perf_category *cat = &perf->cfg->categories[i]; + printf("%s\n", cat->name); - printf("\n"); - } + for (unsigned j = 0; j < cat->n_counters; ++j) { + const struct panfrost_perf_counter *ctr = &cat->counters[j]; + uint32_t val = panfrost_perf_counter_read(ctr, perf); + printf("%s (%s): %u\n", ctr->name, ctr->symbol_name, val); + } - if (panfrost_perf_disable(perf) < 0) { - fprintf(stderr, "failed to disable counters\n"); - exit(1); - } + printf("\n"); + } - panfrost_close_device(&dev); + if (panfrost_perf_disable(perf) < 0) { + fprintf(stderr, "failed to disable counters\n"); + exit(1); + } + + panfrost_close_device(&dev); } diff --git a/src/panfrost/shared/pan_minmax_cache.c b/src/panfrost/shared/pan_minmax_cache.c index be6f173893a..1da7ee51e1e 100644 --- a/src/panfrost/shared/pan_minmax_cache.c +++ b/src/panfrost/shared/pan_minmax_cache.c @@ -41,50 +41,51 @@ #include "pan_minmax_cache.h" bool -panfrost_minmax_cache_get(struct panfrost_minmax_cache *cache, unsigned start, unsigned count, - unsigned *min_index, unsigned *max_index) +panfrost_minmax_cache_get(struct panfrost_minmax_cache *cache, unsigned start, + unsigned count, unsigned *min_index, + unsigned *max_index) { - uint64_t ht_key = (((uint64_t)count) << 32) | start; - bool found = false; + uint64_t ht_key = (((uint64_t)count) << 32) | start; + bool found = false; - if (!cache) - return false; + if (!cache) + return false; - for (unsigned i = 0; i < cache->size; ++i) { - if (cache->keys[i] == ht_key) { - uint64_t hit = cache->values[i]; + for (unsigned i = 0; i < cache->size; ++i) { + if (cache->keys[i] == ht_key) { + uint64_t hit = cache->values[i]; - *min_index = hit & 0xffffffff; - *max_index = hit >> 32; - found = true; - break; - } - } + *min_index = hit & 0xffffffff; + *max_index = hit >> 32; + found = true; + break; + } + } - return found; + return found; } void -panfrost_minmax_cache_add(struct panfrost_minmax_cache *cache, unsigned start, unsigned count, - unsigned min_index, unsigned max_index) +panfrost_minmax_cache_add(struct panfrost_minmax_cache *cache, unsigned start, + unsigned count, unsigned min_index, + unsigned max_index) { - uint64_t ht_key = (((uint64_t)count) << 32) | start; - uint64_t value = min_index | (((uint64_t)max_index) << 32); - unsigned index = 0; + uint64_t ht_key = (((uint64_t)count) << 32) | start; + uint64_t value = min_index | (((uint64_t)max_index) << 32); + unsigned index = 0; - if (!cache) - return; + if (!cache) + return; - if (cache->size == PANFROST_MINMAX_SIZE) { - index = cache->index++; - cache->index = cache->index % PANFROST_MINMAX_SIZE; - } else { - index = cache->size++; - } - 
- cache->keys[index] = ht_key; - cache->values[index] = value; + if (cache->size == PANFROST_MINMAX_SIZE) { + index = cache->index++; + cache->index = cache->index % PANFROST_MINMAX_SIZE; + } else { + index = cache->size++; + } + cache->keys[index] = ht_key; + cache->values[index] = value; } /* If we've been caching min/max indices and we update the index @@ -92,32 +93,34 @@ panfrost_minmax_cache_add(struct panfrost_minmax_cache *cache, unsigned start, u * what we've written, and throw out invalid entries. */ void -panfrost_minmax_cache_invalidate(struct panfrost_minmax_cache *cache, struct pipe_transfer *transfer) +panfrost_minmax_cache_invalidate(struct panfrost_minmax_cache *cache, + struct pipe_transfer *transfer) { - /* Ensure there is a cache to invalidate and a write */ - if (!cache) - return; + /* Ensure there is a cache to invalidate and a write */ + if (!cache) + return; - if (!(transfer->usage & PIPE_MAP_WRITE)) - return; + if (!(transfer->usage & PIPE_MAP_WRITE)) + return; - unsigned valid_count = 0; + unsigned valid_count = 0; - for (unsigned i = 0; i < cache->size; ++i) { - uint64_t key = cache->keys[i]; + for (unsigned i = 0; i < cache->size; ++i) { + uint64_t key = cache->keys[i]; - uint32_t start = key & 0xffffffff; - uint32_t count = key >> 32; + uint32_t start = key & 0xffffffff; + uint32_t count = key >> 32; - /* 1D range intersection */ - bool invalid = MAX2(transfer->box.x, start) < MIN2(transfer->box.x + transfer->box.width, start + count); - if (!invalid) { - cache->keys[valid_count] = key; - cache->values[valid_count] = cache->values[i]; - valid_count++; - } - } + /* 1D range intersection */ + bool invalid = MAX2(transfer->box.x, start) < + MIN2(transfer->box.x + transfer->box.width, start + count); + if (!invalid) { + cache->keys[valid_count] = key; + cache->values[valid_count] = cache->values[i]; + valid_count++; + } + } - cache->size = valid_count; - cache->index = 0; + cache->size = valid_count; + cache->index = 0; } diff --git a/src/panfrost/shared/pan_minmax_cache.h b/src/panfrost/shared/pan_minmax_cache.h index fe264370ef3..651e6436cd6 100644 --- a/src/panfrost/shared/pan_minmax_cache.h +++ b/src/panfrost/shared/pan_minmax_cache.h @@ -32,21 +32,21 @@ #define PANFROST_MINMAX_SIZE 64 struct panfrost_minmax_cache { - uint64_t keys[PANFROST_MINMAX_SIZE]; - uint64_t values[PANFROST_MINMAX_SIZE]; - unsigned size; - unsigned index; + uint64_t keys[PANFROST_MINMAX_SIZE]; + uint64_t values[PANFROST_MINMAX_SIZE]; + unsigned size; + unsigned index; }; -bool -panfrost_minmax_cache_get(struct panfrost_minmax_cache *cache, unsigned start, unsigned count, - unsigned *min_index, unsigned *max_index); +bool panfrost_minmax_cache_get(struct panfrost_minmax_cache *cache, + unsigned start, unsigned count, + unsigned *min_index, unsigned *max_index); -void -panfrost_minmax_cache_add(struct panfrost_minmax_cache *cache, unsigned start, unsigned count, - unsigned min_index, unsigned max_index); +void panfrost_minmax_cache_add(struct panfrost_minmax_cache *cache, + unsigned start, unsigned count, + unsigned min_index, unsigned max_index); -void -panfrost_minmax_cache_invalidate(struct panfrost_minmax_cache *cache, struct pipe_transfer *transfer); +void panfrost_minmax_cache_invalidate(struct panfrost_minmax_cache *cache, + struct pipe_transfer *transfer); #endif diff --git a/src/panfrost/shared/pan_tiling.c b/src/panfrost/shared/pan_tiling.c index 40007c5c67b..d49c69d48ce 100644 --- a/src/panfrost/shared/pan_tiling.c +++ b/src/panfrost/shared/pan_tiling.c @@ -27,8 +27,8 @@ #include 
"pan_tiling.h" #include -#include "util/macros.h" #include "util/bitscan.h" +#include "util/macros.h" /* * This file implements software encode/decode of u-interleaved textures. @@ -105,8 +105,8 @@ const unsigned space_4[16] = { /* The scheme uses 16x16 tiles */ -#define TILE_WIDTH 16 -#define TILE_HEIGHT 16 +#define TILE_WIDTH 16 +#define TILE_HEIGHT 16 #define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT) /* We need a 128-bit type for idiomatically tiling bpp128 formats. The type must @@ -118,24 +118,24 @@ const unsigned space_4[16] = { typedef __uint128_t pan_uint128_t; #else typedef struct { - uint64_t lo; - uint64_t hi; + uint64_t lo; + uint64_t hi; } __attribute__((packed)) pan_uint128_t; #endif typedef struct { - uint16_t lo; - uint8_t hi; + uint16_t lo; + uint8_t hi; } __attribute__((packed)) pan_uint24_t; typedef struct { - uint32_t lo; - uint16_t hi; + uint32_t lo; + uint16_t hi; } __attribute__((packed)) pan_uint48_t; typedef struct { - uint64_t lo; - uint32_t hi; + uint64_t lo; + uint32_t hi; } __attribute__((packed)) pan_uint96_t; /* Optimized routine to tile an aligned (w & 0xF == 0) texture. Explanation: @@ -169,33 +169,29 @@ typedef struct { * be unrolled), calculating the index within the tile and writing. */ -#define TILED_ACCESS_TYPE(pixel_t, shift) \ -static ALWAYS_INLINE void \ -panfrost_access_tiled_image_##pixel_t \ - (void *dst, void *src, \ - uint16_t sx, uint16_t sy, \ - uint16_t w, uint16_t h, \ - uint32_t dst_stride, \ - uint32_t src_stride, \ - bool is_store) \ -{ \ - uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * sizeof(pixel_t)); \ - for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \ - uint8_t *dest = (uint8_t *) (dest_start + ((y >> 4) * dst_stride)); \ - pixel_t *source = src + (src_y * src_stride); \ - pixel_t *source_end = source + w; \ - unsigned expanded_y = bit_duplication[y & 0xF] << shift; \ - for (; source < source_end; dest += (PIXELS_PER_TILE << shift)) { \ - for (uint8_t i = 0; i < 16; ++i) { \ - unsigned index = expanded_y ^ (space_4[i] << shift); \ - if (is_store) \ - *((pixel_t *) (dest + index)) = *(source++); \ - else \ - *(source++) = *((pixel_t *) (dest + index)); \ - } \ - } \ - } \ -} \ +#define TILED_ACCESS_TYPE(pixel_t, shift) \ + static ALWAYS_INLINE void panfrost_access_tiled_image_##pixel_t( \ + void *dst, void *src, uint16_t sx, uint16_t sy, uint16_t w, uint16_t h, \ + uint32_t dst_stride, uint32_t src_stride, bool is_store) \ + { \ + uint8_t *dest_start = \ + dst + ((sx >> 4) * PIXELS_PER_TILE * sizeof(pixel_t)); \ + for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \ + uint8_t *dest = (uint8_t *)(dest_start + ((y >> 4) * dst_stride)); \ + pixel_t *source = src + (src_y * src_stride); \ + pixel_t *source_end = source + w; \ + unsigned expanded_y = bit_duplication[y & 0xF] << shift; \ + for (; source < source_end; dest += (PIXELS_PER_TILE << shift)) { \ + for (uint8_t i = 0; i < 16; ++i) { \ + unsigned index = expanded_y ^ (space_4[i] << shift); \ + if (is_store) \ + *((pixel_t *)(dest + index)) = *(source++); \ + else \ + *(source++) = *((pixel_t *)(dest + index)); \ + } \ + } \ + } \ + } TILED_ACCESS_TYPE(uint8_t, 0); TILED_ACCESS_TYPE(uint16_t, 1); @@ -203,44 +199,47 @@ TILED_ACCESS_TYPE(uint32_t, 2); TILED_ACCESS_TYPE(uint64_t, 3); TILED_ACCESS_TYPE(pan_uint128_t, 4); -#define TILED_UNALIGNED_TYPE(pixel_t, is_store, tile_shift) { \ - const unsigned mask = (1 << tile_shift) - 1; \ - for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \ - unsigned block_start_s = (y >> tile_shift) * dst_stride; \ - unsigned 
source_start = src_y * src_stride; \ - unsigned expanded_y = bit_duplication[y & mask]; \ - \ - for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) { \ - unsigned block_x_s = (x >> tile_shift) * (1 << (tile_shift * 2)); \ - unsigned index = expanded_y ^ space_4[x & mask]; \ - uint8_t *source = src + source_start + sizeof(pixel_t) * src_x; \ - uint8_t *dest = dst + block_start_s + sizeof(pixel_t) * (block_x_s + index); \ - \ - pixel_t *outp = (pixel_t *) (is_store ? dest : source); \ - pixel_t *inp = (pixel_t *) (is_store ? source : dest); \ - *outp = *inp; \ - } \ - } \ -} +#define TILED_UNALIGNED_TYPE(pixel_t, is_store, tile_shift) \ + { \ + const unsigned mask = (1 << tile_shift) - 1; \ + for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \ + unsigned block_start_s = (y >> tile_shift) * dst_stride; \ + unsigned source_start = src_y * src_stride; \ + unsigned expanded_y = bit_duplication[y & mask]; \ + \ + for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) { \ + unsigned block_x_s = (x >> tile_shift) * (1 << (tile_shift * 2)); \ + unsigned index = expanded_y ^ space_4[x & mask]; \ + uint8_t *source = src + source_start + sizeof(pixel_t) * src_x; \ + uint8_t *dest = \ + dst + block_start_s + sizeof(pixel_t) * (block_x_s + index); \ + \ + pixel_t *outp = (pixel_t *)(is_store ? dest : source); \ + pixel_t *inp = (pixel_t *)(is_store ? source : dest); \ + *outp = *inp; \ + } \ + } \ + } -#define TILED_UNALIGNED_TYPES(store, shift) { \ - if (bpp == 8) \ - TILED_UNALIGNED_TYPE(uint8_t, store, shift) \ - else if (bpp == 16) \ - TILED_UNALIGNED_TYPE(uint16_t, store, shift) \ - else if (bpp == 24) \ - TILED_UNALIGNED_TYPE(pan_uint24_t, store, shift) \ - else if (bpp == 32) \ - TILED_UNALIGNED_TYPE(uint32_t, store, shift) \ - else if (bpp == 48) \ - TILED_UNALIGNED_TYPE(pan_uint48_t, store, shift) \ - else if (bpp == 64) \ - TILED_UNALIGNED_TYPE(uint64_t, store, shift) \ - else if (bpp == 96) \ - TILED_UNALIGNED_TYPE(pan_uint96_t, store, shift) \ - else if (bpp == 128) \ - TILED_UNALIGNED_TYPE(pan_uint128_t, store, shift) \ -} +#define TILED_UNALIGNED_TYPES(store, shift) \ + { \ + if (bpp == 8) \ + TILED_UNALIGNED_TYPE(uint8_t, store, shift) \ + else if (bpp == 16) \ + TILED_UNALIGNED_TYPE(uint16_t, store, shift) \ + else if (bpp == 24) \ + TILED_UNALIGNED_TYPE(pan_uint24_t, store, shift) \ + else if (bpp == 32) \ + TILED_UNALIGNED_TYPE(uint32_t, store, shift) \ + else if (bpp == 48) \ + TILED_UNALIGNED_TYPE(pan_uint48_t, store, shift) \ + else if (bpp == 64) \ + TILED_UNALIGNED_TYPE(uint64_t, store, shift) \ + else if (bpp == 96) \ + TILED_UNALIGNED_TYPE(pan_uint96_t, store, shift) \ + else if (bpp == 128) \ + TILED_UNALIGNED_TYPE(pan_uint128_t, store, shift) \ + } /* * Perform a generic access to a tiled image with a given format. This works @@ -249,13 +248,11 @@ TILED_ACCESS_TYPE(pan_uint128_t, 4); * so we divide here. Alignment is assumed. 
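Both the aligned fast path and the unaligned macro above compute the position inside a 16x16 tile as bit_duplication[y & 0xF] ^ space_4[x & 0xF]. Assuming bit_duplication[] copies each y bit into an adjacent even/odd pair and space_4[] spreads the x bits onto the even positions, which is the order the reference u_order() in the tiling test spells out bit by bit, the XOR leaves x ^ y on the even bits and y on the odd bits. A small sketch of that addressing in direct bit form rather than via the lookup tables; sketch_u_order and sketch_tiled_offset are illustrative names:

/* Bit-level form of the within-tile index used by the macros above:
 * even bits hold x ^ y, odd bits hold y. Illustrative sketch only. */
unsigned
sketch_u_order(unsigned x, unsigned y)
{
   unsigned index = 0;

   for (unsigned b = 0; b < 4; ++b) {
      unsigned xy = ((x >> b) ^ (y >> b)) & 1;
      unsigned yb = (y >> b) & 1;

      index |= xy << (2 * b);     /* even bits: x ^ y */
      index |= yb << (2 * b + 1); /* odd bits: y */
   }

   return index; /* 0..255 inside the 16x16 tile */
}

/* Byte offset of pixel (x, y) in a u-interleaved image, for a power-of-two
 * bytes-per-pixel and dst_stride bytes between rows of tiles, mirroring how
 * the aligned fast path advances dest_start and dest. */
unsigned
sketch_tiled_offset(unsigned x, unsigned y, unsigned dst_stride,
                    unsigned bytes_per_pixel)
{
   unsigned tile_x = x >> 4;
   unsigned tile_y = y >> 4;

   return tile_y * dst_stride +             /* row of 16-pixel-tall tiles */
          tile_x * 256 * bytes_per_pixel +  /* 16x16 pixels per tile */
          sketch_u_order(x & 0xF, y & 0xF) * bytes_per_pixel;
}

The table-driven macros above cover the aligned, power-of-two cases; the generic routine that follows handles everything else, including block-compressed formats, one block at a time.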
*/ static void -panfrost_access_tiled_image_generic(void *dst, void *src, - unsigned sx, unsigned sy, - unsigned w, unsigned h, - uint32_t dst_stride, - uint32_t src_stride, - const struct util_format_description *desc, - bool _is_store) +panfrost_access_tiled_image_generic(void *dst, void *src, unsigned sx, + unsigned sy, unsigned w, unsigned h, + uint32_t dst_stride, uint32_t src_stride, + const struct util_format_description *desc, + bool _is_store) { unsigned bpp = desc->block.bits; @@ -278,16 +275,15 @@ panfrost_access_tiled_image_generic(void *dst, void *src, } } -#define OFFSET(src, _x, _y) (void *) ((uint8_t *) src + ((_y) - orig_y) * src_stride + (((_x) - orig_x) * (bpp / 8))) +#define OFFSET(src, _x, _y) \ + (void *)((uint8_t *)src + ((_y)-orig_y) * src_stride + \ + (((_x)-orig_x) * (bpp / 8))) static ALWAYS_INLINE void -panfrost_access_tiled_image(void *dst, void *src, - unsigned x, unsigned y, - unsigned w, unsigned h, - uint32_t dst_stride, - uint32_t src_stride, - enum pipe_format format, - bool is_store) +panfrost_access_tiled_image(void *dst, void *src, unsigned x, unsigned y, + unsigned w, unsigned h, uint32_t dst_stride, + uint32_t src_stride, enum pipe_format format, + bool is_store) { const struct util_format_description *desc = util_format_description(format); unsigned bpp = desc->block.bits; @@ -300,10 +296,10 @@ panfrost_access_tiled_image(void *dst, void *src, assert((dst_stride % (bpp / 8)) == 0 && "unaligned destination stride"); assert((src_stride % (bpp / 8)) == 0 && "unaligned source stride"); - if (desc->block.width > 1 || !util_is_power_of_two_nonzero(desc->block.bits)) { - panfrost_access_tiled_image_generic(dst, (void *) src, - x, y, w, h, - dst_stride, src_stride, desc, is_store); + if (desc->block.width > 1 || + !util_is_power_of_two_nonzero(desc->block.bits)) { + panfrost_access_tiled_image_generic( + dst, (void *)src, x, y, w, h, dst_stride, src_stride, desc, is_store); return; } @@ -320,9 +316,9 @@ panfrost_access_tiled_image(void *dst, void *src, if (first_full_tile_y != y) { unsigned dist = MIN2(first_full_tile_y - y, h); - panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y), - x, y, w, dist, - dst_stride, src_stride, desc, is_store); + panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y), x, y, w, dist, + dst_stride, src_stride, desc, + is_store); if (dist == h) return; @@ -335,9 +331,9 @@ panfrost_access_tiled_image(void *dst, void *src, if (last_full_tile_y != (y + h)) { unsigned dist = (y + h) - last_full_tile_y; - panfrost_access_tiled_image_generic(dst, OFFSET(src, x, last_full_tile_y), - x, last_full_tile_y, w, dist, - dst_stride, src_stride, desc, is_store); + panfrost_access_tiled_image_generic( + dst, OFFSET(src, x, last_full_tile_y), x, last_full_tile_y, w, dist, + dst_stride, src_stride, desc, is_store); h -= dist; } @@ -346,9 +342,9 @@ panfrost_access_tiled_image(void *dst, void *src, if (first_full_tile_x != x) { unsigned dist = MIN2(first_full_tile_x - x, w); - panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y), - x, y, dist, h, - dst_stride, src_stride, desc, is_store); + panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y), x, y, dist, h, + dst_stride, src_stride, desc, + is_store); if (dist == w) return; @@ -361,23 +357,28 @@ panfrost_access_tiled_image(void *dst, void *src, if (last_full_tile_x != (x + w)) { unsigned dist = (x + w) - last_full_tile_x; - panfrost_access_tiled_image_generic(dst, OFFSET(src, last_full_tile_x, y), - last_full_tile_x, y, dist, h, - dst_stride, src_stride, desc, is_store); + 
panfrost_access_tiled_image_generic( + dst, OFFSET(src, last_full_tile_x, y), last_full_tile_x, y, dist, h, + dst_stride, src_stride, desc, is_store); w -= dist; } if (bpp == 8) - panfrost_access_tiled_image_uint8_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store); + panfrost_access_tiled_image_uint8_t(dst, OFFSET(src, x, y), x, y, w, h, + dst_stride, src_stride, is_store); else if (bpp == 16) - panfrost_access_tiled_image_uint16_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store); + panfrost_access_tiled_image_uint16_t(dst, OFFSET(src, x, y), x, y, w, h, + dst_stride, src_stride, is_store); else if (bpp == 32) - panfrost_access_tiled_image_uint32_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store); + panfrost_access_tiled_image_uint32_t(dst, OFFSET(src, x, y), x, y, w, h, + dst_stride, src_stride, is_store); else if (bpp == 64) - panfrost_access_tiled_image_uint64_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store); + panfrost_access_tiled_image_uint64_t(dst, OFFSET(src, x, y), x, y, w, h, + dst_stride, src_stride, is_store); else if (bpp == 128) - panfrost_access_tiled_image_pan_uint128_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store); + panfrost_access_tiled_image_pan_uint128_t( + dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store); } /** @@ -386,27 +387,19 @@ panfrost_access_tiled_image(void *dst, void *src, * are aligned to the block size. */ void -panfrost_store_tiled_image(void *dst, const void *src, - unsigned x, unsigned y, - unsigned w, unsigned h, - uint32_t dst_stride, - uint32_t src_stride, - enum pipe_format format) +panfrost_store_tiled_image(void *dst, const void *src, unsigned x, unsigned y, + unsigned w, unsigned h, uint32_t dst_stride, + uint32_t src_stride, enum pipe_format format) { - panfrost_access_tiled_image(dst, (void *) src, - x, y, w, h, - dst_stride, src_stride, format, true); + panfrost_access_tiled_image(dst, (void *)src, x, y, w, h, dst_stride, + src_stride, format, true); } void -panfrost_load_tiled_image(void *dst, const void *src, - unsigned x, unsigned y, - unsigned w, unsigned h, - uint32_t dst_stride, - uint32_t src_stride, - enum pipe_format format) +panfrost_load_tiled_image(void *dst, const void *src, unsigned x, unsigned y, + unsigned w, unsigned h, uint32_t dst_stride, + uint32_t src_stride, enum pipe_format format) { - panfrost_access_tiled_image((void *) src, dst, - x, y, w, h, - src_stride, dst_stride, format, false); + panfrost_access_tiled_image((void *)src, dst, x, y, w, h, src_stride, + dst_stride, format, false); } diff --git a/src/panfrost/shared/pan_tiling.h b/src/panfrost/shared/pan_tiling.h index d63c581edef..c8734057bc5 100644 --- a/src/panfrost/shared/pan_tiling.h +++ b/src/panfrost/shared/pan_tiling.h @@ -47,11 +47,9 @@ extern "C" { * @src_stride Number of bytes between adjacent rows of tiles in source. 
* @format Format of the source and destination image */ -void panfrost_load_tiled_image(void *dst, const void *src, - unsigned x, unsigned y, - unsigned w, unsigned h, - uint32_t dst_stride, - uint32_t src_stride, +void panfrost_load_tiled_image(void *dst, const void *src, unsigned x, + unsigned y, unsigned w, unsigned h, + uint32_t dst_stride, uint32_t src_stride, enum pipe_format format); /** @@ -67,14 +65,11 @@ void panfrost_load_tiled_image(void *dst, const void *src, * @src_stride Stride in bytes of linear source * @format Format of the source and destination image */ -void panfrost_store_tiled_image(void *dst, const void *src, - unsigned x, unsigned y, - unsigned w, unsigned h, - uint32_t dst_stride, - uint32_t src_stride, +void panfrost_store_tiled_image(void *dst, const void *src, unsigned x, + unsigned y, unsigned w, unsigned h, + uint32_t dst_stride, uint32_t src_stride, enum pipe_format format); - #ifdef __cplusplus } /* extern C */ #endif diff --git a/src/panfrost/shared/test/test-tiling.cpp b/src/panfrost/shared/test/test-tiling.cpp index d5ad9e31c7f..8ee4fbf2172 100644 --- a/src/panfrost/shared/test/test-tiling.cpp +++ b/src/panfrost/shared/test/test-tiling.cpp @@ -45,13 +45,14 @@ u_order(unsigned x, unsigned y) unsigned y2 = (y & 4) ? 1 : 0; unsigned y3 = (y & 8) ? 1 : 0; - return (xy0 << 0) | (y0 << 1) | (xy1 << 2) | (y1 << 3) | - (xy2 << 4) | (y2 << 5) | (xy3 << 6) | (y3 << 7); + return (xy0 << 0) | (y0 << 1) | (xy1 << 2) | (y1 << 3) | (xy2 << 4) | + (y2 << 5) | (xy3 << 6) | (y3 << 7); } /* x/y are in blocks */ static unsigned -tiled_offset(unsigned x, unsigned y, unsigned stride, unsigned tilesize, unsigned blocksize) +tiled_offset(unsigned x, unsigned y, unsigned stride, unsigned tilesize, + unsigned blocksize) { unsigned tile_x = x / tilesize; unsigned tile_y = y / tilesize; @@ -75,15 +76,13 @@ linear_offset(unsigned x, unsigned y, unsigned stride, unsigned blocksize) } static void -ref_access_tiled(void *dst, const void *src, - unsigned region_x, unsigned region_y, - unsigned w, unsigned h, - uint32_t dst_stride, - uint32_t src_stride, - enum pipe_format format, +ref_access_tiled(void *dst, const void *src, unsigned region_x, + unsigned region_y, unsigned w, unsigned h, uint32_t dst_stride, + uint32_t src_stride, enum pipe_format format, bool dst_is_tiled) { - const struct util_format_description *desc = util_format_description(format);; + const struct util_format_description *desc = util_format_description(format); + ; unsigned tilesize = (desc->block.width > 1) ? 
4 : 16; unsigned blocksize = (desc->block.bits / 8); @@ -94,8 +93,10 @@ ref_access_tiled(void *dst, const void *src, unsigned region_x_block = region_x / desc->block.width; unsigned region_y_block = region_y / desc->block.height; - for (unsigned linear_y_block = 0; linear_y_block < h_block; ++linear_y_block) { - for (unsigned linear_x_block = 0; linear_x_block < w_block; ++linear_x_block) { + for (unsigned linear_y_block = 0; linear_y_block < h_block; + ++linear_y_block) { + for (unsigned linear_x_block = 0; linear_x_block < w_block; + ++linear_x_block) { unsigned tiled_x_block = region_x_block + linear_x_block; unsigned tiled_y_block = region_y_block + linear_y_block; @@ -103,15 +104,18 @@ ref_access_tiled(void *dst, const void *src, unsigned dst_offset, src_offset; if (dst_is_tiled) { - dst_offset = tiled_offset(tiled_x_block, tiled_y_block, dst_stride, tilesize, blocksize); - src_offset = linear_offset(linear_x_block, linear_y_block, src_stride, blocksize); + dst_offset = tiled_offset(tiled_x_block, tiled_y_block, dst_stride, + tilesize, blocksize); + src_offset = linear_offset(linear_x_block, linear_y_block, + src_stride, blocksize); } else { - dst_offset = linear_offset(linear_x_block, linear_y_block, dst_stride, blocksize); - src_offset = tiled_offset(tiled_x_block, tiled_y_block, src_stride, tilesize, blocksize); + dst_offset = linear_offset(linear_x_block, linear_y_block, + dst_stride, blocksize); + src_offset = tiled_offset(tiled_x_block, tiled_y_block, src_stride, + tilesize, blocksize); } - memcpy((uint8_t *) dst + dst_offset, - (const uint8_t *) src + src_offset, + memcpy((uint8_t *)dst + dst_offset, (const uint8_t *)src + src_offset, desc->block.bits / 8); } } @@ -123,14 +127,13 @@ ref_access_tiled(void *dst, const void *src, * production. */ static void -test(unsigned width, unsigned height, unsigned rx, unsigned ry, - unsigned rw, unsigned rh, unsigned linear_stride, - enum pipe_format format, bool store) +test(unsigned width, unsigned height, unsigned rx, unsigned ry, unsigned rw, + unsigned rh, unsigned linear_stride, enum pipe_format format, bool store) { unsigned bpp = util_format_get_blocksize(format); unsigned tile_height = util_format_is_compressed(format) ? 4 : 16; - unsigned tiled_width = ALIGN_POT(width, 16); + unsigned tiled_width = ALIGN_POT(width, 16); unsigned tiled_height = ALIGN_POT(height, 16); unsigned tiled_stride = tiled_width * tile_height * bpp; @@ -139,26 +142,27 @@ test(unsigned width, unsigned height, unsigned rx, unsigned ry, void *tiled = calloc(bpp, tiled_width * tiled_height); void *linear = calloc(bpp, rw * linear_stride); - void *ref = calloc(bpp, store ? (tiled_width * tiled_height) : (rw * linear_stride)); + void *ref = + calloc(bpp, store ? (tiled_width * tiled_height) : (rw * linear_stride)); if (store) { for (unsigned i = 0; i < bpp * rw * linear_stride; ++i) { - ((uint8_t *) linear)[i] = (i & 0xFF); + ((uint8_t *)linear)[i] = (i & 0xFF); } - panfrost_store_tiled_image(tiled, linear, rx, ry, rw, rh, - dst_stride, src_stride, format); + panfrost_store_tiled_image(tiled, linear, rx, ry, rw, rh, dst_stride, + src_stride, format); } else { for (unsigned i = 0; i < bpp * tiled_width * tiled_height; ++i) { - ((uint8_t *) tiled)[i] = (i & 0xFF); + ((uint8_t *)tiled)[i] = (i & 0xFF); } - panfrost_load_tiled_image(linear, tiled, rx, ry, rw, rh, - dst_stride, src_stride, format); + panfrost_load_tiled_image(linear, tiled, rx, ry, rw, rh, dst_stride, + src_stride, format); } - ref_access_tiled(ref, store ? 
linear : tiled, rx, ry, rw, rh, - dst_stride, src_stride, format, store); + ref_access_tiled(ref, store ? linear : tiled, rx, ry, rw, rh, dst_stride, + src_stride, format, store); if (store) EXPECT_EQ(memcmp(ref, tiled, bpp * tiled_width * tiled_height), 0); @@ -273,7 +277,7 @@ TEST(UInterleavedTiling, ASTC) TEST(UInterleavedTiling, PartialASTC) { /* Block alignment assumed */ - test_ldst(40, 40, 4, 4, 16, 8, 512, PIPE_FORMAT_ASTC_4x4); - test_ldst(50, 40, 5, 4, 10, 8, 512, PIPE_FORMAT_ASTC_5x4); + test_ldst(40, 40, 4, 4, 16, 8, 512, PIPE_FORMAT_ASTC_4x4); + test_ldst(50, 40, 5, 4, 10, 8, 512, PIPE_FORMAT_ASTC_5x4); test_ldst(50, 50, 5, 5, 10, 10, 512, PIPE_FORMAT_ASTC_5x5); } diff --git a/src/panfrost/tools/panfrost_texfeatures.c b/src/panfrost/tools/panfrost_texfeatures.c index 67872e56697..f64a962aec0 100644 --- a/src/panfrost/tools/panfrost_texfeatures.c +++ b/src/panfrost/tools/panfrost_texfeatures.c @@ -12,36 +12,39 @@ * Malis should be similar. */ struct format { - unsigned bit; - const char *name; + unsigned bit; + const char *name; }; -#define FMT(bit, name) { bit, name ":" } +#define FMT(bit, name) \ + { \ + bit, name ":" \ + } static struct format formats[] = { - FMT( 1, "ETC2"), - FMT( 3, "ETC2 EAC"), - FMT(19, "ETC2 PTA"), - FMT( 2, "EAC 1"), - FMT( 4, "EAC 2"), - FMT(17, "EAC snorm 1"), - FMT(18, "EAC snorm 2"), - { 0, NULL }, - FMT(20, "ASTC 3D LDR"), - FMT(21, "ASTC 3D HDR"), - FMT(22, "ASTC 2D LDR"), - FMT(23, "ASTC 3D HDR"), - { 0, NULL }, - FMT( 7, "BC1"), - FMT( 8, "BC2"), - FMT( 9, "BC3"), - FMT(10, "BC4 unorm"), - FMT(11, "BC4 snorm"), - FMT(12, "BC5 unorm"), - FMT(13, "BC5 snorm"), - FMT(14, "BC6H UF16"), - FMT(15, "BC6H SF16"), - FMT(16, "BC7"), + FMT(1, "ETC2"), + FMT(3, "ETC2 EAC"), + FMT(19, "ETC2 PTA"), + FMT(2, "EAC 1"), + FMT(4, "EAC 2"), + FMT(17, "EAC snorm 1"), + FMT(18, "EAC snorm 2"), + {0, NULL}, + FMT(20, "ASTC 3D LDR"), + FMT(21, "ASTC 3D HDR"), + FMT(22, "ASTC 2D LDR"), + FMT(23, "ASTC 3D HDR"), + {0, NULL}, + FMT(7, "BC1"), + FMT(8, "BC2"), + FMT(9, "BC3"), + FMT(10, "BC4 unorm"), + FMT(11, "BC4 snorm"), + FMT(12, "BC5 unorm"), + FMT(13, "BC5 snorm"), + FMT(14, "BC6H UF16"), + FMT(15, "BC6H SF16"), + FMT(16, "BC7"), }; /* ANSI escape code */ @@ -49,44 +52,47 @@ static struct format formats[] = { #define RED(x) "\033[31m" x RESET #define GREEN(x) "\033[32m" x RESET -int main(void) { - int fd = drmOpenWithType("panfrost", NULL, DRM_NODE_RENDER); - if (fd < 0) { - fprintf(stderr, "No panfrost device\n"); - exit(1); - } +int +main(void) +{ + int fd = drmOpenWithType("panfrost", NULL, DRM_NODE_RENDER); + if (fd < 0) { + fprintf(stderr, "No panfrost device\n"); + exit(1); + } - void *ctx = ralloc_context(NULL); - struct panfrost_device dev = { 0 }; - panfrost_open_device(ctx, fd, &dev); + void *ctx = ralloc_context(NULL); + struct panfrost_device dev = {0}; + panfrost_open_device(ctx, fd, &dev); - uint32_t supported = dev.compressed_formats; - bool all_ok = true; + uint32_t supported = dev.compressed_formats; + bool all_ok = true; - printf("System-on-chip compressed texture support:" "\n\n"); + printf("System-on-chip compressed texture support:" + "\n\n"); - for (unsigned i = 0; i < ARRAY_SIZE(formats); ++i) { - if (formats[i].name == NULL) { - printf("\n"); - continue; - } + for (unsigned i = 0; i < ARRAY_SIZE(formats); ++i) { + if (formats[i].name == NULL) { + printf("\n"); + continue; + } - /* Maximum length for justification */ - assert(strlen(formats[i].name) <= 12); + /* Maximum length for justification */ + assert(strlen(formats[i].name) <= 12); - 
bool ok = (supported & BITFIELD_BIT(formats[i].bit)); - all_ok &= ok; + bool ok = (supported & BITFIELD_BIT(formats[i].bit)); + all_ok &= ok; - printf("%-14s %s\n", formats[i].name, - ok ? GREEN("YES") : RED(" NO")); - } + printf("%-14s %s\n", formats[i].name, ok ? GREEN("YES") : RED(" NO")); + } - if (!all_ok) { - printf("\n" - "This system-on-chip lacks support for some formats. This is not a driver bug.\n" - "Unsupported formats will be emulated at a performance and memory cost.\n"); - } + if (!all_ok) { + printf( + "\n" + "This system-on-chip lacks support for some formats. This is not a driver bug.\n" + "Unsupported formats will be emulated at a performance and memory cost.\n"); + } - panfrost_close_device(&dev); - ralloc_free(ctx); + panfrost_close_device(&dev); + ralloc_free(ctx); } diff --git a/src/panfrost/tools/panfrostdump.c b/src/panfrost/tools/panfrostdump.c index b61db132912..d204866573a 100644 --- a/src/panfrost/tools/panfrostdump.c +++ b/src/panfrost/tools/panfrostdump.c @@ -37,16 +37,16 @@ * or times out after 5min) */ -#include -#include -#include -#include -#include -#include -#include #include +#include #include #include +#include +#include +#include +#include +#include +#include #include @@ -81,7 +81,7 @@ struct panfrost_dump_object_header_ho { }; #define MAX_BODUMP_FILENAME 32 -#define GPU_PAGE_SIZE 4096 +#define GPU_PAGE_SIZE 4096 static bool read_header(FILE *fp, struct panfrost_dump_object_header_ho *pdoh) @@ -109,7 +109,7 @@ read_header(FILE *fp, struct panfrost_dump_object_header_ho *pdoh) pdoh->file_offset = le32toh(doh_le.file_offset); pdoh->file_size = le32toh(doh_le.file_size); - switch(pdoh->type) { + switch (pdoh->type) { case PANFROSTDUMP_BUF_REG: pdoh->reghdr.jc = le64toh(doh_le.reghdr.jc); pdoh->reghdr.gpu_id = le32toh(doh_le.reghdr.gpu_id); @@ -234,7 +234,7 @@ main(int argc, char *argv[]) /* clang-format on */ while ((c = getopt_long(argc, argv, "arh", longopts, NULL)) != -1) { - switch(c) { + switch (c) { case 'h': print_help(argv[0], stderr); return EXIT_SUCCESS; @@ -326,7 +326,8 @@ main(int argc, char *argv[]) } if (print_addr) { - printf("BO(%u) VA(%"PRIX64") SZ(%"PRIX32") page addresses:\n", + printf("BO(%u) VA(%" PRIX64 ") SZ(%" PRIX32 + ") page addresses:\n", j, doh.bomap.iova, doh.file_size); for (k = 0; k < (doh.file_size / GPU_PAGE_SIZE); k++) { @@ -343,8 +344,7 @@ main(int argc, char *argv[]) char bodump_filename[MAX_BODUMP_FILENAME]; FILE *bodump; - snprintf(bodump_filename, MAX_BODUMP_FILENAME, - "bodump-%u.dump", j); + snprintf(bodump_filename, MAX_BODUMP_FILENAME, "bodump-%u.dump", j); if ((bodump = fopen(bodump_filename, "wb"))) { if (fseek(data_fp, doh.file_offset, SEEK_SET)) { @@ -367,16 +367,14 @@ main(int argc, char *argv[]) } nbytes = fwrite(bos[j], 1, doh.file_size, bodump); if (nbytes < doh.file_size) { - fprintf(stderr, - "Failed to write BO contents into file: %u\n", + fprintf(stderr, "Failed to write BO contents into file: %u\n", errno); return EXIT_FAILURE; } fclose(bodump); - pandecode_inject_mmap(doh.bomap.iova, - bos[j],doh.file_size, + pandecode_inject_mmap(doh.bomap.iova, bos[j], doh.file_size, NULL); } else { diff --git a/src/panfrost/util/lcra.c b/src/panfrost/util/lcra.c index 45cff37e61f..00585c646a6 100644 --- a/src/panfrost/util/lcra.c +++ b/src/panfrost/util/lcra.c @@ -24,14 +24,14 @@ * Alyssa Rosenzweig */ -#include +#include "lcra.h" #include +#include +#include #include #include -#include #include "util/macros.h" #include "util/u_math.h" -#include "lcra.h" /* This module is the reference implementation of 
"Linearly Constrained * Register Allocation". The paper is available in PDF form @@ -40,161 +40,168 @@ */ struct lcra_state * -lcra_alloc_equations( - unsigned node_count, unsigned class_count) +lcra_alloc_equations(unsigned node_count, unsigned class_count) { - struct lcra_state *l = calloc(1, sizeof(*l)); + struct lcra_state *l = calloc(1, sizeof(*l)); - l->node_count = node_count; - l->class_count = class_count; + l->node_count = node_count; + l->class_count = class_count; - l->alignment = calloc(sizeof(l->alignment[0]), node_count); - l->linear = calloc(sizeof(l->linear[0]), node_count * node_count); - l->modulus = calloc(sizeof(l->modulus[0]), node_count); - l->class = calloc(sizeof(l->class[0]), node_count); - l->class_start = calloc(sizeof(l->class_start[0]), class_count); - l->class_disjoint = calloc(sizeof(l->class_disjoint[0]), class_count * class_count); - l->class_size = calloc(sizeof(l->class_size[0]), class_count); - l->spill_cost = calloc(sizeof(l->spill_cost[0]), node_count); - l->solutions = calloc(sizeof(l->solutions[0]), node_count); + l->alignment = calloc(sizeof(l->alignment[0]), node_count); + l->linear = calloc(sizeof(l->linear[0]), node_count * node_count); + l->modulus = calloc(sizeof(l->modulus[0]), node_count); + l->class = calloc(sizeof(l->class[0]), node_count); + l->class_start = calloc(sizeof(l->class_start[0]), class_count); + l->class_disjoint = + calloc(sizeof(l->class_disjoint[0]), class_count * class_count); + l->class_size = calloc(sizeof(l->class_size[0]), class_count); + l->spill_cost = calloc(sizeof(l->spill_cost[0]), node_count); + l->solutions = calloc(sizeof(l->solutions[0]), node_count); - memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count); + memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count); - return l; + return l; } void lcra_free(struct lcra_state *l) { - if (!l) - return; + if (!l) + return; - free(l->alignment); - free(l->linear); - free(l->modulus); - free(l->class); - free(l->class_start); - free(l->class_disjoint); - free(l->class_size); - free(l->spill_cost); - free(l->solutions); + free(l->alignment); + free(l->linear); + free(l->modulus); + free(l->class); + free(l->class_start); + free(l->class_disjoint); + free(l->class_size); + free(l->spill_cost); + free(l->solutions); - free(l); + free(l); } void -lcra_set_alignment(struct lcra_state *l, unsigned node, unsigned align_log2, unsigned bound) +lcra_set_alignment(struct lcra_state *l, unsigned node, unsigned align_log2, + unsigned bound) { - l->alignment[node] = (align_log2 + 1) | (bound << 16); + l->alignment[node] = (align_log2 + 1) | (bound << 16); } void lcra_set_disjoint_class(struct lcra_state *l, unsigned c1, unsigned c2) { - l->class_disjoint[(c1 * l->class_count) + c2] = true; - l->class_disjoint[(c2 * l->class_count) + c1] = true; + l->class_disjoint[(c1 * l->class_count) + c2] = true; + l->class_disjoint[(c2 * l->class_count) + c1] = true; } void lcra_restrict_range(struct lcra_state *l, unsigned node, unsigned len) { - if (node < l->node_count && l->alignment[node]) { - unsigned BA = l->alignment[node]; - unsigned alignment = (BA & 0xffff) - 1; - unsigned bound = BA >> 16; - l->modulus[node] = DIV_ROUND_UP(bound - len + 1, 1 << alignment); - } + if (node < l->node_count && l->alignment[node]) { + unsigned BA = l->alignment[node]; + unsigned alignment = (BA & 0xffff) - 1; + unsigned bound = BA >> 16; + l->modulus[node] = DIV_ROUND_UP(bound - len + 1, 1 << alignment); + } } void -lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned 
cmask_i, unsigned j, unsigned cmask_j) +lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, + unsigned j, unsigned cmask_j) { - if (i == j) - return; + if (i == j) + return; - if (l->class_disjoint[(l->class[i] * l->class_count) + l->class[j]]) - return; + if (l->class_disjoint[(l->class[i] * l->class_count) + l->class[j]]) + return; - uint32_t constraint_fw = 0; - uint32_t constraint_bw = 0; + uint32_t constraint_fw = 0; + uint32_t constraint_bw = 0; - for (unsigned D = 0; D < 16; ++D) { - if (cmask_i & (cmask_j << D)) { - constraint_bw |= (1 << (15 + D)); - constraint_fw |= (1 << (15 - D)); - } + for (unsigned D = 0; D < 16; ++D) { + if (cmask_i & (cmask_j << D)) { + constraint_bw |= (1 << (15 + D)); + constraint_fw |= (1 << (15 - D)); + } - if (cmask_i & (cmask_j >> D)) { - constraint_fw |= (1 << (15 + D)); - constraint_bw |= (1 << (15 - D)); - } - } + if (cmask_i & (cmask_j >> D)) { + constraint_fw |= (1 << (15 + D)); + constraint_bw |= (1 << (15 - D)); + } + } - l->linear[j * l->node_count + i] |= constraint_fw; - l->linear[i * l->node_count + j] |= constraint_bw; + l->linear[j * l->node_count + i] |= constraint_fw; + l->linear[i * l->node_count + j] |= constraint_bw; } static bool lcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i) { - unsigned *row = &l->linear[i * l->node_count]; - signed constant = solutions[i]; + unsigned *row = &l->linear[i * l->node_count]; + signed constant = solutions[i]; - for (unsigned j = 0; j < l->node_count; ++j) { - if (solutions[j] == ~0) continue; + for (unsigned j = 0; j < l->node_count; ++j) { + if (solutions[j] == ~0) + continue; - signed lhs = solutions[j] - constant; + signed lhs = solutions[j] - constant; - if (lhs < -15 || lhs > 15) - continue; + if (lhs < -15 || lhs > 15) + continue; - if (row[j] & (1 << (lhs + 15))) - return false; - } + if (row[j] & (1 << (lhs + 15))) + return false; + } - return true; + return true; } bool lcra_solve(struct lcra_state *l) { - for (unsigned step = 0; step < l->node_count; ++step) { - if (l->solutions[step] != ~0) continue; - if (l->alignment[step] == 0) continue; + for (unsigned step = 0; step < l->node_count; ++step) { + if (l->solutions[step] != ~0) + continue; + if (l->alignment[step] == 0) + continue; - unsigned _class = l->class[step]; - unsigned class_start = l->class_start[_class]; + unsigned _class = l->class[step]; + unsigned class_start = l->class_start[_class]; - unsigned BA = l->alignment[step]; - unsigned shift = (BA & 0xffff) - 1; - unsigned bound = BA >> 16; + unsigned BA = l->alignment[step]; + unsigned shift = (BA & 0xffff) - 1; + unsigned bound = BA >> 16; - unsigned P = bound >> shift; - unsigned Q = l->modulus[step]; - unsigned r_max = l->class_size[_class]; - unsigned k_max = r_max >> shift; - unsigned m_max = k_max / P; - bool succ = false; + unsigned P = bound >> shift; + unsigned Q = l->modulus[step]; + unsigned r_max = l->class_size[_class]; + unsigned k_max = r_max >> shift; + unsigned m_max = k_max / P; + bool succ = false; - for (unsigned m = 0; m < m_max; ++m) { - for (unsigned n = 0; n < Q; ++n) { - l->solutions[step] = ((m * P + n) << shift) + class_start; - succ = lcra_test_linear(l, l->solutions, step); + for (unsigned m = 0; m < m_max; ++m) { + for (unsigned n = 0; n < Q; ++n) { + l->solutions[step] = ((m * P + n) << shift) + class_start; + succ = lcra_test_linear(l, l->solutions, step); - if (succ) break; - } + if (succ) + break; + } - if (succ) break; - } + if (succ) + break; + } - /* Out of registers - prepare to spill */ -
if (!succ) { - l->spill_class = l->class[step]; - return false; - } - } + /* Out of registers - prepare to spill */ + if (!succ) { + l->spill_class = l->class[step]; + return false; + } + } - return true; + return true; } /* Register spilling is implemented with a cost-benefit system. Costs are set @@ -203,45 +210,47 @@ lcra_solve(struct lcra_state *l) void lcra_set_node_spill_cost(struct lcra_state *l, unsigned node, signed cost) { - if (node < l->node_count) - l->spill_cost[node] = cost; + if (node < l->node_count) + l->spill_cost[node] = cost; } static unsigned lcra_count_constraints(struct lcra_state *l, unsigned i) { - unsigned count = 0; - unsigned *constraints = &l->linear[i * l->node_count]; + unsigned count = 0; + unsigned *constraints = &l->linear[i * l->node_count]; - for (unsigned j = 0; j < l->node_count; ++j) - count += util_bitcount(constraints[j]); + for (unsigned j = 0; j < l->node_count; ++j) + count += util_bitcount(constraints[j]); - return count; + return count; } signed lcra_get_best_spill_node(struct lcra_state *l) { - /* If there are no constraints on a node, do not pick it to spill under - * any circumstance, or else we would hang rather than fail RA */ - float best_benefit = 0.0; - signed best_node = -1; + /* If there are no constraints on a node, do not pick it to spill under + * any circumstance, or else we would hang rather than fail RA */ + float best_benefit = 0.0; + signed best_node = -1; - for (unsigned i = 0; i < l->node_count; ++i) { - /* Find spillable nodes */ - if (l->class[i] != l->spill_class) continue; - if (l->spill_cost[i] < 0) continue; + for (unsigned i = 0; i < l->node_count; ++i) { + /* Find spillable nodes */ + if (l->class[i] != l->spill_class) + continue; + if (l->spill_cost[i] < 0) + continue; - /* Adapted from Chaitin's heuristic */ - float constraints = lcra_count_constraints(l, i); - float cost = (l->spill_cost[i] + 1); - float benefit = constraints / cost; + /* Adapted from Chaitin's heuristic */ + float constraints = lcra_count_constraints(l, i); + float cost = (l->spill_cost[i] + 1); + float benefit = constraints / cost; - if (benefit > best_benefit) { - best_benefit = benefit; - best_node = i; - } - } + if (benefit > best_benefit) { + best_benefit = benefit; + best_node = i; + } + } - return best_node; + return best_node; } diff --git a/src/panfrost/util/lcra.h b/src/panfrost/util/lcra.h index fd47fdc3543..0b1ed13400f 100644 --- a/src/panfrost/util/lcra.h +++ b/src/panfrost/util/lcra.h @@ -31,78 +31,71 @@ #include struct lcra_state { - unsigned node_count; + unsigned node_count; - /* Alignment for node in log2(bytes)+1. Since alignment must be - * non-negative power-of-two, the elements are strictly positive - * integers. Zero is the sentinel for a missing node. In upper word, - * bound. */ - unsigned *alignment; + /* Alignment for node in log2(bytes)+1. Since alignment must be + * non-negative power-of-two, the elements are strictly positive + * integers. Zero is the sentinel for a missing node. In upper word, + * bound. */ + unsigned *alignment; - /* Linear constraints imposed. Nested array sized upfront, organized as - * linear[node_left][node_right]. That is, calculate indices as: - * - * Each element is itself a bit field denoting whether (c_j - c_i) bias - * is present or not, including negative biases. - * - * Note for Midgard, there are 16 components so the bias is in range - * [-15, 15] so encoded by 32-bit field. */ + /* Linear constraints imposed. Nested array sized upfront, organized as + * linear[node_left][node_right]. 
That is, calculate indices as: + * + * Each element is itself a bit field denoting whether (c_j - c_i) bias + * is present or not, including negative biases. + * + * Note for Midgard, there are 16 components so the bias is in range + * [-15, 15] so encoded by 32-bit field. */ - uint32_t *linear; + uint32_t *linear; - /* Per node max modulus constraints */ - uint8_t *modulus; + /* Per node max modulus constraints */ + uint8_t *modulus; - /* Classes allow nodes to be partitioned with a starting register. - * Classes cannot interfere; that is, they are true partitions in the - * usual sense of the word. class_count is the number of classes. - * class[] is indexed by a node to get the mapped class. class_start is - * biased to all solutions in the class. */ + /* Classes allow nodes to be partitioned with a starting register. + * Classes cannot interfere; that is, they are true partitions in the + * usual sense of the word. class_count is the number of classes. + * class[] is indexed by a node to get the mapped class. class_start is + * biased to all solutions in the class. */ - unsigned class_count; - unsigned *class; - unsigned *class_start; - unsigned *class_size; - bool *class_disjoint; + unsigned class_count; + unsigned *class; + unsigned *class_start; + unsigned *class_size; + bool *class_disjoint; - /* Before solving, forced registers; after solving, solutions. */ - unsigned *solutions; + /* Before solving, forced registers; after solving, solutions. */ + unsigned *solutions; - /* For register spilling, the costs to spill nodes (as set by the user) - * are in spill_cost[], negative if a node is unspillable. Internally, - * spill_class specifies which class to spill (whichever class failed - * to allocate) */ + /* For register spilling, the costs to spill nodes (as set by the user) + * are in spill_cost[], negative if a node is unspillable. 
Internally, + * spill_class specifies which class to spill (whichever class failed + * to allocate) */ - signed *spill_cost; - unsigned spill_class; + signed *spill_cost; + unsigned spill_class; }; -struct lcra_state * -lcra_alloc_equations( - unsigned node_count, unsigned class_count); +struct lcra_state *lcra_alloc_equations(unsigned node_count, + unsigned class_count); -void -lcra_free(struct lcra_state *l); +void lcra_free(struct lcra_state *l); -void -lcra_set_disjoint_class(struct lcra_state *l, unsigned c1, unsigned c2); +void lcra_set_disjoint_class(struct lcra_state *l, unsigned c1, unsigned c2); -void -lcra_set_alignment(struct lcra_state *l, unsigned node, unsigned align_log2, unsigned bound); +void lcra_set_alignment(struct lcra_state *l, unsigned node, + unsigned align_log2, unsigned bound); -void -lcra_restrict_range(struct lcra_state *l, unsigned node, unsigned len); +void lcra_restrict_range(struct lcra_state *l, unsigned node, unsigned len); -void -lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, unsigned j, unsigned cmask_j); +void lcra_add_node_interference(struct lcra_state *l, unsigned i, + unsigned cmask_i, unsigned j, unsigned cmask_j); -bool -lcra_solve(struct lcra_state *l); +bool lcra_solve(struct lcra_state *l); -void -lcra_set_node_spill_cost(struct lcra_state *l, unsigned node, signed cost); +void lcra_set_node_spill_cost(struct lcra_state *l, unsigned node, signed cost); -signed -lcra_get_best_spill_node(struct lcra_state *l); +signed lcra_get_best_spill_node(struct lcra_state *l); #endif diff --git a/src/panfrost/util/nir_mod_helpers.c b/src/panfrost/util/nir_mod_helpers.c index 2fe7b4fabe5..b8baf720ced 100644 --- a/src/panfrost/util/nir_mod_helpers.c +++ b/src/panfrost/util/nir_mod_helpers.c @@ -34,12 +34,13 @@ * ALU source (principally fneg or fabs). If so, return true and rewrite the * source to be the argument, respecting swizzles as needed. If not (or it * cannot be proven), return false and leave the source untouched. -*/ + */ bool pan_has_source_mod(nir_alu_src *src, nir_op op) { - if (!src->src.is_ssa || src->src.ssa->parent_instr->type != nir_instr_type_alu) + if (!src->src.is_ssa || + src->src.ssa->parent_instr->type != nir_instr_type_alu) return false; nir_alu_instr *alu = nir_instr_as_alu(src->src.ssa->parent_instr); @@ -56,7 +57,7 @@ pan_has_source_mod(nir_alu_src *src, nir_op op) /* Okay - we've found the modifier we wanted. Let's construct the new ALU * src. In a scalar world, this is just psrc, but for vector archs we need - * to respect the swizzle, so we compose. + * to respect the swizzle, so we compose. 
*/ nir_alu_src nsrc = { diff --git a/src/panfrost/util/pan_collect_varyings.c b/src/panfrost/util/pan_collect_varyings.c index 6aa85b7089f..505aac085c7 100644 --- a/src/panfrost/util/pan_collect_varyings.c +++ b/src/panfrost/util/pan_collect_varyings.c @@ -29,165 +29,162 @@ static enum pipe_format varying_format(nir_alu_type t, unsigned ncomps) { - assert(ncomps >= 1 && ncomps <= 4); + assert(ncomps >= 1 && ncomps <= 4); -#define VARYING_FORMAT(ntype, nsz, ptype, psz) \ - { \ - .type = nir_type_ ## ntype ## nsz, \ - .formats = { \ - PIPE_FORMAT_R ## psz ## _ ## ptype, \ - PIPE_FORMAT_R ## psz ## G ## psz ## _ ## ptype, \ - PIPE_FORMAT_R ## psz ## G ## psz ## B ## psz ## _ ## ptype, \ - PIPE_FORMAT_R ## psz ## G ## psz ## B ## psz ## A ## psz ## _ ## ptype, \ - } \ - } +#define VARYING_FORMAT(ntype, nsz, ptype, psz) \ + { \ + .type = nir_type_##ntype##nsz, .formats = { \ + PIPE_FORMAT_R##psz##_##ptype, \ + PIPE_FORMAT_R##psz##G##psz##_##ptype, \ + PIPE_FORMAT_R##psz##G##psz##B##psz##_##ptype, \ + PIPE_FORMAT_R##psz##G##psz##B##psz##A##psz##_##ptype, \ + } \ + } - static const struct { - nir_alu_type type; - enum pipe_format formats[4]; - } conv[] = { - VARYING_FORMAT(float, 32, FLOAT, 32), - VARYING_FORMAT(uint, 32, UINT, 32), - VARYING_FORMAT(float, 16, FLOAT, 16), - }; + static const struct { + nir_alu_type type; + enum pipe_format formats[4]; + } conv[] = { + VARYING_FORMAT(float, 32, FLOAT, 32), + VARYING_FORMAT(uint, 32, UINT, 32), + VARYING_FORMAT(float, 16, FLOAT, 16), + }; #undef VARYING_FORMAT - assert(ncomps > 0 && ncomps <= ARRAY_SIZE(conv[0].formats)); + assert(ncomps > 0 && ncomps <= ARRAY_SIZE(conv[0].formats)); - for (unsigned i = 0; i < ARRAY_SIZE(conv); i++) { - if (conv[i].type == t) - return conv[i].formats[ncomps - 1]; - } + for (unsigned i = 0; i < ARRAY_SIZE(conv); i++) { + if (conv[i].type == t) + return conv[i].formats[ncomps - 1]; + } - unreachable("Invalid type"); + unreachable("Invalid type"); } struct slot_info { - nir_alu_type type; - unsigned count; - unsigned index; + nir_alu_type type; + unsigned count; + unsigned index; }; static bool walk_varyings(UNUSED nir_builder *b, nir_instr *instr, void *data) { - struct slot_info *slots = data; + struct slot_info *slots = data; - if (instr->type != nir_instr_type_intrinsic) - return false; + if (instr->type != nir_instr_type_intrinsic) + return false; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - unsigned count; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + unsigned count; - /* Only consider intrinsics that access varyings */ - switch (intr->intrinsic) { - case nir_intrinsic_store_output: - if (b->shader->info.stage != MESA_SHADER_VERTEX) - return false; + /* Only consider intrinsics that access varyings */ + switch (intr->intrinsic) { + case nir_intrinsic_store_output: + if (b->shader->info.stage != MESA_SHADER_VERTEX) + return false; - count = nir_src_num_components(intr->src[0]); - break; + count = nir_src_num_components(intr->src[0]); + break; - case nir_intrinsic_load_input: - case nir_intrinsic_load_interpolated_input: - if (b->shader->info.stage != MESA_SHADER_FRAGMENT) - return false; + case nir_intrinsic_load_input: + case nir_intrinsic_load_interpolated_input: + if (b->shader->info.stage != MESA_SHADER_FRAGMENT) + return false; - count = nir_dest_num_components(intr->dest); - break; + count = nir_dest_num_components(intr->dest); + break; - default: - return false; - } + default: + return false; + } - nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + 
nir_io_semantics sem = nir_intrinsic_io_semantics(intr); - if (sem.no_varying) - return false; + if (sem.no_varying) + return false; - /* In a fragment shader, flat shading is lowered to load_input but - * interpolation is lowered to load_interpolated_input, so we can check - * the intrinsic to distinguish. - * - * In a vertex shader, we consider everything flat, as the information - * will not contribute to the final linked varyings -- flatness is used - * only to determine the type, and the GL linker uses the type from the - * fragment shader instead. - */ - bool flat = (intr->intrinsic != nir_intrinsic_load_interpolated_input); - nir_alu_type type = flat ? nir_type_uint : nir_type_float; + /* In a fragment shader, flat shading is lowered to load_input but + * interpolation is lowered to load_interpolated_input, so we can check + * the intrinsic to distinguish. + * + * In a vertex shader, we consider everything flat, as the information + * will not contribute to the final linked varyings -- flatness is used + * only to determine the type, and the GL linker uses the type from the + * fragment shader instead. + */ + bool flat = (intr->intrinsic != nir_intrinsic_load_interpolated_input); + nir_alu_type type = flat ? nir_type_uint : nir_type_float; - /* Demote interpolated float varyings to fp16 where possible. We do not - * demote flat varyings, including integer varyings, due to various - * issues with the Midgard hardware behaviour and TGSI shaders, as well - * as having no demonstrable benefit in practice. - */ - if (type == nir_type_float && sem.medium_precision) - type |= 16; - else - type |= 32; + /* Demote interpolated float varyings to fp16 where possible. We do not + * demote flat varyings, including integer varyings, due to various + * issues with the Midgard hardware behaviour and TGSI shaders, as well + * as having no demonstrable benefit in practice. + */ + if (type == nir_type_float && sem.medium_precision) + type |= 16; + else + type |= 32; - /* Count currently contains the number of components accessed by this - * intrinsics. However, we may be accessing a fractional location, - * indicating by the NIR component. Add that in. The final value be the - * maximum (component + count), an upper bound on the number of - * components possibly used. - */ - count += nir_intrinsic_component(intr); + /* Count currently contains the number of components accessed by this + * intrinsics. However, we may be accessing a fractional location, + * indicating by the NIR component. Add that in. The final value be the + * maximum (component + count), an upper bound on the number of + * components possibly used. 
+ */ + count += nir_intrinsic_component(intr); - /* Consider each slot separately */ - for (unsigned offset = 0; offset < sem.num_slots; ++offset) { - unsigned location = sem.location + offset; - unsigned index = nir_intrinsic_base(intr) + offset; + /* Consider each slot separately */ + for (unsigned offset = 0; offset < sem.num_slots; ++offset) { + unsigned location = sem.location + offset; + unsigned index = nir_intrinsic_base(intr) + offset; - if (slots[location].type) { - assert(slots[location].type == type); - assert(slots[location].index == index); - } else { - slots[location].type = type; - slots[location].index = index; - } + if (slots[location].type) { + assert(slots[location].type == type); + assert(slots[location].index == index); + } else { + slots[location].type = type; + slots[location].index = index; + } - slots[location].count = MAX2(slots[location].count, count); - } + slots[location].count = MAX2(slots[location].count, count); + } - return false; + return false; } void pan_nir_collect_varyings(nir_shader *s, struct pan_shader_info *info) { - if (s->info.stage != MESA_SHADER_VERTEX && - s->info.stage != MESA_SHADER_FRAGMENT) - return; + if (s->info.stage != MESA_SHADER_VERTEX && + s->info.stage != MESA_SHADER_FRAGMENT) + return; - struct slot_info slots[64] = { 0 }; - nir_shader_instructions_pass(s, walk_varyings, nir_metadata_all, slots); + struct slot_info slots[64] = {0}; + nir_shader_instructions_pass(s, walk_varyings, nir_metadata_all, slots); - struct pan_shader_varying *varyings = - (s->info.stage == MESA_SHADER_VERTEX) ? - info->varyings.output : - info->varyings.input; + struct pan_shader_varying *varyings = (s->info.stage == MESA_SHADER_VERTEX) + ? info->varyings.output + : info->varyings.input; - unsigned count = 0; + unsigned count = 0; - for (unsigned i = 0; i < ARRAY_SIZE(slots); ++i) { - if (!slots[i].type) - continue; + for (unsigned i = 0; i < ARRAY_SIZE(slots); ++i) { + if (!slots[i].type) + continue; - enum pipe_format format = - varying_format(slots[i].type, slots[i].count); - assert(format != PIPE_FORMAT_NONE); + enum pipe_format format = varying_format(slots[i].type, slots[i].count); + assert(format != PIPE_FORMAT_NONE); - unsigned index = slots[i].index; - count = MAX2(count, index + 1); + unsigned index = slots[i].index; + count = MAX2(count, index + 1); - varyings[index].location = i; - varyings[index].format = format; - } + varyings[index].location = i; + varyings[index].format = format; + } - if (s->info.stage == MESA_SHADER_VERTEX) - info->varyings.output_count = count; - else - info->varyings.input_count = count; + if (s->info.stage == MESA_SHADER_VERTEX) + info->varyings.output_count = count; + else + info->varyings.input_count = count; } diff --git a/src/panfrost/util/pan_ir.c b/src/panfrost/util/pan_ir.c index c469274933f..8524e08bc84 100644 --- a/src/panfrost/util/pan_ir.c +++ b/src/panfrost/util/pan_ir.c @@ -32,73 +32,66 @@ uint16_t pan_to_bytemask(unsigned bytes, unsigned mask) { - switch (bytes) { - case 0: - assert(mask == 0); - return 0; + switch (bytes) { + case 0: + assert(mask == 0); + return 0; - case 8: - return mask; + case 8: + return mask; - case 16: { - unsigned space = - (mask & 0x1) | - ((mask & 0x2) << (2 - 1)) | - ((mask & 0x4) << (4 - 2)) | - ((mask & 0x8) << (6 - 3)) | - ((mask & 0x10) << (8 - 4)) | - ((mask & 0x20) << (10 - 5)) | - ((mask & 0x40) << (12 - 6)) | - ((mask & 0x80) << (14 - 7)); + case 16: { + unsigned space = + (mask & 0x1) | ((mask & 0x2) << (2 - 1)) | ((mask & 0x4) << (4 - 2)) | + ((mask & 0x8) << (6 - 
3)) | ((mask & 0x10) << (8 - 4)) | + ((mask & 0x20) << (10 - 5)) | ((mask & 0x40) << (12 - 6)) | + ((mask & 0x80) << (14 - 7)); - return space | (space << 1); - } + return space | (space << 1); + } - case 32: { - unsigned space = - (mask & 0x1) | - ((mask & 0x2) << (4 - 1)) | - ((mask & 0x4) << (8 - 2)) | - ((mask & 0x8) << (12 - 3)); + case 32: { + unsigned space = (mask & 0x1) | ((mask & 0x2) << (4 - 1)) | + ((mask & 0x4) << (8 - 2)) | ((mask & 0x8) << (12 - 3)); - return space | (space << 1) | (space << 2) | (space << 3); - } + return space | (space << 1) | (space << 2) | (space << 3); + } - case 64: { - unsigned A = (mask & 0x1) ? 0xFF : 0x00; - unsigned B = (mask & 0x2) ? 0xFF : 0x00; - return A | (B << 8); - } + case 64: { + unsigned A = (mask & 0x1) ? 0xFF : 0x00; + unsigned B = (mask & 0x2) ? 0xFF : 0x00; + return A | (B << 8); + } - default: - unreachable("Invalid register mode"); - } + default: + unreachable("Invalid register mode"); + } } void pan_block_add_successor(pan_block *block, pan_block *successor) { - assert(block); - assert(successor); + assert(block); + assert(successor); - /* Cull impossible edges */ - if (block->unconditional_jumps) - return; + /* Cull impossible edges */ + if (block->unconditional_jumps) + return; - for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) { - if (block->successors[i]) { - if (block->successors[i] == successor) - return; - else - continue; - } + for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) { + if (block->successors[i]) { + if (block->successors[i] == successor) + return; + else + continue; + } - block->successors[i] = successor; - _mesa_set_add(successor->predecessors, block); - return; - } + block->successors[i] = successor; + _mesa_set_add(successor->predecessors, block); + return; + } - unreachable("Too many successors"); + unreachable("Too many successors"); } /* Prints a NIR ALU type in Bifrost-style ".f32" ".i8" etc */ @@ -106,45 +99,42 @@ pan_block_add_successor(pan_block *block, pan_block *successor) void pan_print_alu_type(nir_alu_type t, FILE *fp) { - unsigned size = nir_alu_type_get_type_size(t); - nir_alu_type base = nir_alu_type_get_base_type(t); + unsigned size = nir_alu_type_get_type_size(t); + nir_alu_type base = nir_alu_type_get_base_type(t); - switch (base) { - case nir_type_int: - fprintf(fp, ".i"); - break; - case nir_type_uint: - fprintf(fp, ".u"); - break; - case nir_type_bool: - fprintf(fp, ".b"); - break; - case nir_type_float: - fprintf(fp, ".f"); - break; - default: - fprintf(fp, ".unknown"); - break; - } + switch (base) { + case nir_type_int: + fprintf(fp, ".i"); + break; + case nir_type_uint: + fprintf(fp, ".u"); + break; + case nir_type_bool: + fprintf(fp, ".b"); + break; + case nir_type_float: + fprintf(fp, ".f"); + break; + default: + fprintf(fp, ".unknown"); + break; + } - fprintf(fp, "%u", size); + fprintf(fp, "%u", size); } /* Could optimize with a better data structure if anyone cares, TODO: profile */ unsigned -pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs) +pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, + unsigned offs) { - struct panfrost_ubo_word word = { - .ubo = ubo, - .offset = offs - }; + struct panfrost_ubo_word word = {.ubo = ubo, .offset = offs}; - for (unsigned i = 0; i < push->count; ++i) { - if (memcmp(push->words + i, &word, sizeof(word)) == 0) - return i; - } - - unreachable("UBO not pushed"); + for (unsigned i = 0; i < push->count; ++i) { + if (memcmp(push->words + i, &word, sizeof(word)) == 0) + return i; 
+ } + unreachable("UBO not pushed"); } diff --git a/src/panfrost/util/pan_ir.h b/src/panfrost/util/pan_ir.h index 8b41668ea66..3ddec343856 100644 --- a/src/panfrost/util/pan_ir.h +++ b/src/panfrost/util/pan_ir.h @@ -26,8 +26,8 @@ #include #include "compiler/nir/nir.h" -#include "util/u_dynarray.h" #include "util/hash_table.h" +#include "util/u_dynarray.h" /* On Valhall, the driver gives the hardware a table of resource tables. * Resources are addressed as the index of the table together with the index of @@ -38,14 +38,14 @@ * Gallium driver and the Valhall compiler. */ enum pan_resource_table { - PAN_TABLE_UBO = 0, - PAN_TABLE_ATTRIBUTE, - PAN_TABLE_ATTRIBUTE_BUFFER, - PAN_TABLE_SAMPLER, - PAN_TABLE_TEXTURE, - PAN_TABLE_IMAGE, + PAN_TABLE_UBO = 0, + PAN_TABLE_ATTRIBUTE, + PAN_TABLE_ATTRIBUTE_BUFFER, + PAN_TABLE_SAMPLER, + PAN_TABLE_TEXTURE, + PAN_TABLE_IMAGE, - PAN_NUM_RESOURCE_TABLES + PAN_NUM_RESOURCE_TABLES }; /* Indices for named (non-XFB) varyings that are present. These are packed @@ -61,15 +61,15 @@ enum pan_resource_table { */ enum pan_special_varying { - PAN_VARY_GENERAL = 0, - PAN_VARY_POSITION = 1, - PAN_VARY_PSIZ = 2, - PAN_VARY_PNTCOORD = 3, - PAN_VARY_FACE = 4, - PAN_VARY_FRAGCOORD = 5, + PAN_VARY_GENERAL = 0, + PAN_VARY_POSITION = 1, + PAN_VARY_PSIZ = 2, + PAN_VARY_PNTCOORD = 3, + PAN_VARY_FACE = 4, + PAN_VARY_FRAGCOORD = 5, - /* Keep last */ - PAN_VARY_MAX, + /* Keep last */ + PAN_VARY_MAX, }; /* Maximum number of attribute descriptors required for varyings. These include @@ -84,53 +84,49 @@ enum pan_special_varying { /* Allow 2D of sysval IDs, while allowing nonparametric sysvals to equal * their class for equal comparison */ -#define PAN_SYSVAL(type, no) (((no) << 16) | PAN_SYSVAL_##type) -#define PAN_SYSVAL_TYPE(sysval) ((sysval) & 0xffff) -#define PAN_SYSVAL_ID(sysval) ((sysval) >> 16) +#define PAN_SYSVAL(type, no) (((no) << 16) | PAN_SYSVAL_##type) +#define PAN_SYSVAL_TYPE(sysval) ((sysval)&0xffff) +#define PAN_SYSVAL_ID(sysval) ((sysval) >> 16) /* Define some common types. We start at one for easy indexing of hash * tables internal to the compiler */ enum { - PAN_SYSVAL_VIEWPORT_SCALE = 1, - PAN_SYSVAL_VIEWPORT_OFFSET = 2, - PAN_SYSVAL_TEXTURE_SIZE = 3, - PAN_SYSVAL_SSBO = 4, - PAN_SYSVAL_NUM_WORK_GROUPS = 5, - PAN_SYSVAL_SAMPLER = 7, - PAN_SYSVAL_LOCAL_GROUP_SIZE = 8, - PAN_SYSVAL_WORK_DIM = 9, - PAN_SYSVAL_IMAGE_SIZE = 10, - PAN_SYSVAL_SAMPLE_POSITIONS = 11, - PAN_SYSVAL_MULTISAMPLED = 12, - PAN_SYSVAL_RT_CONVERSION = 13, - PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS = 14, - PAN_SYSVAL_DRAWID = 15, - PAN_SYSVAL_BLEND_CONSTANTS = 16, - PAN_SYSVAL_XFB = 17, - PAN_SYSVAL_NUM_VERTICES = 18, + PAN_SYSVAL_VIEWPORT_SCALE = 1, + PAN_SYSVAL_VIEWPORT_OFFSET = 2, + PAN_SYSVAL_TEXTURE_SIZE = 3, + PAN_SYSVAL_SSBO = 4, + PAN_SYSVAL_NUM_WORK_GROUPS = 5, + PAN_SYSVAL_SAMPLER = 7, + PAN_SYSVAL_LOCAL_GROUP_SIZE = 8, + PAN_SYSVAL_WORK_DIM = 9, + PAN_SYSVAL_IMAGE_SIZE = 10, + PAN_SYSVAL_SAMPLE_POSITIONS = 11, + PAN_SYSVAL_MULTISAMPLED = 12, + PAN_SYSVAL_RT_CONVERSION = 13, + PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS = 14, + PAN_SYSVAL_DRAWID = 15, + PAN_SYSVAL_BLEND_CONSTANTS = 16, + PAN_SYSVAL_XFB = 17, + PAN_SYSVAL_NUM_VERTICES = 18, }; -#define PAN_TXS_SYSVAL_ID(texidx, dim, is_array) \ - ((texidx) | ((dim) << 7) | ((is_array) ? (1 << 9) : 0)) +#define PAN_TXS_SYSVAL_ID(texidx, dim, is_array) \ + ((texidx) | ((dim) << 7) | ((is_array) ? 
(1 << 9) : 0)) -#define PAN_SYSVAL_ID_TO_TXS_TEX_IDX(id) ((id) & 0x7f) -#define PAN_SYSVAL_ID_TO_TXS_DIM(id) (((id) >> 7) & 0x3) -#define PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(id) !!((id) & (1 << 9)) +#define PAN_SYSVAL_ID_TO_TXS_TEX_IDX(id) ((id)&0x7f) +#define PAN_SYSVAL_ID_TO_TXS_DIM(id) (((id) >> 7) & 0x3) +#define PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(id) !!((id) & (1 << 9)) /* Special attribute slots for vertex builtins. Sort of arbitrary but let's be * consistent with the blob so we can compare traces easier. */ -enum { - PAN_VERTEX_ID = 16, - PAN_INSTANCE_ID = 17, - PAN_MAX_ATTRIBUTE -}; +enum { PAN_VERTEX_ID = 16, PAN_INSTANCE_ID = 17, PAN_MAX_ATTRIBUTE }; struct panfrost_sysvals { - /* The mapping of sysvals to uniforms, the count, and the off-by-one inverse */ - unsigned sysvals[MAX_SYSVAL_COUNT]; - unsigned sysval_count; + /* The mapping of sysvals to uniforms, the count, and the off-by-one inverse */ + unsigned sysvals[MAX_SYSVAL_COUNT]; + unsigned sysval_count; }; /* Architecturally, Bifrost/Valhall can address 128 FAU slots of 64-bits each. @@ -149,82 +145,78 @@ struct panfrost_sysvals { * an offset to a word must be < 2^16. There are less than 2^8 UBOs */ struct panfrost_ubo_word { - uint16_t ubo; - uint16_t offset; + uint16_t ubo; + uint16_t offset; }; struct panfrost_ubo_push { - unsigned count; - struct panfrost_ubo_word words[PAN_MAX_PUSH]; + unsigned count; + struct panfrost_ubo_word words[PAN_MAX_PUSH]; }; /* Helper for searching the above. Note this is O(N) to the number of pushed * constants, do not run in the draw call hot path */ -unsigned -pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs); +unsigned pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, + unsigned offs); struct hash_table_u64 * panfrost_init_sysvals(struct panfrost_sysvals *sysvals, - struct panfrost_sysvals *fixed_sysvals, - void *memctx); + struct panfrost_sysvals *fixed_sysvals, void *memctx); -unsigned -pan_lookup_sysval(struct hash_table_u64 *sysval_to_id, - struct panfrost_sysvals *sysvals, - int sysval); +unsigned pan_lookup_sysval(struct hash_table_u64 *sysval_to_id, + struct panfrost_sysvals *sysvals, int sysval); -int -panfrost_sysval_for_instr(nir_instr *instr, nir_dest *dest); +int panfrost_sysval_for_instr(nir_instr *instr, nir_dest *dest); struct panfrost_compile_inputs { - struct util_debug_callback *debug; + struct util_debug_callback *debug; - unsigned gpu_id; - bool is_blend, is_blit; - struct { - unsigned rt; - unsigned nr_samples; - uint64_t bifrost_blend_desc; - } blend; - int fixed_sysval_ubo; - struct panfrost_sysvals *fixed_sysval_layout; - bool no_idvs; - bool no_ubo_to_push; + unsigned gpu_id; + bool is_blend, is_blit; + struct { + unsigned rt; + unsigned nr_samples; + uint64_t bifrost_blend_desc; + } blend; + int fixed_sysval_ubo; + struct panfrost_sysvals *fixed_sysval_layout; + bool no_idvs; + bool no_ubo_to_push; - enum pipe_format rt_formats[8]; - uint8_t raw_fmt_mask; - unsigned nr_cbufs; + enum pipe_format rt_formats[8]; + uint8_t raw_fmt_mask; + unsigned nr_cbufs; - /* Used on Valhall. - * - * Bit mask of special desktop-only varyings (e.g VARYING_SLOT_TEX0) - * written by the previous stage (fragment shader) or written by this - * stage (vertex shader). Bits are slots from gl_varying_slot. - * - * For modern APIs (GLES or VK), this should be 0. - */ - uint32_t fixed_varying_mask; + /* Used on Valhall. 
+ * + * Bit mask of special desktop-only varyings (e.g VARYING_SLOT_TEX0) + * written by the previous stage (fragment shader) or written by this + * stage (vertex shader). Bits are slots from gl_varying_slot. + * + * For modern APIs (GLES or VK), this should be 0. + */ + uint32_t fixed_varying_mask; - union { - struct { - bool static_rt_conv; - uint32_t rt_conv[8]; - } bifrost; - }; + union { + struct { + bool static_rt_conv; + uint32_t rt_conv[8]; + } bifrost; + }; }; struct pan_shader_varying { - gl_varying_slot location; - enum pipe_format format; + gl_varying_slot location; + enum pipe_format format; }; struct bifrost_shader_blend_info { - nir_alu_type type; - uint32_t return_offset; + nir_alu_type type; + uint32_t return_offset; - /* mali_bifrost_register_file_format corresponding to nir_alu_type */ - unsigned format; + /* mali_bifrost_register_file_format corresponding to nir_alu_type */ + unsigned format; }; /* @@ -234,227 +226,226 @@ struct bifrost_shader_blend_info { * GenXML. */ struct bifrost_message_preload { - /* Whether to preload this message */ - bool enabled; + /* Whether to preload this message */ + bool enabled; - /* Varying to load from */ - unsigned varying_index; + /* Varying to load from */ + unsigned varying_index; - /* Register type, FP32 otherwise */ - bool fp16; + /* Register type, FP32 otherwise */ + bool fp16; - /* Number of components, ignored if texturing */ - unsigned num_components; + /* Number of components, ignored if texturing */ + unsigned num_components; - /* If texture is set, performs a texture instruction according to - * texture_index, skip, and zero_lod. If texture is unset, only the - * varying load is performed. - */ - bool texture, skip, zero_lod; - unsigned texture_index; + /* If texture is set, performs a texture instruction according to + * texture_index, skip, and zero_lod. If texture is unset, only the + * varying load is performed. + */ + bool texture, skip, zero_lod; + unsigned texture_index; }; struct bifrost_shader_info { - struct bifrost_shader_blend_info blend[8]; - nir_alu_type blend_src1_type; - bool wait_6, wait_7; - struct bifrost_message_preload messages[2]; + struct bifrost_shader_blend_info blend[8]; + nir_alu_type blend_src1_type; + bool wait_6, wait_7; + struct bifrost_message_preload messages[2]; - /* Whether any flat varyings are loaded. This may disable optimizations - * that change the provoking vertex, since that would load incorrect - * values for flat varyings. - */ - bool uses_flat_shading; + /* Whether any flat varyings are loaded. This may disable optimizations + * that change the provoking vertex, since that would load incorrect + * values for flat varyings. 
+ */ + bool uses_flat_shading; }; struct midgard_shader_info { - unsigned first_tag; + unsigned first_tag; }; struct pan_shader_info { - gl_shader_stage stage; - unsigned work_reg_count; - unsigned tls_size; - unsigned wls_size; + gl_shader_stage stage; + unsigned work_reg_count; + unsigned tls_size; + unsigned wls_size; - /* Bit mask of preloaded registers */ - uint64_t preload; + /* Bit mask of preloaded registers */ + uint64_t preload; - union { - struct { - bool reads_frag_coord; - bool reads_point_coord; - bool reads_face; - bool can_discard; - bool writes_depth; - bool writes_stencil; - bool writes_coverage; - bool sidefx; - bool sample_shading; - bool early_fragment_tests; - bool can_early_z, can_fpk; - bool untyped_color_outputs; - BITSET_WORD outputs_read; - BITSET_WORD outputs_written; - } fs; + union { + struct { + bool reads_frag_coord; + bool reads_point_coord; + bool reads_face; + bool can_discard; + bool writes_depth; + bool writes_stencil; + bool writes_coverage; + bool sidefx; + bool sample_shading; + bool early_fragment_tests; + bool can_early_z, can_fpk; + bool untyped_color_outputs; + BITSET_WORD outputs_read; + BITSET_WORD outputs_written; + } fs; - struct { - bool writes_point_size; + struct { + bool writes_point_size; - /* If the primary shader writes point size, the Valhall - * driver may need a variant that does not write point - * size. Offset to such a shader in the program binary. - * - * Zero if no such variant is required. - * - * Only used with IDVS on Valhall. - */ - unsigned no_psiz_offset; + /* If the primary shader writes point size, the Valhall + * driver may need a variant that does not write point + * size. Offset to such a shader in the program binary. + * + * Zero if no such variant is required. + * + * Only used with IDVS on Valhall. + */ + unsigned no_psiz_offset; - /* Set if Index-Driven Vertex Shading is in use */ - bool idvs; + /* Set if Index-Driven Vertex Shading is in use */ + bool idvs; - /* If IDVS is used, whether a varying shader is used */ - bool secondary_enable; + /* If IDVS is used, whether a varying shader is used */ + bool secondary_enable; - /* If a varying shader is used, the varying shader's - * offset in the program binary - */ - unsigned secondary_offset; + /* If a varying shader is used, the varying shader's + * offset in the program binary + */ + unsigned secondary_offset; - /* If IDVS is in use, number of work registers used by - * the varying shader - */ - unsigned secondary_work_reg_count; + /* If IDVS is in use, number of work registers used by + * the varying shader + */ + unsigned secondary_work_reg_count; - /* If IDVS is in use, bit mask of preloaded registers - * used by the varying shader - */ - uint64_t secondary_preload; - } vs; + /* If IDVS is in use, bit mask of preloaded registers + * used by the varying shader + */ + uint64_t secondary_preload; + } vs; - struct { - /* Is it legal to merge workgroups? This is true if the - * shader uses neither barriers nor shared memory. This - * requires caution: if the API allows specifying shared - * memory at launch time (instead of compile time), that - * memory will not be accounted for by the compiler. - * - * Used by the Valhall hardware. - */ - bool allow_merging_workgroups; - } cs; - }; + struct { + /* Is it legal to merge workgroups? This is true if the + * shader uses neither barriers nor shared memory. 
This + * requires caution: if the API allows specifying shared + * memory at launch time (instead of compile time), that + * memory will not be accounted for by the compiler. + * + * Used by the Valhall hardware. + */ + bool allow_merging_workgroups; + } cs; + }; - /* Does the shader contains a barrier? or (for fragment shaders) does it - * require helper invocations, which demand the same ordering guarantees - * of the hardware? These notions are unified in the hardware, so we - * unify them here as well. - */ - bool contains_barrier; - bool separable; - bool writes_global; - uint64_t outputs_written; + /* Does the shader contains a barrier? or (for fragment shaders) does it + * require helper invocations, which demand the same ordering guarantees + * of the hardware? These notions are unified in the hardware, so we + * unify them here as well. + */ + bool contains_barrier; + bool separable; + bool writes_global; + uint64_t outputs_written; - /* Floating point controls that the driver should try to honour */ - bool ftz_fp16, ftz_fp32; + /* Floating point controls that the driver should try to honour */ + bool ftz_fp16, ftz_fp32; - unsigned sampler_count; - unsigned texture_count; - unsigned ubo_count; - unsigned attributes_read_count; - unsigned attribute_count; - unsigned attributes_read; + unsigned sampler_count; + unsigned texture_count; + unsigned ubo_count; + unsigned attributes_read_count; + unsigned attribute_count; + unsigned attributes_read; - struct { - unsigned input_count; - struct pan_shader_varying input[PAN_MAX_VARYINGS]; - unsigned output_count; - struct pan_shader_varying output[PAN_MAX_VARYINGS]; - } varyings; + struct { + unsigned input_count; + struct pan_shader_varying input[PAN_MAX_VARYINGS]; + unsigned output_count; + struct pan_shader_varying output[PAN_MAX_VARYINGS]; + } varyings; - struct panfrost_sysvals sysvals; + struct panfrost_sysvals sysvals; - /* UBOs to push to Register Mapped Uniforms (Midgard) or Fast Access - * Uniforms (Bifrost) */ - struct panfrost_ubo_push push; + /* UBOs to push to Register Mapped Uniforms (Midgard) or Fast Access + * Uniforms (Bifrost) */ + struct panfrost_ubo_push push; - uint32_t ubo_mask; + uint32_t ubo_mask; - union { - struct bifrost_shader_info bifrost; - struct midgard_shader_info midgard; - }; + union { + struct bifrost_shader_info bifrost; + struct midgard_shader_info midgard; + }; }; typedef struct pan_block { - /* Link to next block. Must be first for mir_get_block */ - struct list_head link; + /* Link to next block. Must be first for mir_get_block */ + struct list_head link; - /* List of instructions emitted for the current block */ - struct list_head instructions; + /* List of instructions emitted for the current block */ + struct list_head instructions; - /* Index of the block in source order */ - unsigned name; + /* Index of the block in source order */ + unsigned name; - /* Control flow graph */ - struct pan_block *successors[2]; - struct set *predecessors; - bool unconditional_jumps; + /* Control flow graph */ + struct pan_block *successors[2]; + struct set *predecessors; + bool unconditional_jumps; - /* In liveness analysis, these are live masks (per-component) for - * indices for the block. Scalar compilers have the luxury of using - * simple bit fields, but for us, liveness is a vector idea. */ - uint16_t *live_in; - uint16_t *live_out; + /* In liveness analysis, these are live masks (per-component) for + * indices for the block. 
Scalar compilers have the luxury of using + * simple bit fields, but for us, liveness is a vector idea. */ + uint16_t *live_in; + uint16_t *live_out; } pan_block; struct pan_instruction { - struct list_head link; + struct list_head link; }; -#define pan_foreach_instr_in_block_rev(block, v) \ - list_for_each_entry_rev(struct pan_instruction, v, &block->instructions, link) +#define pan_foreach_instr_in_block_rev(block, v) \ + list_for_each_entry_rev(struct pan_instruction, v, &block->instructions, \ + link) -#define pan_foreach_successor(blk, v) \ - pan_block *v; \ - pan_block **_v; \ - for (_v = (pan_block **) &blk->successors[0], \ - v = *_v; \ - v != NULL && _v < (pan_block **) &blk->successors[2]; \ - _v++, v = *_v) \ +#define pan_foreach_successor(blk, v) \ + pan_block *v; \ + pan_block **_v; \ + for (_v = (pan_block **)&blk->successors[0], v = *_v; \ + v != NULL && _v < (pan_block **)&blk->successors[2]; _v++, v = *_v) -#define pan_foreach_predecessor(blk, v) \ - struct set_entry *_entry_##v; \ - struct pan_block *v; \ - for (_entry_##v = _mesa_set_next_entry(blk->predecessors, NULL), \ - v = (struct pan_block *) (_entry_##v ? _entry_##v->key : NULL); \ - _entry_##v != NULL; \ - _entry_##v = _mesa_set_next_entry(blk->predecessors, _entry_##v), \ - v = (struct pan_block *) (_entry_##v ? _entry_##v->key : NULL)) +#define pan_foreach_predecessor(blk, v) \ + struct set_entry *_entry_##v; \ + struct pan_block *v; \ + for (_entry_##v = _mesa_set_next_entry(blk->predecessors, NULL), \ + v = (struct pan_block *)(_entry_##v ? _entry_##v->key : NULL); \ + _entry_##v != NULL; \ + _entry_##v = _mesa_set_next_entry(blk->predecessors, _entry_##v), \ + v = (struct pan_block *)(_entry_##v ? _entry_##v->key : NULL)) static inline pan_block * pan_exit_block(struct list_head *blocks) { - pan_block *last = list_last_entry(blocks, pan_block, link); - assert(!last->successors[0] && !last->successors[1]); - return last; + pan_block *last = list_last_entry(blocks, pan_block, link); + assert(!last->successors[0] && !last->successors[1]); + return last; } typedef void (*pan_liveness_update)(uint16_t *, void *, unsigned max); -void pan_liveness_gen(uint16_t *live, unsigned node, unsigned max, uint16_t mask); -void pan_liveness_kill(uint16_t *live, unsigned node, unsigned max, uint16_t mask); +void pan_liveness_gen(uint16_t *live, unsigned node, unsigned max, + uint16_t mask); +void pan_liveness_kill(uint16_t *live, unsigned node, unsigned max, + uint16_t mask); bool pan_liveness_get(uint16_t *live, unsigned node, uint16_t max); -void pan_compute_liveness(struct list_head *blocks, - unsigned temp_count, - pan_liveness_update callback); +void pan_compute_liveness(struct list_head *blocks, unsigned temp_count, + pan_liveness_update callback); void pan_free_liveness(struct list_head *blocks); -uint16_t -pan_to_bytemask(unsigned bytes, unsigned mask); +uint16_t pan_to_bytemask(unsigned bytes, unsigned mask); void pan_block_add_successor(pan_block *block, pan_block *successor); @@ -464,30 +455,30 @@ void pan_block_add_successor(pan_block *block, pan_block *successor); static inline unsigned pan_ssa_index(nir_ssa_def *ssa) { - /* Off-by-one ensures BIR_NO_ARG is skipped */ - return ((ssa->index + 1) << 1) | 0; + /* Off-by-one ensures BIR_NO_ARG is skipped */ + return ((ssa->index + 1) << 1) | 0; } static inline unsigned pan_src_index(nir_src *src) { - if (src->is_ssa) - return pan_ssa_index(src->ssa); - else { - assert(!src->reg.indirect); - return (src->reg.reg->index << 1) | PAN_IS_REG; - } + if (src->is_ssa) + 
return pan_ssa_index(src->ssa); + else { + assert(!src->reg.indirect); + return (src->reg.reg->index << 1) | PAN_IS_REG; + } } static inline unsigned pan_dest_index(nir_dest *dst) { - if (dst->is_ssa) - return pan_ssa_index(&dst->ssa); - else { - assert(!dst->reg.indirect); - return (dst->reg.reg->index << 1) | PAN_IS_REG; - } + if (dst->is_ssa) + return pan_ssa_index(&dst->ssa); + else { + assert(!dst->reg.indirect); + return (dst->reg.reg->index << 1) | PAN_IS_REG; + } } /* IR printing helpers */ @@ -523,14 +514,14 @@ void pan_nir_collect_varyings(nir_shader *s, struct pan_shader_info *info); static inline unsigned pan_subgroup_size(unsigned arch) { - if (arch >= 9) - return 16; - else if (arch >= 7) - return 8; - else if (arch >= 6) - return 4; - else - return 1; + if (arch >= 9) + return 16; + else if (arch >= 7) + return 8; + else if (arch >= 6) + return 4; + else + return 1; } #endif diff --git a/src/panfrost/util/pan_liveness.c b/src/panfrost/util/pan_liveness.c index 0ec9652b59d..e299bc29d75 100644 --- a/src/panfrost/util/pan_liveness.c +++ b/src/panfrost/util/pan_liveness.c @@ -21,10 +21,10 @@ * SOFTWARE. */ -#include "pan_ir.h" -#include "util/u_memory.h" #include "util/list.h" #include "util/set.h" +#include "util/u_memory.h" +#include "pan_ir.h" /* Routines for liveness analysis. Liveness is tracked per byte per node. Per * byte granularity is necessary for proper handling of int8 */ @@ -32,28 +32,28 @@ void pan_liveness_gen(uint16_t *live, unsigned node, unsigned max, uint16_t mask) { - if (node >= max) - return; + if (node >= max) + return; - live[node] |= mask; + live[node] |= mask; } void pan_liveness_kill(uint16_t *live, unsigned node, unsigned max, uint16_t mask) { - if (node >= max) - return; + if (node >= max) + return; - live[node] &= ~mask; + live[node] &= ~mask; } bool pan_liveness_get(uint16_t *live, unsigned node, uint16_t max) { - if (node >= max) - return false; + if (node >= max) + return false; - return live[node]; + return live[node]; } /* live_out[s] = sum { p in succ[s] } ( live_in[p] ) */ @@ -61,10 +61,10 @@ pan_liveness_get(uint16_t *live, unsigned node, uint16_t max) static void liveness_block_live_out(pan_block *blk, unsigned temp_count) { - pan_foreach_successor(blk, succ) { - for (unsigned i = 0; i < temp_count; ++i) - blk->live_out[i] |= succ->live_in[i]; - } + pan_foreach_successor(blk, succ) { + for (unsigned i = 0; i < temp_count; ++i) + blk->live_out[i] |= succ->live_in[i]; + } } /* Liveness analysis is a backwards-may dataflow analysis pass. Within a block, @@ -72,32 +72,30 @@ liveness_block_live_out(pan_block *blk, unsigned temp_count) * returns whether progress was made. 
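To show how a backend plugs into the helpers above, a minimal sketch (not part of the patch); the instruction type and its byte masks are invented for illustration:

struct example_ins {
   unsigned dest, src[2];
   uint16_t dest_bytemask, src_bytemask[2];
};

static void
example_liveness_update(uint16_t *live, void *_ins, unsigned max)
{
   struct example_ins *ins = _ins;

   /* Walking backwards through the block: the write kills the
    * destination's bytes... */
   pan_liveness_kill(live, ins->dest, max, ins->dest_bytemask);

   /* ...then each read makes the corresponding source bytes live */
   for (unsigned i = 0; i < 2; ++i)
      pan_liveness_gen(live, ins->src[i], max, ins->src_bytemask[i]);
}

A driver would then run pan_compute_liveness(blocks, node_count, example_liveness_update) to iterate this to a fixed point.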
*/ static bool -liveness_block_update( - pan_block *blk, unsigned temp_count, - pan_liveness_update callback) +liveness_block_update(pan_block *blk, unsigned temp_count, + pan_liveness_update callback) { - bool progress = false; + bool progress = false; - liveness_block_live_out(blk, temp_count); + liveness_block_live_out(blk, temp_count); - uint16_t *live = ralloc_array(blk, uint16_t, temp_count); - memcpy(live, blk->live_out, temp_count * sizeof(uint16_t)); + uint16_t *live = ralloc_array(blk, uint16_t, temp_count); + memcpy(live, blk->live_out, temp_count * sizeof(uint16_t)); - pan_foreach_instr_in_block_rev(blk, ins) - callback(live, (void *) ins, temp_count); + pan_foreach_instr_in_block_rev(blk, ins) + callback(live, (void *)ins, temp_count); - /* To figure out progress, diff live_in */ + /* To figure out progress, diff live_in */ - for (unsigned i = 0; (i < temp_count) && !progress; ++i) - progress |= (blk->live_in[i] != live[i]); + for (unsigned i = 0; (i < temp_count) && !progress; ++i) + progress |= (blk->live_in[i] != live[i]); - ralloc_free(blk->live_in); - blk->live_in = live; + ralloc_free(blk->live_in); + blk->live_in = live; - return progress; + return progress; } - /* Globally, liveness analysis uses a fixed-point algorithm based on a * worklist. We initialize a work list with the exit block. We iterate the work * list to compute live_in from live_out for each block on the work list, @@ -105,70 +103,66 @@ liveness_block_update( */ void -pan_compute_liveness( - struct list_head *blocks, - unsigned temp_count, - pan_liveness_update callback) +pan_compute_liveness(struct list_head *blocks, unsigned temp_count, + pan_liveness_update callback) { - /* Set of pan_block */ - struct set *work_list = _mesa_set_create(NULL, - _mesa_hash_pointer, - _mesa_key_pointer_equal); + /* Set of pan_block */ + struct set *work_list = + _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); - struct set *visited = _mesa_set_create(NULL, - _mesa_hash_pointer, - _mesa_key_pointer_equal); + struct set *visited = + _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); - /* Free any previous liveness, and allocate */ + /* Free any previous liveness, and allocate */ - pan_free_liveness(blocks); + pan_free_liveness(blocks); - list_for_each_entry(pan_block, block, blocks, link) { - block->live_in = rzalloc_array(block, uint16_t, temp_count); - block->live_out = rzalloc_array(block, uint16_t, temp_count); - } + list_for_each_entry(pan_block, block, blocks, link) { + block->live_in = rzalloc_array(block, uint16_t, temp_count); + block->live_out = rzalloc_array(block, uint16_t, temp_count); + } - /* Initialize the work list with the exit block */ - struct set_entry *cur; + /* Initialize the work list with the exit block */ + struct set_entry *cur; - cur = _mesa_set_add(work_list, pan_exit_block(blocks)); + cur = _mesa_set_add(work_list, pan_exit_block(blocks)); - /* Iterate the work list */ + /* Iterate the work list */ - do { - /* Pop off a block */ - pan_block *blk = (struct pan_block *) cur->key; - _mesa_set_remove(work_list, cur); + do { + /* Pop off a block */ + pan_block *blk = (struct pan_block *)cur->key; + _mesa_set_remove(work_list, cur); - /* Update its liveness information */ - bool progress = liveness_block_update(blk, temp_count, callback); + /* Update its liveness information */ + bool progress = liveness_block_update(blk, temp_count, callback); - /* If we made progress, we need to process the predecessors */ + /* If we made progress, we need to process the 
predecessors */ - if (progress || !_mesa_set_search(visited, blk)) { - pan_foreach_predecessor(blk, pred) - _mesa_set_add(work_list, pred); - } + if (progress || !_mesa_set_search(visited, blk)) { + pan_foreach_predecessor(blk, pred) + _mesa_set_add(work_list, pred); + } - _mesa_set_add(visited, blk); - } while((cur = _mesa_set_next_entry(work_list, NULL)) != NULL); + _mesa_set_add(visited, blk); + } while ((cur = _mesa_set_next_entry(work_list, NULL)) != NULL); - _mesa_set_destroy(visited, NULL); - _mesa_set_destroy(work_list, NULL); + _mesa_set_destroy(visited, NULL); + _mesa_set_destroy(work_list, NULL); } void pan_free_liveness(struct list_head *blocks) { - list_for_each_entry(pan_block, block, blocks, link) { - if (block->live_in) - ralloc_free(block->live_in); + list_for_each_entry(pan_block, block, blocks, link) { + if (block->live_in) + ralloc_free(block->live_in); - if (block->live_out) - ralloc_free(block->live_out); + if (block->live_out) + ralloc_free(block->live_out); - block->live_in = NULL; - block->live_out = NULL; - } + block->live_in = NULL; + block->live_out = NULL; + } } diff --git a/src/panfrost/util/pan_lower_64bit_intrin.c b/src/panfrost/util/pan_lower_64bit_intrin.c index 7c4edcfa9d7..3730e7660c8 100644 --- a/src/panfrost/util/pan_lower_64bit_intrin.c +++ b/src/panfrost/util/pan_lower_64bit_intrin.c @@ -21,8 +21,8 @@ * SOFTWARE. */ -#include "pan_ir.h" #include "compiler/nir/nir_builder.h" +#include "pan_ir.h" /* OpenCL uses 64-bit types for some intrinsic functions, including * global_invocation_id(). This could be worked around during conversion to @@ -36,43 +36,41 @@ static bool nir_lower_64bit_intrin_instr(nir_builder *b, nir_instr *instr, void *data) { - if (instr->type != nir_instr_type_intrinsic) - return false; + if (instr->type != nir_instr_type_intrinsic) + return false; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - switch (intr->intrinsic) { - case nir_intrinsic_load_global_invocation_id: - case nir_intrinsic_load_global_invocation_id_zero_base: - case nir_intrinsic_load_workgroup_id: - case nir_intrinsic_load_num_workgroups: - break; + switch (intr->intrinsic) { + case nir_intrinsic_load_global_invocation_id: + case nir_intrinsic_load_global_invocation_id_zero_base: + case nir_intrinsic_load_workgroup_id: + case nir_intrinsic_load_num_workgroups: + break; - default: - return false; - } + default: + return false; + } - if (nir_dest_bit_size(intr->dest) != 64) - return false; + if (nir_dest_bit_size(intr->dest) != 64) + return false; - b->cursor = nir_after_instr(instr); + b->cursor = nir_after_instr(instr); - assert(intr->dest.is_ssa); - intr->dest.ssa.bit_size = 32; + assert(intr->dest.is_ssa); + intr->dest.ssa.bit_size = 32; - nir_ssa_def *conv = nir_u2u64(b, &intr->dest.ssa); + nir_ssa_def *conv = nir_u2u64(b, &intr->dest.ssa); - nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, conv, - conv->parent_instr); + nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, conv, conv->parent_instr); - return true; + return true; } bool pan_nir_lower_64bit_intrin(nir_shader *shader) { - return nir_shader_instructions_pass(shader, - nir_lower_64bit_intrin_instr, - nir_metadata_block_index | nir_metadata_dominance, - NULL); + return nir_shader_instructions_pass( + shader, nir_lower_64bit_intrin_instr, + nir_metadata_block_index | nir_metadata_dominance, NULL); } diff --git a/src/panfrost/util/pan_lower_framebuffer.c b/src/panfrost/util/pan_lower_framebuffer.c index b03090202f3..aacb9ce30d4 100644 
--- a/src/panfrost/util/pan_lower_framebuffer.c +++ b/src/panfrost/util/pan_lower_framebuffer.c @@ -47,11 +47,11 @@ * smallest precision necessary to store the pixel losslessly. */ +#include "pan_lower_framebuffer.h" #include "compiler/nir/nir.h" #include "compiler/nir/nir_builder.h" #include "compiler/nir/nir_format_convert.h" #include "util/format/u_format.h" -#include "pan_lower_framebuffer.h" /* Determines the unpacked type best suiting a given format, so the rest of the * pipeline may be adjusted accordingly */ @@ -59,54 +59,54 @@ nir_alu_type pan_unpacked_type_for_format(const struct util_format_description *desc) { - int c = util_format_get_first_non_void_channel(desc->format); + int c = util_format_get_first_non_void_channel(desc->format); - if (c == -1) - unreachable("Void format not renderable"); + if (c == -1) + unreachable("Void format not renderable"); - bool large = (desc->channel[c].size > 16); - bool large_norm = (desc->channel[c].size > 8); - bool bit8 = (desc->channel[c].size == 8); - assert(desc->channel[c].size <= 32); + bool large = (desc->channel[c].size > 16); + bool large_norm = (desc->channel[c].size > 8); + bool bit8 = (desc->channel[c].size == 8); + assert(desc->channel[c].size <= 32); - if (desc->channel[c].normalized) - return large_norm ? nir_type_float32 : nir_type_float16; + if (desc->channel[c].normalized) + return large_norm ? nir_type_float32 : nir_type_float16; - switch (desc->channel[c].type) { - case UTIL_FORMAT_TYPE_UNSIGNED: - return bit8 ? nir_type_uint8 : - large ? nir_type_uint32 : nir_type_uint16; - case UTIL_FORMAT_TYPE_SIGNED: - return bit8 ? nir_type_int8 : - large ? nir_type_int32 : nir_type_int16; - case UTIL_FORMAT_TYPE_FLOAT: - return large ? nir_type_float32 : nir_type_float16; - default: - unreachable("Format not renderable"); - } + switch (desc->channel[c].type) { + case UTIL_FORMAT_TYPE_UNSIGNED: + return bit8 ? nir_type_uint8 : large ? nir_type_uint32 : nir_type_uint16; + case UTIL_FORMAT_TYPE_SIGNED: + return bit8 ? nir_type_int8 : large ? nir_type_int32 : nir_type_int16; + case UTIL_FORMAT_TYPE_FLOAT: + return large ? nir_type_float32 : nir_type_float16; + default: + unreachable("Format not renderable"); + } } static bool -pan_is_format_native(const struct util_format_description *desc, bool broken_ld_special, bool is_store) +pan_is_format_native(const struct util_format_description *desc, + bool broken_ld_special, bool is_store) { - if (is_store || broken_ld_special) - return false; + if (is_store || broken_ld_special) + return false; - if (util_format_is_pure_integer(desc->format) || util_format_is_float(desc->format)) - return false; + if (util_format_is_pure_integer(desc->format) || + util_format_is_float(desc->format)) + return false; - /* Some formats are missing as typed but have unpacks */ - if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT) - return false; + /* Some formats are missing as typed but have unpacks */ + if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT) + return false; - if (desc->is_array) { - int c = util_format_get_first_non_void_channel(desc->format); - assert(c >= 0); - if (desc->channel[c].size > 8) - return false; - } + if (desc->is_array) { + int c = util_format_get_first_non_void_channel(desc->format); + assert(c >= 0); + if (desc->channel[c].size > 8) + return false; + } - return true; + return true; } /* Software packs/unpacks, by format class. 
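A few concrete outcomes of the rules above, as an illustrative aside (not part of the patch):

static void
example_unpacked_types(void)
{
   /* 8-bit normalized: large_norm is false, so it unpacks as fp16 */
   assert(pan_unpacked_type_for_format(
             util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM)) ==
          nir_type_float16);

   /* 16-bit pure integer stays at 16 bits */
   assert(pan_unpacked_type_for_format(
             util_format_description(PIPE_FORMAT_R16G16_UINT)) ==
          nir_type_uint16);

   /* 32-bit float unpacks as fp32 */
   assert(pan_unpacked_type_for_format(
             util_format_description(PIPE_FORMAT_R32G32B32A32_FLOAT)) ==
          nir_type_float32);
}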
Packs take in the pixel value typed @@ -117,12 +117,12 @@ pan_is_format_native(const struct util_format_description *desc, bool broken_ld_ static nir_ssa_def * pan_replicate(nir_builder *b, nir_ssa_def *v, unsigned num_components) { - nir_ssa_def *replicated[4]; + nir_ssa_def *replicated[4]; - for (unsigned i = 0; i < 4; ++i) - replicated[i] = nir_channel(b, v, i % num_components); + for (unsigned i = 0; i < 4; ++i) + replicated[i] = nir_channel(b, v, i % num_components); - return nir_vec(b, replicated, 4); + return nir_vec(b, replicated, 4); } /* Pure x16 formats are x16 unpacked, so it's similar, but we need to pack @@ -131,135 +131,129 @@ pan_replicate(nir_builder *b, nir_ssa_def *v, unsigned num_components) static nir_ssa_def * pan_pack_pure_16(nir_builder *b, nir_ssa_def *v, unsigned num_components) { - nir_ssa_def *v4 = pan_replicate(b, v, num_components); + nir_ssa_def *v4 = pan_replicate(b, v, num_components); - nir_ssa_def *lo = nir_pack_32_2x16(b, nir_channels(b, v4, 0x3 << 0)); - nir_ssa_def *hi = nir_pack_32_2x16(b, nir_channels(b, v4, 0x3 << 2)); + nir_ssa_def *lo = nir_pack_32_2x16(b, nir_channels(b, v4, 0x3 << 0)); + nir_ssa_def *hi = nir_pack_32_2x16(b, nir_channels(b, v4, 0x3 << 2)); - return nir_vec4(b, lo, hi, lo, hi); + return nir_vec4(b, lo, hi, lo, hi); } static nir_ssa_def * pan_unpack_pure_16(nir_builder *b, nir_ssa_def *pack, unsigned num_components) { - nir_ssa_def *unpacked[4]; + nir_ssa_def *unpacked[4]; - assert(num_components <= 4); + assert(num_components <= 4); - for (unsigned i = 0; i < num_components; i += 2) { - nir_ssa_def *halves = - nir_unpack_32_2x16(b, nir_channel(b, pack, i >> 1)); + for (unsigned i = 0; i < num_components; i += 2) { + nir_ssa_def *halves = nir_unpack_32_2x16(b, nir_channel(b, pack, i >> 1)); - unpacked[i + 0] = nir_channel(b, halves, 0); - unpacked[i + 1] = nir_channel(b, halves, 1); - } + unpacked[i + 0] = nir_channel(b, halves, 0); + unpacked[i + 1] = nir_channel(b, halves, 1); + } - return nir_pad_vec4(b, nir_vec(b, unpacked, num_components)); + return nir_pad_vec4(b, nir_vec(b, unpacked, num_components)); } static nir_ssa_def * -pan_pack_reorder(nir_builder *b, - const struct util_format_description *desc, +pan_pack_reorder(nir_builder *b, const struct util_format_description *desc, nir_ssa_def *v) { - unsigned swizzle[4] = { 0, 1, 2, 3 }; + unsigned swizzle[4] = {0, 1, 2, 3}; - for (unsigned i = 0; i < v->num_components; i++) { - if (desc->swizzle[i] <= PIPE_SWIZZLE_W) - swizzle[i] = desc->swizzle[i]; - } + for (unsigned i = 0; i < v->num_components; i++) { + if (desc->swizzle[i] <= PIPE_SWIZZLE_W) + swizzle[i] = desc->swizzle[i]; + } - return nir_swizzle(b, v, swizzle, v->num_components); + return nir_swizzle(b, v, swizzle, v->num_components); } static nir_ssa_def * -pan_unpack_reorder(nir_builder *b, - const struct util_format_description *desc, +pan_unpack_reorder(nir_builder *b, const struct util_format_description *desc, nir_ssa_def *v) { - unsigned swizzle[4] = { 0, 1, 2, 3 }; + unsigned swizzle[4] = {0, 1, 2, 3}; - for (unsigned i = 0; i < v->num_components; i++) { - if (desc->swizzle[i] <= PIPE_SWIZZLE_W) - swizzle[desc->swizzle[i]] = i; - } + for (unsigned i = 0; i < v->num_components; i++) { + if (desc->swizzle[i] <= PIPE_SWIZZLE_W) + swizzle[desc->swizzle[i]] = i; + } - return nir_swizzle(b, v, swizzle, v->num_components); + return nir_swizzle(b, v, swizzle, v->num_components); } static nir_ssa_def * pan_replicate_4(nir_builder *b, nir_ssa_def *v) { - return nir_vec4(b, v, v, v, v); + return nir_vec4(b, v, v, v, v); 
} static nir_ssa_def * pan_pack_pure_8(nir_builder *b, nir_ssa_def *v, unsigned num_components) { - return pan_replicate_4(b, nir_pack_32_4x8(b, pan_replicate(b, v, num_components))); + return pan_replicate_4( + b, nir_pack_32_4x8(b, pan_replicate(b, v, num_components))); } static nir_ssa_def * pan_unpack_pure_8(nir_builder *b, nir_ssa_def *pack, unsigned num_components) { - nir_ssa_def *unpacked = nir_unpack_32_4x8(b, nir_channel(b, pack, 0)); - return nir_channels(b, unpacked, (1 << num_components) - 1); + nir_ssa_def *unpacked = nir_unpack_32_4x8(b, nir_channel(b, pack, 0)); + return nir_channels(b, unpacked, (1 << num_components) - 1); } static nir_ssa_def * pan_fsat(nir_builder *b, nir_ssa_def *v, bool is_signed) { - if (is_signed) - return nir_fsat_signed_mali(b, v); - else - return nir_fsat(b, v); + if (is_signed) + return nir_fsat_signed_mali(b, v); + else + return nir_fsat(b, v); } static float norm_scale(bool snorm, unsigned bits) { - if (snorm) - return (1 << (bits - 1)) - 1; - else - return (1 << bits) - 1; + if (snorm) + return (1 << (bits - 1)) - 1; + else + return (1 << bits) - 1; } /* For <= 8-bits per channel, [U,S]NORM formats are packed like [U,S]NORM 8, * with zeroes spacing out each component as needed */ static nir_ssa_def * -pan_pack_norm(nir_builder *b, nir_ssa_def *v, - unsigned x, unsigned y, unsigned z, unsigned w, - bool is_signed) +pan_pack_norm(nir_builder *b, nir_ssa_def *v, unsigned x, unsigned y, + unsigned z, unsigned w, bool is_signed) { - /* If a channel has N bits, 1.0 is encoded as 2^N - 1 for UNORMs and - * 2^(N-1) - 1 for SNORMs */ - nir_ssa_def *scales = - is_signed ? - nir_imm_vec4_16(b, - (1 << (x - 1)) - 1, (1 << (y - 1)) - 1, - (1 << (z - 1)) - 1, (1 << (w - 1)) - 1) : - nir_imm_vec4_16(b, - (1 << x) - 1, (1 << y) - 1, - (1 << z) - 1, (1 << w) - 1); + /* If a channel has N bits, 1.0 is encoded as 2^N - 1 for UNORMs and + * 2^(N-1) - 1 for SNORMs */ + nir_ssa_def *scales = + is_signed ? 
nir_imm_vec4_16(b, (1 << (x - 1)) - 1, (1 << (y - 1)) - 1, + (1 << (z - 1)) - 1, (1 << (w - 1)) - 1) + : nir_imm_vec4_16(b, (1 << x) - 1, (1 << y) - 1, (1 << z) - 1, + (1 << w) - 1); - /* If a channel has N bits, we pad out to the byte by (8 - N) bits */ - nir_ssa_def *shifts = nir_imm_ivec4(b, 8 - x, 8 - y, 8 - z, 8 - w); - nir_ssa_def *clamped = pan_fsat(b, nir_pad_vec4(b, v), is_signed); + /* If a channel has N bits, we pad out to the byte by (8 - N) bits */ + nir_ssa_def *shifts = nir_imm_ivec4(b, 8 - x, 8 - y, 8 - z, 8 - w); + nir_ssa_def *clamped = pan_fsat(b, nir_pad_vec4(b, v), is_signed); - nir_ssa_def *f = nir_fmul(b, clamped, scales); - nir_ssa_def *u8 = nir_f2u8(b, nir_fround_even(b, f)); - nir_ssa_def *s = nir_ishl(b, u8, shifts); - nir_ssa_def *repl = nir_pack_32_4x8(b, s); + nir_ssa_def *f = nir_fmul(b, clamped, scales); + nir_ssa_def *u8 = nir_f2u8(b, nir_fround_even(b, f)); + nir_ssa_def *s = nir_ishl(b, u8, shifts); + nir_ssa_def *repl = nir_pack_32_4x8(b, s); - return pan_replicate_4(b, repl); + return pan_replicate_4(b, repl); } static nir_ssa_def * -pan_pack_unorm(nir_builder *b, nir_ssa_def *v, - unsigned x, unsigned y, unsigned z, unsigned w) +pan_pack_unorm(nir_builder *b, nir_ssa_def *v, unsigned x, unsigned y, + unsigned z, unsigned w) { - return pan_pack_norm(b, v, x, y, z, w, false); + return pan_pack_norm(b, v, x, y, z, w, false); } /* RGB10_A2 is packed in the tilebuffer as the bottom 3 bytes being the top @@ -269,25 +263,26 @@ pan_pack_unorm(nir_builder *b, nir_ssa_def *v, static nir_ssa_def * pan_pack_unorm_1010102(nir_builder *b, nir_ssa_def *v) { - nir_ssa_def *scale = nir_imm_vec4(b, 1023.0, 1023.0, 1023.0, 3.0); - nir_ssa_def *s = nir_f2u32(b, nir_fround_even(b, nir_fmul(b, nir_fsat(b, v), scale))); + nir_ssa_def *scale = nir_imm_vec4(b, 1023.0, 1023.0, 1023.0, 3.0); + nir_ssa_def *s = + nir_f2u32(b, nir_fround_even(b, nir_fmul(b, nir_fsat(b, v), scale))); - nir_ssa_def *top8 = nir_ushr(b, s, nir_imm_ivec4(b, 0x2, 0x2, 0x2, 0x2)); - nir_ssa_def *top8_rgb = nir_pack_32_4x8(b, nir_u2u8(b, top8)); + nir_ssa_def *top8 = nir_ushr(b, s, nir_imm_ivec4(b, 0x2, 0x2, 0x2, 0x2)); + nir_ssa_def *top8_rgb = nir_pack_32_4x8(b, nir_u2u8(b, top8)); - nir_ssa_def *bottom2 = nir_iand(b, s, nir_imm_ivec4(b, 0x3, 0x3, 0x3, 0x3)); + nir_ssa_def *bottom2 = nir_iand(b, s, nir_imm_ivec4(b, 0x3, 0x3, 0x3, 0x3)); - nir_ssa_def *top = - nir_ior(b, - nir_ior(b, - nir_ishl(b, nir_channel(b, bottom2, 0), nir_imm_int(b, 24 + 0)), - nir_ishl(b, nir_channel(b, bottom2, 1), nir_imm_int(b, 24 + 2))), - nir_ior(b, - nir_ishl(b, nir_channel(b, bottom2, 2), nir_imm_int(b, 24 + 4)), - nir_ishl(b, nir_channel(b, bottom2, 3), nir_imm_int(b, 24 + 6)))); + nir_ssa_def *top = nir_ior( + b, + nir_ior(b, + nir_ishl(b, nir_channel(b, bottom2, 0), nir_imm_int(b, 24 + 0)), + nir_ishl(b, nir_channel(b, bottom2, 1), nir_imm_int(b, 24 + 2))), + nir_ior(b, + nir_ishl(b, nir_channel(b, bottom2, 2), nir_imm_int(b, 24 + 4)), + nir_ishl(b, nir_channel(b, bottom2, 3), nir_imm_int(b, 24 + 6)))); - nir_ssa_def *p = nir_ior(b, top, top8_rgb); - return pan_replicate_4(b, p); + nir_ssa_def *p = nir_ior(b, top, top8_rgb); + return pan_replicate_4(b, p); } /* On the other hand, the pure int RGB10_A2 is identical to the spec */ @@ -295,41 +290,40 @@ pan_pack_unorm_1010102(nir_builder *b, nir_ssa_def *v) static nir_ssa_def * pan_pack_int_1010102(nir_builder *b, nir_ssa_def *v, bool is_signed) { - v = nir_u2u32(b, v); + v = nir_u2u32(b, v); - /* Clamp the values */ - if (is_signed) { - v = nir_imin(b, v, nir_imm_ivec4(b, 
511, 511, 511, 1)); - v = nir_imax(b, v, nir_imm_ivec4(b, -512, -512, -512, -2)); - } else { - v = nir_umin(b, v, nir_imm_ivec4(b, 1023, 1023, 1023, 3)); - } + /* Clamp the values */ + if (is_signed) { + v = nir_imin(b, v, nir_imm_ivec4(b, 511, 511, 511, 1)); + v = nir_imax(b, v, nir_imm_ivec4(b, -512, -512, -512, -2)); + } else { + v = nir_umin(b, v, nir_imm_ivec4(b, 1023, 1023, 1023, 3)); + } - v = nir_ishl(b, v, nir_imm_ivec4(b, 0, 10, 20, 30)); - v = nir_ior(b, - nir_ior(b, nir_channel(b, v, 0), nir_channel(b, v, 1)), - nir_ior(b, nir_channel(b, v, 2), nir_channel(b, v, 3))); + v = nir_ishl(b, v, nir_imm_ivec4(b, 0, 10, 20, 30)); + v = nir_ior(b, nir_ior(b, nir_channel(b, v, 0), nir_channel(b, v, 1)), + nir_ior(b, nir_channel(b, v, 2), nir_channel(b, v, 3))); - return pan_replicate_4(b, v); + return pan_replicate_4(b, v); } static nir_ssa_def * pan_unpack_int_1010102(nir_builder *b, nir_ssa_def *packed, bool is_signed) { - nir_ssa_def *v = pan_replicate_4(b, nir_channel(b, packed, 0)); + nir_ssa_def *v = pan_replicate_4(b, nir_channel(b, packed, 0)); - /* Left shift all components so the sign bit is on the MSB, and - * can be extended by ishr(). The ishl()+[u,i]shr() combination - * sets all unused bits to 0 without requiring a mask. - */ - v = nir_ishl(b, v, nir_imm_ivec4(b, 22, 12, 2, 0)); + /* Left shift all components so the sign bit is on the MSB, and + * can be extended by ishr(). The ishl()+[u,i]shr() combination + * sets all unused bits to 0 without requiring a mask. + */ + v = nir_ishl(b, v, nir_imm_ivec4(b, 22, 12, 2, 0)); - if (is_signed) - v = nir_ishr(b, v, nir_imm_ivec4(b, 22, 22, 22, 30)); - else - v = nir_ushr(b, v, nir_imm_ivec4(b, 22, 22, 22, 30)); + if (is_signed) + v = nir_ishr(b, v, nir_imm_ivec4(b, 22, 22, 22, 30)); + else + v = nir_ushr(b, v, nir_imm_ivec4(b, 22, 22, 22, 30)); - return nir_i2i16(b, v); + return nir_i2i16(b, v); } /* NIR means we can *finally* catch a break */ @@ -337,25 +331,21 @@ pan_unpack_int_1010102(nir_builder *b, nir_ssa_def *packed, bool is_signed) static nir_ssa_def * pan_pack_r11g11b10(nir_builder *b, nir_ssa_def *v) { - return pan_replicate_4(b, nir_format_pack_11f11f10f(b, - nir_f2f32(b, v))); + return pan_replicate_4(b, nir_format_pack_11f11f10f(b, nir_f2f32(b, v))); } static nir_ssa_def * pan_unpack_r11g11b10(nir_builder *b, nir_ssa_def *v) { - nir_ssa_def *f32 = nir_format_unpack_11f11f10f(b, nir_channel(b, v, 0)); - nir_ssa_def *f16 = nir_f2fmp(b, f32); + nir_ssa_def *f32 = nir_format_unpack_11f11f10f(b, nir_channel(b, v, 0)); + nir_ssa_def *f16 = nir_f2fmp(b, f32); - /* Extend to vec4 with alpha */ - nir_ssa_def *components[4] = { - nir_channel(b, f16, 0), - nir_channel(b, f16, 1), - nir_channel(b, f16, 2), - nir_imm_float16(b, 1.0) - }; + /* Extend to vec4 with alpha */ + nir_ssa_def *components[4] = {nir_channel(b, f16, 0), nir_channel(b, f16, 1), + nir_channel(b, f16, 2), + nir_imm_float16(b, 1.0)}; - return nir_vec(b, components, 4); + return nir_vec(b, components, 4); } /* Wrapper around sRGB conversion */ @@ -363,300 +353,294 @@ pan_unpack_r11g11b10(nir_builder *b, nir_ssa_def *v) static nir_ssa_def * pan_linear_to_srgb(nir_builder *b, nir_ssa_def *linear) { - nir_ssa_def *rgb = nir_channels(b, linear, 0x7); + nir_ssa_def *rgb = nir_channels(b, linear, 0x7); - /* TODO: fp16 native conversion */ - nir_ssa_def *srgb = nir_f2fmp(b, - nir_format_linear_to_srgb(b, nir_f2f32(b, rgb))); + /* TODO: fp16 native conversion */ + nir_ssa_def *srgb = + nir_f2fmp(b, nir_format_linear_to_srgb(b, nir_f2f32(b, rgb))); - nir_ssa_def *comp[4] 
= { - nir_channel(b, srgb, 0), - nir_channel(b, srgb, 1), - nir_channel(b, srgb, 2), - nir_channel(b, linear, 3), - }; + nir_ssa_def *comp[4] = { + nir_channel(b, srgb, 0), + nir_channel(b, srgb, 1), + nir_channel(b, srgb, 2), + nir_channel(b, linear, 3), + }; - return nir_vec(b, comp, 4); + return nir_vec(b, comp, 4); } static nir_ssa_def * pan_unpack_pure(nir_builder *b, nir_ssa_def *packed, unsigned size, unsigned nr) { - switch (size) { - case 32: - return nir_trim_vector(b, packed, nr); - case 16: - return pan_unpack_pure_16(b, packed, nr); - case 8: - return pan_unpack_pure_8(b, packed, nr); - default: - unreachable("Unrenderable size"); - } + switch (size) { + case 32: + return nir_trim_vector(b, packed, nr); + case 16: + return pan_unpack_pure_16(b, packed, nr); + case 8: + return pan_unpack_pure_8(b, packed, nr); + default: + unreachable("Unrenderable size"); + } } /* Generic dispatches for un/pack regardless of format */ static nir_ssa_def * -pan_unpack(nir_builder *b, - const struct util_format_description *desc, - nir_ssa_def *packed) +pan_unpack(nir_builder *b, const struct util_format_description *desc, + nir_ssa_def *packed) { - if (desc->is_array) { - int c = util_format_get_first_non_void_channel(desc->format); - assert(c >= 0); - struct util_format_channel_description d = desc->channel[c]; - nir_ssa_def *unpacked = pan_unpack_pure(b, packed, d.size, desc->nr_channels); + if (desc->is_array) { + int c = util_format_get_first_non_void_channel(desc->format); + assert(c >= 0); + struct util_format_channel_description d = desc->channel[c]; + nir_ssa_def *unpacked = + pan_unpack_pure(b, packed, d.size, desc->nr_channels); - /* Normalized formats are unpacked as integers. We need to - * convert to float for the final result. - */ - if (d.normalized) { - bool snorm = desc->is_snorm; - unsigned float_sz = (d.size <= 8 ? 16 : 32); - float multiplier = norm_scale(snorm, d.size); + /* Normalized formats are unpacked as integers. We need to + * convert to float for the final result. + */ + if (d.normalized) { + bool snorm = desc->is_snorm; + unsigned float_sz = (d.size <= 8 ? 16 : 32); + float multiplier = norm_scale(snorm, d.size); - nir_ssa_def *as_float = - snorm ? nir_i2fN(b, unpacked, float_sz) : - nir_u2fN(b, unpacked, float_sz); + nir_ssa_def *as_float = snorm ? 
nir_i2fN(b, unpacked, float_sz) + : nir_u2fN(b, unpacked, float_sz); - return nir_fmul_imm(b, as_float, 1.0 / multiplier); - } else { - return unpacked; - } - } + return nir_fmul_imm(b, as_float, 1.0 / multiplier); + } else { + return unpacked; + } + } - switch (desc->format) { - case PIPE_FORMAT_R10G10B10A2_UINT: - case PIPE_FORMAT_B10G10R10A2_UINT: - return pan_unpack_int_1010102(b, packed, false); - case PIPE_FORMAT_R10G10B10A2_SINT: - case PIPE_FORMAT_B10G10R10A2_SINT: - return pan_unpack_int_1010102(b, packed, true); - case PIPE_FORMAT_R11G11B10_FLOAT: - return pan_unpack_r11g11b10(b, packed); - default: - break; - } + switch (desc->format) { + case PIPE_FORMAT_R10G10B10A2_UINT: + case PIPE_FORMAT_B10G10R10A2_UINT: + return pan_unpack_int_1010102(b, packed, false); + case PIPE_FORMAT_R10G10B10A2_SINT: + case PIPE_FORMAT_B10G10R10A2_SINT: + return pan_unpack_int_1010102(b, packed, true); + case PIPE_FORMAT_R11G11B10_FLOAT: + return pan_unpack_r11g11b10(b, packed); + default: + break; + } - fprintf(stderr, "%s\n", desc->name); - unreachable("Unknown format"); + fprintf(stderr, "%s\n", desc->name); + unreachable("Unknown format"); } -static nir_ssa_def * -pan_pack(nir_builder *b, - const struct util_format_description *desc, - nir_ssa_def *unpacked) +static nir_ssa_def *pan_pack(nir_builder *b, + const struct util_format_description *desc, + nir_ssa_def * unpacked) { - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) - unpacked = pan_linear_to_srgb(b, unpacked); + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) + unpacked = pan_linear_to_srgb(b, unpacked); - if (desc->is_array) { - int c = util_format_get_first_non_void_channel(desc->format); - assert(c >= 0); - struct util_format_channel_description d = desc->channel[c]; + if (desc->is_array) { + int c = util_format_get_first_non_void_channel(desc->format); + assert(c >= 0); + struct util_format_channel_description d = desc->channel[c]; - /* Pure formats are packed as-is */ - nir_ssa_def *raw = unpacked; + /* Pure formats are packed as-is */ + nir_ssa_def *raw = unpacked; - /* Normalized formats get normalized first */ - if (d.normalized) { - bool snorm = desc->is_snorm; - float multiplier = norm_scale(snorm, d.size); - nir_ssa_def *clamped = pan_fsat(b, unpacked, snorm); - nir_ssa_def *normed = nir_fmul_imm(b, clamped, multiplier); + /* Normalized formats get normalized first */ + if (d.normalized) { + bool snorm = desc->is_snorm; + float multiplier = norm_scale(snorm, d.size); + nir_ssa_def *clamped = pan_fsat(b, unpacked, snorm); + nir_ssa_def *normed = nir_fmul_imm(b, clamped, multiplier); - raw = nir_f2uN(b, normed, d.size); - } + raw = nir_f2uN(b, normed, d.size); + } - /* Pack the raw format */ - switch (d.size) { - case 32: - return pan_replicate(b, raw, desc->nr_channels); - case 16: - return pan_pack_pure_16(b, raw, desc->nr_channels); - case 8: - return pan_pack_pure_8(b, raw, desc->nr_channels); - default: - unreachable("Unrenderable size"); - } - } + /* Pack the raw format */ + switch (d.size) { + case 32: + return pan_replicate(b, raw, desc->nr_channels); + case 16: + return pan_pack_pure_16(b, raw, desc->nr_channels); + case 8: + return pan_pack_pure_8(b, raw, desc->nr_channels); + default: + unreachable("Unrenderable size"); + } + } - switch (desc->format) { - case PIPE_FORMAT_B4G4R4A4_UNORM: - case PIPE_FORMAT_B4G4R4X4_UNORM: - case PIPE_FORMAT_A4R4_UNORM: - case PIPE_FORMAT_R4A4_UNORM: - case PIPE_FORMAT_A4B4G4R4_UNORM: - case PIPE_FORMAT_R4G4B4A4_UNORM: - return pan_pack_unorm(b, unpacked, 4, 4, 4, 4); - case 
PIPE_FORMAT_B5G5R5A1_UNORM: - case PIPE_FORMAT_R5G5B5A1_UNORM: - return pan_pack_unorm(b, unpacked, 5, 6, 5, 1); - case PIPE_FORMAT_R5G6B5_UNORM: - case PIPE_FORMAT_B5G6R5_UNORM: - return pan_pack_unorm(b, unpacked, 5, 6, 5, 0); - case PIPE_FORMAT_R10G10B10A2_UNORM: - case PIPE_FORMAT_B10G10R10A2_UNORM: - return pan_pack_unorm_1010102(b, unpacked); - case PIPE_FORMAT_R10G10B10A2_UINT: - case PIPE_FORMAT_B10G10R10A2_UINT: - return pan_pack_int_1010102(b, unpacked, false); - case PIPE_FORMAT_R10G10B10A2_SINT: - case PIPE_FORMAT_B10G10R10A2_SINT: - return pan_pack_int_1010102(b, unpacked, true); - case PIPE_FORMAT_R11G11B10_FLOAT: - return pan_pack_r11g11b10(b, unpacked); - default: - break; - } + switch (desc->format) { + case PIPE_FORMAT_B4G4R4A4_UNORM: + case PIPE_FORMAT_B4G4R4X4_UNORM: + case PIPE_FORMAT_A4R4_UNORM: + case PIPE_FORMAT_R4A4_UNORM: + case PIPE_FORMAT_A4B4G4R4_UNORM: + case PIPE_FORMAT_R4G4B4A4_UNORM: + return pan_pack_unorm(b, unpacked, 4, 4, 4, 4); + case PIPE_FORMAT_B5G5R5A1_UNORM: + case PIPE_FORMAT_R5G5B5A1_UNORM: + return pan_pack_unorm(b, unpacked, 5, 6, 5, 1); + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_B5G6R5_UNORM: + return pan_pack_unorm(b, unpacked, 5, 6, 5, 0); + case PIPE_FORMAT_R10G10B10A2_UNORM: + case PIPE_FORMAT_B10G10R10A2_UNORM: + return pan_pack_unorm_1010102(b, unpacked); + case PIPE_FORMAT_R10G10B10A2_UINT: + case PIPE_FORMAT_B10G10R10A2_UINT: + return pan_pack_int_1010102(b, unpacked, false); + case PIPE_FORMAT_R10G10B10A2_SINT: + case PIPE_FORMAT_B10G10R10A2_SINT: + return pan_pack_int_1010102(b, unpacked, true); + case PIPE_FORMAT_R11G11B10_FLOAT: + return pan_pack_r11g11b10(b, unpacked); + default: + break; + } - fprintf(stderr, "%s\n", desc->name); - unreachable("Unknown format"); + fprintf(stderr, "%s\n", desc->name); + unreachable("Unknown format"); } static void -pan_lower_fb_store(nir_shader *shader, - nir_builder *b, - nir_intrinsic_instr *intr, - const struct util_format_description *desc, - bool reorder_comps) +pan_lower_fb_store(nir_shader *shader, nir_builder *b, + nir_intrinsic_instr *intr, + const struct util_format_description *desc, + bool reorder_comps) { - /* For stores, add conversion before */ - nir_ssa_def *unpacked = - nir_ssa_for_src(b, intr->src[1], intr->num_components); - unpacked = nir_pad_vec4(b, unpacked); + /* For stores, add conversion before */ + nir_ssa_def *unpacked = + nir_ssa_for_src(b, intr->src[1], intr->num_components); + unpacked = nir_pad_vec4(b, unpacked); - /* Re-order the components */ - if (reorder_comps) - unpacked = pan_pack_reorder(b, desc, unpacked); + /* Re-order the components */ + if (reorder_comps) + unpacked = pan_pack_reorder(b, desc, unpacked); - nir_ssa_def *packed = pan_pack(b, desc, unpacked); + nir_ssa_def *packed = pan_pack(b, desc, unpacked); - nir_store_raw_output_pan(b, packed); + nir_store_raw_output_pan(b, packed); } static nir_ssa_def * pan_sample_id(nir_builder *b, int sample) { - return (sample >= 0) ? nir_imm_int(b, sample) : nir_load_sample_id(b); + return (sample >= 0) ? 
nir_imm_int(b, sample) : nir_load_sample_id(b); } static void -pan_lower_fb_load(nir_shader *shader, - nir_builder *b, - nir_intrinsic_instr *intr, - const struct util_format_description *desc, - bool reorder_comps, - int sample) +pan_lower_fb_load(nir_shader *shader, nir_builder *b, nir_intrinsic_instr *intr, + const struct util_format_description *desc, + bool reorder_comps, int sample) { - nir_io_semantics sem = { - .location = nir_intrinsic_get_var(intr, 0)->data.location, - }; + nir_io_semantics sem = { + .location = nir_intrinsic_get_var(intr, 0)->data.location, + }; - nir_ssa_def *packed = - nir_load_raw_output_pan(b, 4, 32, pan_sample_id(b, sample), - .io_semantics = sem); + nir_ssa_def *packed = nir_load_raw_output_pan( + b, 4, 32, pan_sample_id(b, sample), .io_semantics = sem); - /* Convert the raw value */ - nir_ssa_def *unpacked = pan_unpack(b, desc, packed); + /* Convert the raw value */ + nir_ssa_def *unpacked = pan_unpack(b, desc, packed); - /* Convert to the size of the load intrinsic. - * - * We can assume that the type will match with the framebuffer format: - * - * Page 170 of the PDF of the OpenGL ES 3.0.6 spec says: - * - * If [UNORM or SNORM, convert to fixed-point]; otherwise no type - * conversion is applied. If the values written by the fragment shader - * do not match the format(s) of the corresponding color buffer(s), - * the result is undefined. - */ + /* Convert to the size of the load intrinsic. + * + * We can assume that the type will match with the framebuffer format: + * + * Page 170 of the PDF of the OpenGL ES 3.0.6 spec says: + * + * If [UNORM or SNORM, convert to fixed-point]; otherwise no type + * conversion is applied. If the values written by the fragment shader + * do not match the format(s) of the corresponding color buffer(s), + * the result is undefined. 
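Concretely, an illustrative case (not from the patch):

/*
 * Example: if RT0 is R8G8B8A8_UNORM, pan_unpack() above yields an fp16
 * vec4. When the shader reads the output at 16 bits (mediump), the
 * nir_convert_to_bit_size() below is a no-op; for a 32-bit destination it
 * inserts an f2f32 before the uses are rewritten.
 */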
+ */ - unsigned bits = nir_dest_bit_size(intr->dest); + unsigned bits = nir_dest_bit_size(intr->dest); - nir_alu_type src_type = nir_alu_type_get_base_type( - pan_unpacked_type_for_format(desc)); + nir_alu_type src_type = + nir_alu_type_get_base_type(pan_unpacked_type_for_format(desc)); - unpacked = nir_convert_to_bit_size(b, unpacked, src_type, bits); - unpacked = nir_resize_vector(b, unpacked, intr->dest.ssa.num_components); + unpacked = nir_convert_to_bit_size(b, unpacked, src_type, bits); + unpacked = nir_resize_vector(b, unpacked, intr->dest.ssa.num_components); - /* Reorder the components */ - if (reorder_comps) - unpacked = pan_unpack_reorder(b, desc, unpacked); + /* Reorder the components */ + if (reorder_comps) + unpacked = pan_unpack_reorder(b, desc, unpacked); - nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, unpacked, &intr->instr); + nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, unpacked, &intr->instr); } bool pan_lower_framebuffer(nir_shader *shader, const enum pipe_format *rt_fmts, - uint8_t raw_fmt_mask, bool is_blend, bool broken_ld_special) + uint8_t raw_fmt_mask, bool is_blend, + bool broken_ld_special) { - if (shader->info.stage != MESA_SHADER_FRAGMENT) - return false; + if (shader->info.stage != MESA_SHADER_FRAGMENT) + return false; - bool progress = false; + bool progress = false; - nir_foreach_function(func, shader) { - nir_foreach_block(block, func->impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; + nir_foreach_function(func, shader) { + nir_foreach_block(block, func->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - bool is_load = intr->intrinsic == nir_intrinsic_load_deref; - bool is_store = intr->intrinsic == nir_intrinsic_store_deref; + bool is_load = intr->intrinsic == nir_intrinsic_load_deref; + bool is_store = intr->intrinsic == nir_intrinsic_store_deref; - if (!(is_load || (is_store && is_blend))) - continue; + if (!(is_load || (is_store && is_blend))) + continue; - nir_variable *var = nir_intrinsic_get_var(intr, 0); + nir_variable *var = nir_intrinsic_get_var(intr, 0); - if (var->data.mode != nir_var_shader_out) - continue; + if (var->data.mode != nir_var_shader_out) + continue; - if (var->data.location < FRAG_RESULT_DATA0) - continue; + if (var->data.location < FRAG_RESULT_DATA0) + continue; - unsigned rt = var->data.location - FRAG_RESULT_DATA0; + unsigned rt = var->data.location - FRAG_RESULT_DATA0; - if (rt_fmts[rt] == PIPE_FORMAT_NONE) - continue; + if (rt_fmts[rt] == PIPE_FORMAT_NONE) + continue; - const struct util_format_description *desc = - util_format_description(rt_fmts[rt]); + const struct util_format_description *desc = + util_format_description(rt_fmts[rt]); - /* Don't lower */ - if (pan_is_format_native(desc, broken_ld_special, is_store)) - continue; + /* Don't lower */ + if (pan_is_format_native(desc, broken_ld_special, is_store)) + continue; - /* EXT_shader_framebuffer_fetch requires - * per-sample loads. - * MSAA blend shaders are not yet handled, so - * for now always load sample 0. */ - int sample = is_blend ? 0 : -1; - bool reorder_comps = raw_fmt_mask & BITFIELD_BIT(rt); + /* EXT_shader_framebuffer_fetch requires + * per-sample loads. + * MSAA blend shaders are not yet handled, so + * for now always load sample 0. */ + int sample = is_blend ? 
0 : -1; + bool reorder_comps = raw_fmt_mask & BITFIELD_BIT(rt); - nir_builder b; - nir_builder_init(&b, func->impl); + nir_builder b; + nir_builder_init(&b, func->impl); - if (is_store) { - b.cursor = nir_before_instr(instr); - pan_lower_fb_store(shader, &b, intr, desc, reorder_comps); - } else { - b.cursor = nir_after_instr(instr); - pan_lower_fb_load(shader, &b, intr, desc, reorder_comps, sample); - } + if (is_store) { + b.cursor = nir_before_instr(instr); + pan_lower_fb_store(shader, &b, intr, desc, reorder_comps); + } else { + b.cursor = nir_after_instr(instr); + pan_lower_fb_load(shader, &b, intr, desc, reorder_comps, sample); + } - nir_instr_remove(instr); + nir_instr_remove(instr); - progress = true; - } - } + progress = true; + } + } - nir_metadata_preserve(func->impl, nir_metadata_block_index | - nir_metadata_dominance); - } + nir_metadata_preserve(func->impl, + nir_metadata_block_index | nir_metadata_dominance); + } - return progress; + return progress; } diff --git a/src/panfrost/util/pan_lower_framebuffer.h b/src/panfrost/util/pan_lower_framebuffer.h index aab8e4bcdef..96f711a9d43 100644 --- a/src/panfrost/util/pan_lower_framebuffer.h +++ b/src/panfrost/util/pan_lower_framebuffer.h @@ -30,11 +30,11 @@ #include "compiler/nir/nir.h" #include "util/format/u_format.h" -nir_alu_type pan_unpacked_type_for_format(const struct util_format_description *desc); +nir_alu_type +pan_unpacked_type_for_format(const struct util_format_description *desc); -bool pan_lower_framebuffer(nir_shader *shader, - const enum pipe_format *rt_fmts, - uint8_t raw_fmt_mask, - bool is_blend, bool broken_ld_special); +bool pan_lower_framebuffer(nir_shader *shader, const enum pipe_format *rt_fmts, + uint8_t raw_fmt_mask, bool is_blend, + bool broken_ld_special); #endif diff --git a/src/panfrost/util/pan_lower_helper_invocation.c b/src/panfrost/util/pan_lower_helper_invocation.c index 23a37a15dd3..9e31414a262 100644 --- a/src/panfrost/util/pan_lower_helper_invocation.c +++ b/src/panfrost/util/pan_lower_helper_invocation.c @@ -21,8 +21,8 @@ * SOFTWARE. 
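For reference, a sketch of a possible driver call site; the values here are hypothetical and not taken from the patch:

/* One entry per render target; PIPE_FORMAT_NONE leaves that RT untouched */
enum pipe_format rt_fmts[PIPE_MAX_COLOR_BUFS] = {
   PIPE_FORMAT_R5G6B5_UNORM, /* RT0 gets software pack/unpack */
};

/* raw_fmt_mask = 0: no RT needs component reordering;
 * is_blend = false: lowering a regular fragment shader;
 * broken_ld_special = true on GPUs without usable typed loads. */
NIR_PASS_V(nir, pan_lower_framebuffer, rt_fmts, 0, false, true);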
*/ -#include "pan_ir.h" #include "compiler/nir/nir_builder.h" +#include "pan_ir.h" /* Lower gl_HelperInvocation to (gl_SampleMaskIn == 0), this depends on * architectural details but is required for correct operation with @@ -32,27 +32,26 @@ static bool pan_lower_helper_invocation_instr(nir_builder *b, nir_instr *instr, void *data) { - if (instr->type != nir_instr_type_intrinsic) - return false; + if (instr->type != nir_instr_type_intrinsic) + return false; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_load_helper_invocation) - return false; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_load_helper_invocation) + return false; - b->cursor = nir_before_instr(instr); + b->cursor = nir_before_instr(instr); - nir_ssa_def *mask = nir_load_sample_mask_in(b); - nir_ssa_def *eq = nir_ieq(b, mask, nir_imm_int(b, 0)); - nir_ssa_def_rewrite_uses(&intr->dest.ssa, eq); + nir_ssa_def *mask = nir_load_sample_mask_in(b); + nir_ssa_def *eq = nir_ieq(b, mask, nir_imm_int(b, 0)); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, eq); - return true; + return true; } bool pan_lower_helper_invocation(nir_shader *shader) { - return nir_shader_instructions_pass(shader, - pan_lower_helper_invocation_instr, - nir_metadata_block_index | nir_metadata_dominance, - NULL); + return nir_shader_instructions_pass( + shader, pan_lower_helper_invocation_instr, + nir_metadata_block_index | nir_metadata_dominance, NULL); } diff --git a/src/panfrost/util/pan_lower_sample_position.c b/src/panfrost/util/pan_lower_sample_position.c index 12a0c47bbff..f51b9c9689c 100644 --- a/src/panfrost/util/pan_lower_sample_position.c +++ b/src/panfrost/util/pan_lower_sample_position.c @@ -21,8 +21,8 @@ * SOFTWARE. */ -#include "pan_ir.h" #include "compiler/nir/nir_builder.h" +#include "pan_ir.h" /* Sample positions are supplied in a packed 8:8 fixed-point vec2 format in GPU * memory indexed by the sample. 
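As a worked example of the 8:8 fixed-point decode this pass performs (illustrative, not from the patch):

static inline float
example_decode_sample_pos_8_8(int16_t raw)
{
   /* Mirrors the nir_i2f16 + nir_fmul_imm(..., 1.0 / 256.0) sequence below:
    * 0x0080 (128) decodes to 0.5 of a pixel, 0x00C0 (192) to 0.75. */
   return raw / 256.0f;
}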
We lower in NIR to take advantage of possible @@ -33,43 +33,42 @@ * it's a pretty trivial difference */ static bool -pan_lower_sample_pos_impl(struct nir_builder *b, - nir_instr *instr, UNUSED void *data) +pan_lower_sample_pos_impl(struct nir_builder *b, nir_instr *instr, + UNUSED void *data) { - if (instr->type != nir_instr_type_intrinsic) - return false; + if (instr->type != nir_instr_type_intrinsic) + return false; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_load_sample_pos) - return false; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_load_sample_pos) + return false; - b->cursor = nir_before_instr(instr); + b->cursor = nir_before_instr(instr); - /* Elements are 4 bytes */ - nir_ssa_def *addr = nir_iadd(b, - nir_load_sample_positions_pan(b), - nir_u2u64(b, nir_imul_imm(b, nir_load_sample_id(b), 4))); + /* Elements are 4 bytes */ + nir_ssa_def *addr = + nir_iadd(b, nir_load_sample_positions_pan(b), + nir_u2u64(b, nir_imul_imm(b, nir_load_sample_id(b), 4))); - /* Decode 8:8 fixed-point */ - nir_ssa_def *raw = nir_load_global(b, addr, 2, 2, 16); - nir_ssa_def *decoded = nir_fmul_imm(b, nir_i2f16(b, raw), 1.0 / 256.0); + /* Decode 8:8 fixed-point */ + nir_ssa_def *raw = nir_load_global(b, addr, 2, 2, 16); + nir_ssa_def *decoded = nir_fmul_imm(b, nir_i2f16(b, raw), 1.0 / 256.0); - /* Make NIR validator happy */ - if (decoded->bit_size != nir_dest_bit_size(intr->dest)) - decoded = nir_f2fN(b, decoded, nir_dest_bit_size(intr->dest)); + /* Make NIR validator happy */ + if (decoded->bit_size != nir_dest_bit_size(intr->dest)) + decoded = nir_f2fN(b, decoded, nir_dest_bit_size(intr->dest)); - nir_ssa_def_rewrite_uses(&intr->dest.ssa, decoded); - return true; + nir_ssa_def_rewrite_uses(&intr->dest.ssa, decoded); + return true; } bool pan_lower_sample_pos(nir_shader *shader) { - if (shader->info.stage != MESA_SHADER_FRAGMENT) - return false; + if (shader->info.stage != MESA_SHADER_FRAGMENT) + return false; - return nir_shader_instructions_pass(shader, - pan_lower_sample_pos_impl, - nir_metadata_block_index | nir_metadata_dominance, - NULL); + return nir_shader_instructions_pass( + shader, pan_lower_sample_pos_impl, + nir_metadata_block_index | nir_metadata_dominance, NULL); } diff --git a/src/panfrost/util/pan_lower_store_component.c b/src/panfrost/util/pan_lower_store_component.c index 5178317232b..aa00e6118e2 100644 --- a/src/panfrost/util/pan_lower_store_component.c +++ b/src/panfrost/util/pan_lower_store_component.c @@ -24,8 +24,8 @@ * Alyssa Rosenzweig */ -#include "pan_ir.h" #include "compiler/nir/nir_builder.h" +#include "pan_ir.h" /* * If the shader packs multiple varyings into the same location with different @@ -36,70 +36,69 @@ static bool lower_store_component(nir_builder *b, nir_instr *instr, void *data) { - if (instr->type != nir_instr_type_intrinsic) - return false; + if (instr->type != nir_instr_type_intrinsic) + return false; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_store_output) - return false; + if (intr->intrinsic != nir_intrinsic_store_output) + return false; - struct hash_table_u64 *slots = data; - unsigned component = nir_intrinsic_component(intr); - nir_src *slot_src = nir_get_io_offset_src(intr); - uint64_t slot = nir_src_as_uint(*slot_src) + nir_intrinsic_base(intr); + struct hash_table_u64 *slots = data; + unsigned component = 
nir_intrinsic_component(intr); + nir_src *slot_src = nir_get_io_offset_src(intr); + uint64_t slot = nir_src_as_uint(*slot_src) + nir_intrinsic_base(intr); - nir_intrinsic_instr *prev = _mesa_hash_table_u64_search(slots, slot); - unsigned mask = (prev ? nir_intrinsic_write_mask(prev) : 0); + nir_intrinsic_instr *prev = _mesa_hash_table_u64_search(slots, slot); + unsigned mask = (prev ? nir_intrinsic_write_mask(prev) : 0); - nir_ssa_def *value = intr->src[0].ssa; - b->cursor = nir_before_instr(&intr->instr); + nir_ssa_def *value = intr->src[0].ssa; + b->cursor = nir_before_instr(&intr->instr); - nir_ssa_def *undef = nir_ssa_undef(b, 1, value->bit_size); - nir_ssa_def *channels[4] = { undef, undef, undef, undef }; + nir_ssa_def *undef = nir_ssa_undef(b, 1, value->bit_size); + nir_ssa_def *channels[4] = {undef, undef, undef, undef}; - /* Copy old */ - u_foreach_bit(i, mask) { - assert(prev != NULL); - nir_ssa_def *prev_ssa = prev->src[0].ssa; - channels[i] = nir_channel(b, prev_ssa, i); - } + /* Copy old */ + u_foreach_bit(i, mask) { + assert(prev != NULL); + nir_ssa_def *prev_ssa = prev->src[0].ssa; + channels[i] = nir_channel(b, prev_ssa, i); + } - /* Copy new */ - unsigned new_mask = nir_intrinsic_write_mask(intr); - mask |= (new_mask << component); + /* Copy new */ + unsigned new_mask = nir_intrinsic_write_mask(intr); + mask |= (new_mask << component); - u_foreach_bit(i, new_mask) { - assert(component + i < 4); - channels[component + i] = nir_channel(b, value, i); - } + u_foreach_bit(i, new_mask) { + assert(component + i < 4); + channels[component + i] = nir_channel(b, value, i); + } - intr->num_components = util_last_bit(mask); - nir_instr_rewrite_src_ssa(instr, &intr->src[0], - nir_vec(b, channels, intr->num_components)); + intr->num_components = util_last_bit(mask); + nir_instr_rewrite_src_ssa(instr, &intr->src[0], + nir_vec(b, channels, intr->num_components)); - nir_intrinsic_set_component(intr, 0); - nir_intrinsic_set_write_mask(intr, mask); + nir_intrinsic_set_component(intr, 0); + nir_intrinsic_set_write_mask(intr, mask); - if (prev) { - _mesa_hash_table_u64_remove(slots, slot); - nir_instr_remove(&prev->instr); - } + if (prev) { + _mesa_hash_table_u64_remove(slots, slot); + nir_instr_remove(&prev->instr); + } - _mesa_hash_table_u64_insert(slots, slot, intr); - return false; + _mesa_hash_table_u64_insert(slots, slot, intr); + return false; } bool pan_nir_lower_store_component(nir_shader *s) { - assert(s->info.stage == MESA_SHADER_VERTEX); + assert(s->info.stage == MESA_SHADER_VERTEX); - struct hash_table_u64 *stores = _mesa_hash_table_u64_create(NULL); - bool progress = nir_shader_instructions_pass(s, lower_store_component, - nir_metadata_block_index | - nir_metadata_dominance, - stores); - _mesa_hash_table_u64_destroy(stores); - return progress; + struct hash_table_u64 *stores = _mesa_hash_table_u64_create(NULL); + bool progress = nir_shader_instructions_pass( + s, lower_store_component, + nir_metadata_block_index | nir_metadata_dominance, stores); + _mesa_hash_table_u64_destroy(stores); + return progress; } diff --git a/src/panfrost/util/pan_lower_writeout.c b/src/panfrost/util/pan_lower_writeout.c index d4099fb5288..bbd4a1f01d9 100644 --- a/src/panfrost/util/pan_lower_writeout.c +++ b/src/panfrost/util/pan_lower_writeout.c @@ -22,8 +22,8 @@ * SOFTWARE. */ -#include "pan_ir.h" #include "compiler/nir/nir_builder.h" +#include "pan_ir.h" /* Midgard can write all of color, depth and stencil in a single writeout * operation, so we merge depth/stencil stores with color stores. 
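To illustrate the merge in simplified pseudo-NIR (not the exact syntax, and not part of the patch):

/*
 * A fragment shader writing color to RT0 and gl_FragDepth,
 *
 *    store_output(color, ...)   <- FRAG_RESULT_DATA0
 *    store_output(z, ...)       <- FRAG_RESULT_DEPTH
 *
 * becomes a single combined writeout, with zeroes feeding the unused
 * stencil and dual-source slots and the component index carrying
 * PAN_WRITEOUT_C | PAN_WRITEOUT_Z:
 *
 *    store_combined_output_pan(color, target, z, 0, zero4)
 */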
@@ -42,150 +42,153 @@ static nir_alu_type pan_nir_rt_store_type(nir_intrinsic_instr *store) { - return store ? nir_intrinsic_src_type(store) : nir_type_float32; + return store ? nir_intrinsic_src_type(store) : nir_type_float32; } static void -pan_nir_emit_combined_store(nir_builder *b, - nir_intrinsic_instr *rt0_store, - unsigned writeout, - nir_intrinsic_instr **stores) +pan_nir_emit_combined_store(nir_builder *b, nir_intrinsic_instr *rt0_store, + unsigned writeout, nir_intrinsic_instr **stores) { - nir_intrinsic_instr *intr = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_combined_output_pan); + nir_intrinsic_instr *intr = nir_intrinsic_instr_create( + b->shader, nir_intrinsic_store_combined_output_pan); - intr->num_components = rt0_store ? rt0_store->src[0].ssa->num_components : 4; + intr->num_components = rt0_store ? rt0_store->src[0].ssa->num_components : 4; - if (rt0_store) - nir_intrinsic_set_io_semantics(intr, nir_intrinsic_io_semantics(rt0_store)); - nir_intrinsic_set_src_type(intr, pan_nir_rt_store_type(rt0_store)); - nir_intrinsic_set_dest_type(intr, pan_nir_rt_store_type(stores[2])); - nir_intrinsic_set_component(intr, writeout); + if (rt0_store) + nir_intrinsic_set_io_semantics(intr, + nir_intrinsic_io_semantics(rt0_store)); + nir_intrinsic_set_src_type(intr, pan_nir_rt_store_type(rt0_store)); + nir_intrinsic_set_dest_type(intr, pan_nir_rt_store_type(stores[2])); + nir_intrinsic_set_component(intr, writeout); - nir_ssa_def *zero = nir_imm_int(b, 0); - nir_ssa_def *zero4 = nir_imm_ivec4(b, 0, 0, 0, 0); + nir_ssa_def *zero = nir_imm_int(b, 0); + nir_ssa_def *zero4 = nir_imm_ivec4(b, 0, 0, 0, 0); - nir_ssa_def *src[] = { - rt0_store ? rt0_store->src[0].ssa : zero4, - rt0_store ? rt0_store->src[1].ssa : zero, - stores[0] ? stores[0]->src[0].ssa : zero, - stores[1] ? stores[1]->src[0].ssa : zero, - stores[2] ? stores[2]->src[0].ssa : zero4, - }; + nir_ssa_def *src[] = { + rt0_store ? rt0_store->src[0].ssa : zero4, + rt0_store ? rt0_store->src[1].ssa : zero, + stores[0] ? stores[0]->src[0].ssa : zero, + stores[1] ? stores[1]->src[0].ssa : zero, + stores[2] ? 
stores[2]->src[0].ssa : zero4, + }; - for (int i = 0; i < ARRAY_SIZE(src); ++i) - intr->src[i] = nir_src_for_ssa(src[i]); + for (int i = 0; i < ARRAY_SIZE(src); ++i) + intr->src[i] = nir_src_for_ssa(src[i]); - nir_builder_instr_insert(b, &intr->instr); + nir_builder_instr_insert(b, &intr->instr); } bool pan_nir_lower_zs_store(nir_shader *nir) { - bool progress = false; + bool progress = false; - if (nir->info.stage != MESA_SHADER_FRAGMENT) - return false; + if (nir->info.stage != MESA_SHADER_FRAGMENT) + return false; - nir_foreach_function(function, nir) { - if (!function->impl) continue; + nir_foreach_function(function, nir) { + if (!function->impl) + continue; - nir_intrinsic_instr *stores[3] = { NULL }; - unsigned writeout = 0; + nir_intrinsic_instr *stores[3] = {NULL}; + unsigned writeout = 0; - nir_foreach_block(block, function->impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; + nir_foreach_block(block, function->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_store_output) - continue; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; - nir_io_semantics sem = nir_intrinsic_io_semantics(intr); - if (sem.location == FRAG_RESULT_DEPTH) { - stores[0] = intr; - writeout |= PAN_WRITEOUT_Z; - } else if (sem.location == FRAG_RESULT_STENCIL) { - stores[1] = intr; - writeout |= PAN_WRITEOUT_S; - } else if (sem.dual_source_blend_index) { - stores[2] = intr; - writeout |= PAN_WRITEOUT_2; - } - } - } + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + if (sem.location == FRAG_RESULT_DEPTH) { + stores[0] = intr; + writeout |= PAN_WRITEOUT_Z; + } else if (sem.location == FRAG_RESULT_STENCIL) { + stores[1] = intr; + writeout |= PAN_WRITEOUT_S; + } else if (sem.dual_source_blend_index) { + stores[2] = intr; + writeout |= PAN_WRITEOUT_2; + } + } + } - if (!writeout) continue; + if (!writeout) + continue; - nir_block *common_block = NULL; + nir_block *common_block = NULL; - /* Ensure all stores are in the same block */ - for (unsigned i = 0; i < ARRAY_SIZE(stores); ++i) { - if (!stores[i]) - continue; + /* Ensure all stores are in the same block */ + for (unsigned i = 0; i < ARRAY_SIZE(stores); ++i) { + if (!stores[i]) + continue; - nir_block *block = stores[i]->instr.block; + nir_block *block = stores[i]->instr.block; - if (common_block) - assert(common_block == block); - else - common_block = block; - } + if (common_block) + assert(common_block == block); + else + common_block = block; + } - bool replaced = false; + bool replaced = false; - nir_foreach_block(block, function->impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; + nir_foreach_block(block, function->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_store_output) - continue; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; - nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); - if (sem.location < FRAG_RESULT_DATA0) - continue; + if (sem.location < FRAG_RESULT_DATA0) + continue; - if 
(sem.dual_source_blend_index) - continue; + if (sem.dual_source_blend_index) + continue; - assert(nir_src_is_const(intr->src[1]) && "no indirect outputs"); + assert(nir_src_is_const(intr->src[1]) && "no indirect outputs"); - nir_builder b; - nir_builder_init(&b, function->impl); - b.cursor = nir_after_block_before_jump(instr->block); + nir_builder b; + nir_builder_init(&b, function->impl); + b.cursor = nir_after_block_before_jump(instr->block); - /* Trying to write depth twice results in the - * wrong blend shader being executed on - * Midgard */ - unsigned this_store = PAN_WRITEOUT_C | (replaced ? 0 : writeout); + /* Trying to write depth twice results in the + * wrong blend shader being executed on + * Midgard */ + unsigned this_store = PAN_WRITEOUT_C | (replaced ? 0 : writeout); - pan_nir_emit_combined_store(&b, intr, this_store, stores); + pan_nir_emit_combined_store(&b, intr, this_store, stores); - nir_instr_remove(instr); + nir_instr_remove(instr); - replaced = true; - } - } + replaced = true; + } + } - /* Insert a store to the depth RT (0xff) if needed */ - if (!replaced) { - nir_builder b; - nir_builder_init(&b, function->impl); - b.cursor = nir_after_block_before_jump(common_block); + /* Insert a store to the depth RT (0xff) if needed */ + if (!replaced) { + nir_builder b; + nir_builder_init(&b, function->impl); + b.cursor = nir_after_block_before_jump(common_block); - pan_nir_emit_combined_store(&b, NULL, writeout, stores); - } + pan_nir_emit_combined_store(&b, NULL, writeout, stores); + } - for (unsigned i = 0; i < ARRAY_SIZE(stores); ++i) { - if (stores[i]) - nir_instr_remove(&stores[i]->instr); - } + for (unsigned i = 0; i < ARRAY_SIZE(stores); ++i) { + if (stores[i]) + nir_instr_remove(&stores[i]->instr); + } - nir_metadata_preserve(function->impl, nir_metadata_block_index | nir_metadata_dominance); - progress = true; - } + nir_metadata_preserve(function->impl, + nir_metadata_block_index | nir_metadata_dominance); + progress = true; + } - return progress; + return progress; } diff --git a/src/panfrost/util/pan_lower_xfb.c b/src/panfrost/util/pan_lower_xfb.c index e9620b2e760..3876e60d1d3 100644 --- a/src/panfrost/util/pan_lower_xfb.c +++ b/src/panfrost/util/pan_lower_xfb.c @@ -21,81 +21,77 @@ * SOFTWARE. */ - -#include "pan_ir.h" #include "compiler/nir/nir_builder.h" +#include "pan_ir.h" static void lower_xfb_output(nir_builder *b, nir_intrinsic_instr *intr, unsigned start_component, unsigned num_components, unsigned buffer, unsigned offset_words) { - assert(buffer < MAX_XFB_BUFFERS); - assert(nir_intrinsic_component(intr) == 0); // TODO + assert(buffer < MAX_XFB_BUFFERS); + assert(nir_intrinsic_component(intr) == 0); // TODO - /* Transform feedback info in units of words, convert to bytes. */ - uint16_t stride = b->shader->info.xfb_stride[buffer] * 4; - assert(stride != 0); + /* Transform feedback info in units of words, convert to bytes. 
*/ + uint16_t stride = b->shader->info.xfb_stride[buffer] * 4; + assert(stride != 0); - uint16_t offset = offset_words * 4; + uint16_t offset = offset_words * 4; - nir_ssa_def *index = nir_iadd(b, - nir_imul(b, nir_load_instance_id(b), - nir_load_num_vertices(b)), - nir_load_vertex_id_zero_base(b)); + nir_ssa_def *index = nir_iadd( + b, nir_imul(b, nir_load_instance_id(b), nir_load_num_vertices(b)), + nir_load_vertex_id_zero_base(b)); - BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); - BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID); + BITSET_SET(b->shader->info.system_values_read, + SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); + BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID); - nir_ssa_def *buf = nir_load_xfb_address(b, 64, .base = buffer); - nir_ssa_def *addr = - nir_iadd(b, buf, nir_u2u64(b, - nir_iadd_imm(b, - nir_imul_imm(b, index, stride), - offset))); + nir_ssa_def *buf = nir_load_xfb_address(b, 64, .base = buffer); + nir_ssa_def *addr = nir_iadd( + b, buf, + nir_u2u64(b, nir_iadd_imm(b, nir_imul_imm(b, index, stride), offset))); - assert(intr->src[0].is_ssa && "must lower XFB before lowering SSA"); - nir_ssa_def *src = intr->src[0].ssa; - nir_ssa_def *value = nir_channels(b, src, BITFIELD_MASK(num_components) << start_component); - nir_store_global(b, addr, 4, value, BITFIELD_MASK(num_components)); + assert(intr->src[0].is_ssa && "must lower XFB before lowering SSA"); + nir_ssa_def *src = intr->src[0].ssa; + nir_ssa_def *value = + nir_channels(b, src, BITFIELD_MASK(num_components) << start_component); + nir_store_global(b, addr, 4, value, BITFIELD_MASK(num_components)); } static bool lower_xfb(nir_builder *b, nir_instr *instr, UNUSED void *data) { - if (instr->type != nir_instr_type_intrinsic) - return false; + if (instr->type != nir_instr_type_intrinsic) + return false; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_store_output) - return false; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + return false; - bool progress = false; + bool progress = false; - b->cursor = nir_before_instr(&intr->instr); + b->cursor = nir_before_instr(&intr->instr); - for (unsigned i = 0; i < 2; ++i) { - nir_io_xfb xfb = i ? nir_intrinsic_io_xfb2(intr) : nir_intrinsic_io_xfb(intr); - for (unsigned j = 0; j < 2; ++j) { - if (!xfb.out[j].num_components) continue; + for (unsigned i = 0; i < 2; ++i) { + nir_io_xfb xfb = + i ? 
nir_intrinsic_io_xfb2(intr) : nir_intrinsic_io_xfb(intr); + for (unsigned j = 0; j < 2; ++j) { + if (!xfb.out[j].num_components) + continue; - lower_xfb_output(b, intr, i*2 + j, - xfb.out[j].num_components, - xfb.out[j].buffer, - xfb.out[j].offset); - progress = true; - } - } + lower_xfb_output(b, intr, i * 2 + j, xfb.out[j].num_components, + xfb.out[j].buffer, xfb.out[j].offset); + progress = true; + } + } - nir_instr_remove(instr); - return progress; + nir_instr_remove(instr); + return progress; } bool pan_lower_xfb(nir_shader *nir) { - return nir_shader_instructions_pass(nir, lower_xfb, - nir_metadata_block_index | - nir_metadata_dominance, NULL); + return nir_shader_instructions_pass( + nir, lower_xfb, nir_metadata_block_index | nir_metadata_dominance, NULL); } - diff --git a/src/panfrost/util/pan_sysval.c b/src/panfrost/util/pan_sysval.c index 78caffb8a55..001e30f9e35 100644 --- a/src/panfrost/util/pan_sysval.c +++ b/src/panfrost/util/pan_sysval.c @@ -24,171 +24,167 @@ * Alyssa Rosenzweig */ -#include "pan_ir.h" #include "compiler/nir/nir_builder.h" +#include "pan_ir.h" /* TODO: ssbo_size */ static int panfrost_sysval_for_ssbo(nir_intrinsic_instr *instr) { - nir_src index = instr->src[0]; - assert(nir_src_is_const(index)); - uint32_t uindex = nir_src_as_uint(index); + nir_src index = instr->src[0]; + assert(nir_src_is_const(index)); + uint32_t uindex = nir_src_as_uint(index); - return PAN_SYSVAL(SSBO, uindex); + return PAN_SYSVAL(SSBO, uindex); } static int panfrost_sysval_for_sampler(nir_intrinsic_instr *instr) { - /* TODO: indirect samplers !!! */ - nir_src index = instr->src[0]; - assert(nir_src_is_const(index)); - uint32_t uindex = nir_src_as_uint(index); + /* TODO: indirect samplers !!! */ + nir_src index = instr->src[0]; + assert(nir_src_is_const(index)); + uint32_t uindex = nir_src_as_uint(index); - return PAN_SYSVAL(SAMPLER, uindex); + return PAN_SYSVAL(SAMPLER, uindex); } static int panfrost_sysval_for_image_size(nir_intrinsic_instr *instr) { - nir_src index = instr->src[0]; - assert(nir_src_is_const(index)); + nir_src index = instr->src[0]; + assert(nir_src_is_const(index)); - bool is_array = nir_intrinsic_image_array(instr); - uint32_t uindex = nir_src_as_uint(index); - unsigned dim = nir_intrinsic_dest_components(instr) - is_array; + bool is_array = nir_intrinsic_image_array(instr); + uint32_t uindex = nir_src_as_uint(index); + unsigned dim = nir_intrinsic_dest_components(instr) - is_array; - return PAN_SYSVAL(IMAGE_SIZE, PAN_TXS_SYSVAL_ID(uindex, dim, is_array)); + return PAN_SYSVAL(IMAGE_SIZE, PAN_TXS_SYSVAL_ID(uindex, dim, is_array)); } static unsigned panfrost_nir_sysval_for_intrinsic(nir_intrinsic_instr *instr) { - switch (instr->intrinsic) { - case nir_intrinsic_load_viewport_scale: - return PAN_SYSVAL_VIEWPORT_SCALE; - case nir_intrinsic_load_viewport_offset: - return PAN_SYSVAL_VIEWPORT_OFFSET; - case nir_intrinsic_load_num_workgroups: - return PAN_SYSVAL_NUM_WORK_GROUPS; - case nir_intrinsic_load_workgroup_size: - return PAN_SYSVAL_LOCAL_GROUP_SIZE; - case nir_intrinsic_load_work_dim: - return PAN_SYSVAL_WORK_DIM; - case nir_intrinsic_load_sample_positions_pan: - return PAN_SYSVAL_SAMPLE_POSITIONS; - case nir_intrinsic_load_first_vertex: - case nir_intrinsic_load_base_vertex: - case nir_intrinsic_load_base_instance: - return PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS; - case nir_intrinsic_load_draw_id: - return PAN_SYSVAL_DRAWID; - case nir_intrinsic_load_ssbo_address: - case nir_intrinsic_get_ssbo_size: - return panfrost_sysval_for_ssbo(instr); - case 
nir_intrinsic_load_xfb_address: - return PAN_SYSVAL(XFB, nir_intrinsic_base(instr)); - case nir_intrinsic_load_num_vertices: - return PAN_SYSVAL_NUM_VERTICES; - case nir_intrinsic_load_sampler_lod_parameters_pan: - return panfrost_sysval_for_sampler(instr); - case nir_intrinsic_image_size: - return panfrost_sysval_for_image_size(instr); - case nir_intrinsic_load_blend_const_color_rgba: - return PAN_SYSVAL_BLEND_CONSTANTS; - default: - return ~0; - } + switch (instr->intrinsic) { + case nir_intrinsic_load_viewport_scale: + return PAN_SYSVAL_VIEWPORT_SCALE; + case nir_intrinsic_load_viewport_offset: + return PAN_SYSVAL_VIEWPORT_OFFSET; + case nir_intrinsic_load_num_workgroups: + return PAN_SYSVAL_NUM_WORK_GROUPS; + case nir_intrinsic_load_workgroup_size: + return PAN_SYSVAL_LOCAL_GROUP_SIZE; + case nir_intrinsic_load_work_dim: + return PAN_SYSVAL_WORK_DIM; + case nir_intrinsic_load_sample_positions_pan: + return PAN_SYSVAL_SAMPLE_POSITIONS; + case nir_intrinsic_load_first_vertex: + case nir_intrinsic_load_base_vertex: + case nir_intrinsic_load_base_instance: + return PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS; + case nir_intrinsic_load_draw_id: + return PAN_SYSVAL_DRAWID; + case nir_intrinsic_load_ssbo_address: + case nir_intrinsic_get_ssbo_size: + return panfrost_sysval_for_ssbo(instr); + case nir_intrinsic_load_xfb_address: + return PAN_SYSVAL(XFB, nir_intrinsic_base(instr)); + case nir_intrinsic_load_num_vertices: + return PAN_SYSVAL_NUM_VERTICES; + case nir_intrinsic_load_sampler_lod_parameters_pan: + return panfrost_sysval_for_sampler(instr); + case nir_intrinsic_image_size: + return panfrost_sysval_for_image_size(instr); + case nir_intrinsic_load_blend_const_color_rgba: + return PAN_SYSVAL_BLEND_CONSTANTS; + default: + return ~0; + } } int panfrost_sysval_for_instr(nir_instr *instr, nir_dest *dest) { - nir_intrinsic_instr *intr; - nir_dest *dst = NULL; - nir_tex_instr *tex; - unsigned sysval = ~0; + nir_intrinsic_instr *intr; + nir_dest *dst = NULL; + nir_tex_instr *tex; + unsigned sysval = ~0; - switch (instr->type) { - case nir_instr_type_intrinsic: - intr = nir_instr_as_intrinsic(instr); - sysval = panfrost_nir_sysval_for_intrinsic(intr); - dst = &intr->dest; - break; - case nir_instr_type_tex: - tex = nir_instr_as_tex(instr); - if (tex->op != nir_texop_txs) - break; + switch (instr->type) { + case nir_instr_type_intrinsic: + intr = nir_instr_as_intrinsic(instr); + sysval = panfrost_nir_sysval_for_intrinsic(intr); + dst = &intr->dest; + break; + case nir_instr_type_tex: + tex = nir_instr_as_tex(instr); + if (tex->op != nir_texop_txs) + break; - sysval = PAN_SYSVAL(TEXTURE_SIZE, - PAN_TXS_SYSVAL_ID(tex->texture_index, - nir_tex_instr_dest_size(tex) - - (tex->is_array ? 1 : 0), - tex->is_array)); - dst = &tex->dest; - break; - default: - break; - } + sysval = PAN_SYSVAL(TEXTURE_SIZE, + PAN_TXS_SYSVAL_ID(tex->texture_index, + nir_tex_instr_dest_size(tex) - + (tex->is_array ? 
1 : 0), + tex->is_array)); + dst = &tex->dest; + break; + default: + break; + } - if (dest && dst) - *dest = *dst; + if (dest && dst) + *dest = *dst; - return sysval; + return sysval; } static unsigned pan_add_sysval(struct hash_table_u64 *sysval_to_id, - struct panfrost_sysvals *sysvals, - int sysval, unsigned id) + struct panfrost_sysvals *sysvals, int sysval, unsigned id) { - assert(id < MAX_SYSVAL_COUNT); - _mesa_hash_table_u64_insert(sysval_to_id, sysval, (void *) ((uintptr_t) id + 1)); - sysvals->sysvals[id] = sysval; - return id; + assert(id < MAX_SYSVAL_COUNT); + _mesa_hash_table_u64_insert(sysval_to_id, sysval, + (void *)((uintptr_t)id + 1)); + sysvals->sysvals[id] = sysval; + return id; } unsigned pan_lookup_sysval(struct hash_table_u64 *sysval_to_id, - struct panfrost_sysvals *sysvals, - int sysval) + struct panfrost_sysvals *sysvals, int sysval) { - /* Try to lookup */ + /* Try to lookup */ - void *cached = _mesa_hash_table_u64_search(sysval_to_id, sysval); + void *cached = _mesa_hash_table_u64_search(sysval_to_id, sysval); - if (cached) { - unsigned id = ((uintptr_t) cached) - 1; - assert(id < MAX_SYSVAL_COUNT); - assert(sysvals->sysvals[id] == sysval); - return id; - } + if (cached) { + unsigned id = ((uintptr_t)cached) - 1; + assert(id < MAX_SYSVAL_COUNT); + assert(sysvals->sysvals[id] == sysval); + return id; + } - /* Else assign */ - return pan_add_sysval(sysval_to_id, sysvals, sysval, - sysvals->sysval_count++); + /* Else assign */ + return pan_add_sysval(sysval_to_id, sysvals, sysval, + sysvals->sysval_count++); } struct hash_table_u64 * panfrost_init_sysvals(struct panfrost_sysvals *sysvals, - struct panfrost_sysvals *fixed_sysvals, - void *memctx) + struct panfrost_sysvals *fixed_sysvals, void *memctx) { - memset(sysvals, 0, sizeof(*sysvals)); - struct hash_table_u64 *sysval_to_id = - _mesa_hash_table_u64_create(memctx); + memset(sysvals, 0, sizeof(*sysvals)); + struct hash_table_u64 *sysval_to_id = _mesa_hash_table_u64_create(memctx); - if (fixed_sysvals) { - for (unsigned i = 0; i < fixed_sysvals->sysval_count; i++) { - if (!fixed_sysvals->sysvals[i]) - continue; + if (fixed_sysvals) { + for (unsigned i = 0; i < fixed_sysvals->sysval_count; i++) { + if (!fixed_sysvals->sysvals[i]) + continue; - pan_add_sysval(sysval_to_id, sysvals, - fixed_sysvals->sysvals[i], i); - } - sysvals->sysval_count = fixed_sysvals->sysval_count; - } + pan_add_sysval(sysval_to_id, sysvals, fixed_sysvals->sysvals[i], i); + } + sysvals->sysval_count = fixed_sysvals->sysval_count; + } - return sysval_to_id; + return sysval_to_id; }
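
For context, the sysval helpers in the pan_sysval.c hunk above are meant to be driven by a compiler backend: panfrost_sysval_for_instr() classifies an instruction (returning ~0 when it does not read a sysval), and pan_lookup_sysval() returns a stable index into the sysval table, allocating a slot on first use. The sketch below shows that flow end to end. It is illustrative only and not part of this patch; the wrapper function collect_shader_sysvals and its bare traversal loop are assumptions, while panfrost_init_sysvals(), panfrost_sysval_for_instr() and pan_lookup_sysval() are the real helpers being reformatted here.

/* Illustrative sketch, not part of the patch: walk a shader and build its
 * sysval table using the helpers above.  Only the three pan_* calls are real;
 * the wrapper itself is hypothetical. */
#include "compiler/nir/nir.h"
#include "pan_ir.h"

static void
collect_shader_sysvals(nir_shader *nir, struct panfrost_sysvals *sysvals,
                       void *memctx)
{
   /* The lookup table is created against memctx (assumption based on the
    * memctx parameter of panfrost_init_sysvals), so it needs no explicit
    * teardown here. */
   struct hash_table_u64 *sysval_to_id =
      panfrost_init_sysvals(sysvals, NULL, memctx);

   nir_foreach_function(function, nir) {
      if (!function->impl)
         continue;

      nir_foreach_block(block, function->impl) {
         nir_foreach_instr(instr, block) {
            /* ~0 means "this instruction does not read a sysval" */
            int sysval = panfrost_sysval_for_instr(instr, NULL);
            if (sysval == ~0)
               continue;

            /* Stable index into sysvals->sysvals[], allocated on first use */
            unsigned id = pan_lookup_sysval(sysval_to_id, sysvals, sysval);
            assert(id < MAX_SYSVAL_COUNT);
         }
      }
   }
}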
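
Similarly, the address computation built by lower_xfb_output() in the pan_lower_xfb.c hunk earlier in this patch reduces to simple integer arithmetic: NIR supplies the transform feedback stride and offset in 32-bit words, and each vertex writes its captured output at buffer_base + (instance_id * num_vertices + vertex_id) * stride_bytes + offset_bytes. The helper below restates that math in plain C so it is easy to check against the NIR being emitted; the standalone function and its name are illustrative and do not come from the tree.

#include <stdint.h>

/* Restates the math emitted by lower_xfb_output(): the 32-bit index
 * arithmetic is widened to 64 bits only at the end, matching the
 * nir_u2u64() in the lowering.  Function name is illustrative. */
static inline uint64_t
example_xfb_store_address(uint64_t buffer_base, uint32_t instance_id,
                          uint32_t num_vertices, uint32_t vertex_id_zero_base,
                          uint16_t stride_words, uint16_t offset_words)
{
   /* One record per vertex of every instance, laid out back to back */
   uint32_t index = instance_id * num_vertices + vertex_id_zero_base;

   /* Stride and offset arrive in words; convert to bytes as the pass does */
   uint32_t byte_offset = index * (uint32_t)(stride_words * 4) +
                          (uint32_t)(offset_words * 4);

   return buffer_base + byte_offset;
}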