tu: Implement VK_QCOM_image_processing.
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

This includes the block matching, box filtering, and weighted sampling
features.  It passes all of the dEQP-VK.image_processing.* CTS tests that
recently landed.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38559>
This commit is contained in:
Emma Anholt 2025-10-09 16:02:56 -07:00 committed by Marge Bot
parent 431c7a6e36
commit 72c12f62ff
17 changed files with 329 additions and 23 deletions

View file

@ -722,6 +722,7 @@ Khronos extensions that are not part of any Vulkan version:
VK_MESA_image_alignment_control DONE (anv, nvk, radv)
VK_EXT_legacy_dithering DONE (anv, tu, vn)
VK_QCOM_fragment_density_map_offset DONE (tu)
VK_QCOM_image_processing DONE (tu)
VK_VALVE_video_encode_rgb_conversion DONE (radv)
Rusticl OpenCL 1.0 -- all DONE:

View file

@ -0,0 +1 @@
VK_QCOM_image_processing on Turnip

View file

@ -467,6 +467,9 @@ struct fd_dev_info {
* expected:
*/
bool has_salu_int_narrowing_quirk;
/* Whether the device supports the image processing opcode */
bool has_image_processing;
} props;
};

View file

@ -1016,6 +1016,7 @@ a7xx_gen2 = GPUProps(
reading_shading_rate_requires_smask_quirk = True,
has_ray_intersection = True,
has_hw_bin_scaling = True,
has_image_processing = True,
)
a7xx_gen3 = GPUProps(
@ -1043,6 +1044,7 @@ a7xx_gen3 = GPUProps(
has_abs_bin_mask = True,
new_control_regs = True,
has_hw_bin_scaling = True,
has_image_processing = True,
)
a730_magic_regs = dict(

View file

@ -133,6 +133,10 @@ fdl6_texswiz(const struct fdl_view_args *args, bool has_z24uint_s8uint)
unsigned char swiz[4];
util_format_compose_swizzles(format_swiz, args->swiz, swiz);
/* Unused for box filter, match the blob behavior. */
if (args->filter_width)
return 0;
if (CHIP <= A7XX) {
return A6XX_TEX_CONST_0_SWIZ_X(fdl6_swiz(swiz[0])) |
A6XX_TEX_CONST_0_SWIZ_Y(fdl6_swiz(swiz[1])) |
@ -258,7 +262,13 @@ fdl6_view_init(struct fdl6_view *view, const struct fdl_layout **layouts,
view->descriptor[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(layer_size);
view->descriptor[4] = base_addr;
view->descriptor[5] = (base_addr >> 32) | A6XX_TEX_CONST_5_DEPTH(depth);
view->descriptor[6] = A6XX_TEX_CONST_6_MIN_LOD_CLAMP(args->min_lod_clamp - args->base_miplevel);
if (args->filter_width) {
view->descriptor[6] = A6XX_TEX_CONST_6_LOG2_PHASES(
util_logbase2_ceil(args->filter_num_phases) / 2) |
A6XX_TEX_CONST_6_DILATION(1);
} else {
view->descriptor[6] = A6XX_TEX_CONST_6_MIN_LOD_CLAMP(args->min_lod_clamp - args->base_miplevel);
}
if (layout->tile_all)
view->descriptor[3] |= A6XX_TEX_CONST_3_TILE_ALL;
@ -300,6 +310,13 @@ fdl6_view_init(struct fdl6_view *view, const struct fdl_layout **layouts,
assert(args->type != FDL_VIEW_TYPE_3D);
return;
} else if (args->filter_width) {
view->descriptor[8] =
(A6XX_TEX_CONST_8_FILTER_SIZE_X(args->filter_width) |
A6XX_TEX_CONST_8_FILTER_SIZE_Y(args->filter_height));
view->descriptor[10] =
(A6XX_TEX_CONST_10_FILTER_OFFSET_X(args->filter_center_x) |
A6XX_TEX_CONST_10_FILTER_OFFSET_Y(args->filter_center_y));
}
if (ubwc_enabled) {
@ -323,6 +340,8 @@ fdl6_view_init(struct fdl6_view *view, const struct fdl_layout **layouts,
} else if (CHIP >= A8XX) {
uint32_t *descriptor = view->descriptor;
assert(!args->filter_width); /* Need descriptor fields defined. */
descriptor[0] = A8XX_TEX_MEMOBJ_0_BASE_LO(base_addr);
descriptor[1] = A8XX_TEX_MEMOBJ_1_BASE_HI(base_addr >> 32) |
A8XX_TEX_MEMOBJ_1_TYPE(fdl6_tex_type(args->type, false)) |
@ -374,13 +393,23 @@ fdl6_view_init(struct fdl6_view *view, const struct fdl_layout **layouts,
A8XX_TEX_MEMOBJ_9_UV_PITCH(fdl_pitch(layouts[1], args->base_miplevel));
return;
} else if (args->filter_width) {
descriptor[5] |= A8XX_TEX_MEMOBJ_5_FILTER_SIZE_X(args->filter_width) |
A8XX_TEX_MEMOBJ_5_FILTER_SIZE_Y(args->filter_height) |
A8XX_TEX_MEMOBJ_5_FILTER_OFFSET_X(args->filter_center_x) |
A8XX_TEX_MEMOBJ_5_FILTER_OFFSET_Y(args->filter_center_y);
}
descriptor[7] = A8XX_TEX_MEMOBJ_7_ARRAY_SLICE_OFFSET(layer_size);
descriptor[9] = A8XX_TEX_MEMOBJ_9_MIN_LOD_CLAMP(args->min_lod_clamp - args->base_miplevel);
if (args->type == FDL_VIEW_TYPE_3D)
if (args->filter_width) {
descriptor[7] |= A8XX_TEX_MEMOBJ_7_LOG2_PHASES(
util_logbase2_ceil(args->filter_num_phases) / 2) |
A8XX_TEX_MEMOBJ_7_DILATION(1);
} else if (args->type == FDL_VIEW_TYPE_3D) {
descriptor[7] |= A8XX_TEX_MEMOBJ_7_MIN_ARRAY_SLIZE_OFFSET(layout->slices[layout->mip_levels - 1].size0);
}
if (ubwc_enabled) {
uint32_t block_width, block_height;

View file

@ -374,6 +374,12 @@ struct fdl_view_args {
enum pipe_format format;
enum fdl_view_type type;
enum fdl_chroma_location chroma_offsets[2];
uint32_t filter_width;
uint32_t filter_height;
uint32_t filter_center_x;
uint32_t filter_center_y;
uint32_t filter_num_phases;
};
#define FDL6_TEX_CONST_DWORDS 16

View file

@ -1900,7 +1900,7 @@ get_bindless_ref(struct ir3_context *ctx, nir_src *src, bool is_sampler)
static struct tex_src_info
get_bindless_samp_src(struct ir3_context *ctx, nir_src *tex,
nir_src *samp)
nir_src *samp, nir_src *tex2, nir_src *samp2)
{
struct ir3_builder *b = &ctx->build;
struct tex_src_info info = {0};
@ -1912,6 +1912,46 @@ get_bindless_samp_src(struct ir3_context *ctx, nir_src *tex,
*/
struct bindless_ref_info tex_info = get_bindless_ref(ctx, tex, false);
struct bindless_ref_info samp_info = get_bindless_ref(ctx, samp, true);
struct bindless_ref_info tex2_info = get_bindless_ref(ctx, tex2, false);
/* NOTE: The QC implementation completely ignores samp2 (reference
* sampler), in both the A1 and S2EN cases.
*/
if (tex2 || samp2) {
struct tex_src_info info = {0};
info.flags = IR3_INSTR_B;
/* NOTE: QC implementation doesn't encode the BASE_HI bits in the right
* place (ORing them into src2 instead), but our normal base encoding
* appears to work.
*/
info.base = tex_info.desc_set;
info.a1_val = 0;
info.a1_val |= samp_info.desc_set;
info.a1_val |= tex2_info.desc_set << 13;
/* NOTE: QC implementation lets samp index overflow into tex2 index */
if (tex_info.is_const && tex_info.const_index < 16 &&
samp_info.is_const && samp_info.const_index < 16 &&
tex2_info.is_const && tex2_info.const_index < 64) {
info.tex_idx = tex_info.const_index;
info.a1_val |= (samp_info.const_index << 3);
info.a1_val |= (tex2_info.const_index << 7);
} else {
/* Non-constant case: Collect the combined texture/sampler, and the
* secondary texture.
*/
info.samp_tex = ir3_collect(b, tex_info.index, samp_info.index, tex2_info.index);
info.flags |= IR3_INSTR_S2EN;
}
if (info.a1_val)
info.flags |= IR3_INSTR_A1EN;
return info;
}
info.tex_base = tex_info.desc_set;
info.tex_idx = tex_info.const_index;
@ -3411,7 +3451,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
}
case nir_intrinsic_prefetch_sam_ir3: {
struct tex_src_info info =
get_bindless_samp_src(ctx, &intr->src[0], &intr->src[1]);
get_bindless_samp_src(ctx, &intr->src[0], &intr->src[1], NULL, NULL);
struct ir3_instruction *sam =
emit_sam(ctx, OPC_SAM, info, TYPE_F32, 0b1111, NULL, NULL);
@ -3581,13 +3621,17 @@ get_tex_samp_tex_src(struct ir3_context *ctx, nir_tex_instr *tex)
struct tex_src_info info = {0};
int texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
int texture2_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_2_handle);
int sampler2_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_2_handle);
struct ir3_instruction *texture, *sampler;
if (texture_idx >= 0 || sampler_idx >= 0) {
/* Bindless case */
info = get_bindless_samp_src(ctx,
texture_idx >= 0 ? &tex->src[texture_idx].src : NULL,
sampler_idx >= 0 ? &tex->src[sampler_idx].src : NULL);
sampler_idx >= 0 ? &tex->src[sampler_idx].src : NULL,
texture2_idx >= 0 ? &tex->src[texture2_idx].src : NULL,
sampler2_idx >= 0 ? &tex->src[sampler2_idx].src : NULL);
if (tex->texture_non_uniform || tex->sampler_non_uniform)
info.flags |= IR3_INSTR_NONUNIF;
@ -3629,8 +3673,8 @@ emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
{
struct ir3_builder *b = &ctx->build;
struct ir3_instruction **dst, *sam, *src0[12], *src1[5];
struct ir3_instruction *const *coord, *const *off, *const *ddx, *const *ddy;
struct ir3_instruction *lod, *compare, *proj, *sample_index, *min_lod;
struct ir3_instruction *const *coord, *const *off, *const *ddx, *const *ddy, *const *box_size;
struct ir3_instruction *lod, *compare, *proj, *sample_index, *min_lod, *ref_coord, *block_size;
struct tex_src_info info = {0};
bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
bool lod_zero = false, has_min_lod = false;
@ -3641,8 +3685,8 @@ emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
ncomp = tex->def.num_components;
coord = off = ddx = ddy = NULL;
lod = proj = compare = sample_index = min_lod = NULL;
coord = off = ddx = ddy = box_size = NULL;
lod = proj = compare = sample_index = min_lod = ref_coord = block_size = NULL;
dst = ir3_get_def(ctx, &tex->def, ncomp);
@ -3691,11 +3735,22 @@ emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
min_lod = ir3_get_src(ctx, &tex->src[i].src)[0];
has_min_lod = true;
break;
case nir_tex_src_box_size:
box_size = ir3_get_src(ctx, &tex->src[i].src);
break;
case nir_tex_src_block_size:
block_size = ir3_get_src(ctx, &tex->src[i].src)[0];
break;
case nir_tex_src_ref_coord:
ref_coord = ir3_get_src(ctx, &tex->src[i].src)[0];
break;
case nir_tex_src_texture_offset:
case nir_tex_src_sampler_offset:
case nir_tex_src_texture_handle:
case nir_tex_src_sampler_handle:
/* handled in get_tex_samp_src() */
case nir_tex_src_texture_2_handle:
case nir_tex_src_sampler_2_handle:
/* handled in get_tex_samp_tex_src() */
break;
default:
ir3_context_error(ctx, "Unhandled NIR tex src type: %d\n",
@ -3767,6 +3822,16 @@ emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
case nir_texop_txf_ms:
opc = OPC_ISAMM;
break;
case nir_texop_sample_weighted_qcom:
opc = OPC_IMG_BINDLESS_HOF;
break;
case nir_texop_box_filter_qcom:
opc = OPC_IMG_BINDLESS_PCMN;
break;
case nir_texop_block_match_sad_qcom:
case nir_texop_block_match_ssd_qcom:
opc = OPC_IMG_BINDLESS;
break;
default:
ir3_context_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
return;
@ -3864,7 +3929,7 @@ emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
* - lod
* - bias
*/
if (has_off | has_lod | has_bias | has_min_lod) {
if (has_off | has_lod | has_bias | has_min_lod | (box_size != NULL)) {
if (has_off) {
unsigned off_coords = coords;
if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
@ -3883,6 +3948,16 @@ emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
src1[nsrc1++] = min_lod;
flags |= IR3_INSTR_CLP;
}
if (box_size) {
src1[nsrc1++] = box_size[0];
src1[nsrc1++] = box_size[1];
}
}
if (opc == OPC_IMG_BINDLESS) {
src1[nsrc1++] = ref_coord;
src1[nsrc1++] = block_size;
}
type = get_tex_dest_type(tex);
@ -3978,6 +4053,9 @@ emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
sam = emit_sam(ctx, opc, info, type, MASK(ncomp), col0, col1);
}
if (tex->op == nir_texop_block_match_ssd_qcom)
sam->cat5.match_mode = IR3_MATCH_MODE_SSD;
if (tex->is_sparse) {
info.flags |= flags;
struct ir3_instruction *rck =

View file

@ -523,6 +523,73 @@ ir3_nir_lower_array_sampler(nir_shader *shader)
nir_metadata_control_flow, NULL);
}
/* Packs the first two channels of x into one 32-bit value: channel 0 is ORed
 * in unshifted and channel 1 is shifted left by 16.
 *
 * pack_uvec2_to_uint does clamping that we don't need to do, so the channels
 * are combined as-is (channel 0 is not masked here).
 */
static nir_def *
pack_16_16(nir_builder *b, nir_def *x)
{
   nir_def *lo = nir_channel(b, x, 0);
   nir_def *hi = nir_ishl_imm(b, nir_channel(b, x, 1), 16);

   return nir_ior(b, lo, hi);
}
/* Per-instruction callback that rewrites the extra sources of the QCOM image
 * processing texture ops in place.
 *
 * Only nir_texop_box_filter_qcom and the two block-match ops are touched;
 * every other instruction is left alone.  Returns true when the instruction
 * was rewritten.  _data is unused.
 */
static bool
ir3_nir_lower_image_processing_instr(struct nir_builder *b, nir_instr *instr,
                                     void *_data)
{
   if (instr->type != nir_instr_type_tex)
      return false;

   nir_tex_instr *tex = nir_instr_as_tex(instr);

   /* All new ALU instructions are emitted right before the tex instruction. */
   b->cursor = nir_before_instr(&tex->instr);

   switch (tex->op) {
   case nir_texop_box_filter_qcom: {
      /* The hardware takes the box filter argument as a preprocessed vec2.
       * Doing that preprocessing in NIR keeps it readable and lets it be
       * constant folded.
       */
      const int size_idx = nir_tex_instr_src_index(tex, nir_tex_src_box_size);
      assert(size_idx >= 0);

      nir_def *size = tex->src[size_idx].src.ssa;

      /* Area is computed from the original (unscaled) box size. */
      nir_def *size_x = nir_channel(b, size, 0);
      nir_def *size_y = nir_channel(b, size, 1);
      nir_def *area = nir_fmul(b, size_x, size_y);

      /* .x: both dimensions scaled by 64, rounded to even, converted to
       * unsigned and packed 16:16.
       */
      nir_def *scaled = nir_fmul_imm(b, size, 64.0);
      nir_def *fixed = nir_f2u32(b, nir_fround_even(b, scaled));

      /* .y: reciprocal of the area as a half float, widened to 32 bits. */
      nir_def *rcp_half = nir_f2f16(b, nir_frcp(b, area));
      nir_def *inv_area = nir_u2u32(b, rcp_half);

      nir_def *packed = nir_vec2(b, pack_16_16(b, fixed), inv_area);
      nir_src_rewrite(&tex->src[size_idx].src, packed);
      return true;
   }

   case nir_texop_block_match_sad_qcom:
   case nir_texop_block_match_ssd_qcom: {
      /* The coordinate source is converted from integer to float with
       * nir_i2f32; the ref coord and the block size are each packed into a
       * single u32.
       */
      const int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
      assert(coord_idx >= 0);
      nir_def *coord_f = nir_i2f32(b, tex->src[coord_idx].src.ssa);
      nir_src_rewrite(&tex->src[coord_idx].src, coord_f);

      const int ref_idx = nir_tex_instr_src_index(tex, nir_tex_src_ref_coord);
      assert(ref_idx >= 0);
      nir_def *ref_packed = pack_16_16(b, tex->src[ref_idx].src.ssa);
      nir_src_rewrite(&tex->src[ref_idx].src, ref_packed);

      const int block_idx = nir_tex_instr_src_index(tex, nir_tex_src_block_size);
      assert(block_idx >= 0);
      nir_def *block_packed = pack_16_16(b, tex->src[block_idx].src.ssa);
      nir_src_rewrite(&tex->src[block_idx].src, block_packed);
      return true;
   }

   default:
      return false;
   }
}
/* Runs ir3_nir_lower_image_processing_instr over every instruction in the
 * shader and returns the progress flag reported by the instructions pass.
 */
static bool
ir3_nir_lower_image_processing(nir_shader *shader)
{
   const bool progress =
      nir_shader_instructions_pass(shader,
                                   ir3_nir_lower_image_processing_instr,
                                   nir_metadata_control_flow, NULL);

   return progress;
}
static bool
lower_shader_clock(struct nir_builder *b, nir_intrinsic_instr *instr, void *data)
{
@ -701,6 +768,8 @@ ir3_finalize_nir(struct ir3_compiler *compiler,
if (compiler->array_index_add_half)
OPT(s, ir3_nir_lower_array_sampler);
OPT(s, ir3_nir_lower_image_processing);
if (compiler->gen >= 6) {
OPT(s, ir3_nir_lower_shader_clock, compiler->options.uche_trap_base);
}

View file

@ -120,6 +120,10 @@ xsi:schemaLocation="https://gitlab.freedesktop.org/freedreno/ rules-fd.xsd">
<bitfield name="MIN_LOD_CLAMP" low="0" high="11" type="ufixed" radix="8"/>
<!-- pitch for plane 2 / plane 3 -->
<bitfield name="PLANE_PITCH" low="8" high="31" type="uint"/>
<!-- VK_QCOM_image_processing sample weights descriptor fields, overlapping the others. -->
<bitfield name="LOG2_PHASES" low="0" high="2" type="uint"/>
<bitfield name="DILATION" low="8" high="11" type="uint"/>
</reg32>
<!-- 7/8 is plane 2 address for planar formats -->
<reg32 offset="7" name="7">
@ -127,6 +131,8 @@ xsi:schemaLocation="https://gitlab.freedesktop.org/freedreno/ rules-fd.xsd">
</reg32>
<reg32 offset="8" name="8">
<bitfield name="FLAG_HI" low="0" high="16"/>
<bitfield name="FILTER_SIZE_X" low="17" high="23"/>
<bitfield name="FILTER_SIZE_Y" low="24" high="30"/>
</reg32>
<!-- 9/10 is plane 3 address for planar formats -->
<reg32 offset="9" name="9">
@ -137,6 +143,8 @@ xsi:schemaLocation="https://gitlab.freedesktop.org/freedreno/ rules-fd.xsd">
<!-- log2 size of the first level, required for mipmapping -->
<bitfield name="FLAG_BUFFER_LOGW" low="8" high="11" type="uint"/>
<bitfield name="FLAG_BUFFER_LOGH" low="12" high="15" type="uint"/>
<bitfield name="FILTER_OFFSET_X" low="17" high="22"/>
<bitfield name="FILTER_OFFSET_Y" low="23" high="28"/>
</reg32>
<reg32 offset="11" name="11"/>
<reg32 offset="12" name="12"/>

View file

@ -83,6 +83,12 @@ xsi:schemaLocation="https://gitlab.freedesktop.org/freedreno/ rules-fd.xsd">
<bitfield name="FLAG_BUFFER_PITCH" low="17" high="24" shr="6" type="uint"/>
<bitfield name="ALL_SAMPLES_CENTER" pos="29" type="boolean"/>
<bitfield name="MUTABLEEN" pos="31" type="boolean"/>
<!-- VK_QCOM_image_processing sample weights descriptor fields, overlapping the others. -->
<bitfield name="FILTER_SIZE_X" low="0" high="6"/>
<bitfield name="FILTER_SIZE_Y" low="7" high="13"/>
<bitfield name="FILTER_OFFSET_X" low="19" high="24"/>
<bitfield name="FILTER_OFFSET_Y" low="25" high="30"/>
</reg32>
<reg32 offset="6" name="6">
<bitfield name="TEX_LINE_OFFSET" low="0" high="23" type="uint"/> <!-- PITCH -->
@ -99,6 +105,10 @@ xsi:schemaLocation="https://gitlab.freedesktop.org/freedreno/ rules-fd.xsd">
<!-- For multiplanar. This overlaps other single-planar fields: -->
<bitfield name="UV_OFFSET_H" low="24" high="25" type="ufixed" radix="2"/> <!-- CHROMA_MIDPOINT_X -->
<bitfield name="UV_OFFSET_V" low="26" high="27" type="ufixed" radix="2"/> <!-- CHROMA_MIDPOINT_Y -->
<!-- VK_QCOM_image_processing sample weights descriptor fields, overlapping the others. -->
<bitfield name="DILATION" low="24" high="27" type="uint"/>
<bitfield name="LOG2_PHASES" low="28" high="30" type="uint"/>
</reg32>
<reg32 offset="8" name="8">
<bitfield name="FLAG_ARRAY_PITCH" low="0" high="14" shr="12" type="uint"/> <!-- FLAG_BUFFER_ARRAY_PITCH -->

View file

@ -8744,11 +8744,11 @@ got cmdszdw=416
{ ARRAY_PITCH = 4096 | MIN_LAYERSZ = 0 }
{ BASE_LO = 0x373a000 }
{ BASE_HI = 0x1 | DEPTH = 1 }
{ MIN_LOD_CLAMP = 0.000000 | PLANE_PITCH = 0 }
{ MIN_LOD_CLAMP = 0.000000 | PLANE_PITCH = 0 | LOG2_PHASES = 0 | DILATION = 0 }
{ FLAG_LO = 0 }
{ FLAG_HI = 0 }
{ FLAG_HI = 0 | FILTER_SIZE_X = 0 | FILTER_SIZE_Y = 0 }
{ FLAG_BUFFER_ARRAY_PITCH = 0 }
{ FLAG_BUFFER_PITCH = 0 | FLAG_BUFFER_LOGW = 0 | FLAG_BUFFER_LOGH = 0 }
{ FLAG_BUFFER_PITCH = 0 | FLAG_BUFFER_LOGW = 0 | FLAG_BUFFER_LOGH = 0 | FILTER_OFFSET_X = 0 | FILTER_OFFSET_Y = 0 }
{ 11 = 0 }
{ 12 = 0 }
{ 13 = 0 }

View file

@ -1276,6 +1276,8 @@ tu_update_descriptor_sets(const struct tu_device *device,
break;
case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
case VK_DESCRIPTOR_TYPE_SAMPLE_WEIGHT_IMAGE_QCOM:
case VK_DESCRIPTOR_TYPE_BLOCK_MATCH_IMAGE_QCOM:
case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
write_image_descriptor(ptr, writeset->descriptorType, writeset->pImageInfo + j);
break;
@ -1621,6 +1623,8 @@ tu_update_descriptor_set_with_template(
break;
case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
case VK_DESCRIPTOR_TYPE_SAMPLE_WEIGHT_IMAGE_QCOM:
case VK_DESCRIPTOR_TYPE_BLOCK_MATCH_IMAGE_QCOM:
case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
write_image_descriptor(ptr, templ->entry[i].descriptor_type,
(const VkDescriptorImageInfo *) src);

View file

@ -355,6 +355,7 @@ get_device_extensions(const struct tu_physical_device *device,
.IMG_filter_cubic = device->info->props.has_tex_filter_cubic,
.NV_compute_shader_derivatives = device->info->chip >= 7,
.QCOM_fragment_density_map_offset = true,
.QCOM_image_processing = device->info->props.has_image_processing,
.QCOM_multiview_per_view_render_areas = true,
.QCOM_multiview_per_view_viewports =
device->info->props.has_per_view_viewport,
@ -824,6 +825,11 @@ tu_get_features(struct tu_physical_device *pdevice,
/* VK_EXT_zero_initialize_device_memory */
features->zeroInitializeDeviceMemory = true;
/* VK_QCOM_image_processing */
features->textureSampleWeighted = pdevice->vk.supported_extensions.QCOM_image_processing;
features->textureBoxFilter = pdevice->vk.supported_extensions.QCOM_image_processing;
features->textureBlockMatch = pdevice->vk.supported_extensions.QCOM_image_processing;
/* VK_VALVE_fragment_density_map_layered */
features->fragmentDensityMapLayered = true;
@ -1520,6 +1526,21 @@ tu_get_properties(struct tu_physical_device *pdevice,
/* VK_VALVE_fragment_density_map_layered */
props->maxFragmentDensityMapLayers = MAX_VIEWS;
/* VK_QCOM_image_processing */
props->maxWeightFilterPhases = 1024;
props->maxWeightFilterDimension =
pdevice->vk.supported_extensions.QCOM_image_processing
? (VkExtent2D) { 64, 64 }
: (VkExtent2D) { 0, 0 };
props->maxBlockMatchRegion =
pdevice->vk.supported_extensions.QCOM_image_processing
? (VkExtent2D) { 64, 64 }
: (VkExtent2D) { 0, 0 };
props->maxBoxFilterBlockSize =
pdevice->vk.supported_extensions.QCOM_image_processing
? (VkExtent2D) { 64, 64 }
: (VkExtent2D) { 0, 0 };
}
static const struct vk_pipeline_cache_object_ops *const cache_import_ops[] = {

View file

@ -251,6 +251,50 @@ tu_physical_device_get_format_properties(
VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT);
}
/* Set up QCOM_image_processing flags.  This matches blob behavior, except
 * that the blob also advertises box/weighted on NPOT sampleable formats and
 * ASTC_FLOAT (which we don't advertise yet), and blockmatch/box/weighted on
 * VK_FORMAT_G8B8G8R8_422_UNORM.
 */
*/
if ((optimal & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT) &&
(!ycbcr_info || ycbcr_info->n_planes == 1) &&
!vk_format_is_depth_or_stencil(vk_format)) {
int c = util_format_get_first_non_void_channel(desc->format);
bool is_8bpc = c != -1 && desc->is_array && desc->channel[c].size == 8;
if ((is_8bpc && vk_format != VK_FORMAT_B8G8R8A8_UNORM &&
vk_format != VK_FORMAT_B8G8R8A8_SNORM &&
vk_format != VK_FORMAT_B8G8R8A8_SRGB) ||
vk_format == VK_FORMAT_A2B10G10R10_UNORM_PACK32) {
if (desc->is_unorm &&
desc->colorspace != UTIL_FORMAT_COLORSPACE_SRGB)
optimal |= VK_FORMAT_FEATURE_2_BLOCK_MATCHING_BIT_QCOM;
if ((desc->is_unorm || desc->is_snorm) &&
vk_format != VK_FORMAT_R8G8_SNORM) {
optimal |= VK_FORMAT_FEATURE_2_BOX_FILTER_SAMPLED_BIT_QCOM;
optimal |= VK_FORMAT_FEATURE_2_WEIGHT_SAMPLED_IMAGE_BIT_QCOM;
}
}
if (vk_format == VK_FORMAT_B5G6R5_UNORM_PACK16 ||
vk_format == VK_FORMAT_B10G11R11_UFLOAT_PACK32 ||
vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 ||
util_format_is_float16(format) ||
(util_format_is_compressed(format) &&
desc->layout != UTIL_FORMAT_LAYOUT_RGTC &&
vk_format != VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK &&
vk_format != VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK &&
vk_format != VK_FORMAT_EAC_R11G11_UNORM_BLOCK &&
vk_format != VK_FORMAT_EAC_R11G11_SNORM_BLOCK)) {
optimal |= VK_FORMAT_FEATURE_2_BOX_FILTER_SAMPLED_BIT_QCOM;
optimal |= VK_FORMAT_FEATURE_2_WEIGHT_SAMPLED_IMAGE_BIT_QCOM;
}
if (vk_format == VK_FORMAT_R8_UNORM ||
vk_format == VK_FORMAT_R16_SFLOAT)
optimal |= VK_FORMAT_FEATURE_2_WEIGHT_IMAGE_BIT_QCOM;
}
/* For the most part, we can do anything with a linear image that we could
* do with a tiled image. However, we can't support sysmem rendering with a
* linear depth texture, because we don't know if there's a bit to control

View file

@ -180,6 +180,8 @@ tu_image_view_init(struct tu_device *device,
vk_find_struct_const(pCreateInfo->pNext, SAMPLER_YCBCR_CONVERSION_INFO);
const struct vk_ycbcr_conversion *conversion = ycbcr_conversion ?
vk_ycbcr_conversion_from_handle(ycbcr_conversion->conversion) : NULL;
const VkImageViewSampleWeightCreateInfoQCOM *sample_weights =
vk_find_struct_const(pCreateInfo->pNext, IMAGE_VIEW_SAMPLE_WEIGHT_CREATE_INFO_QCOM);
vk_image_view_init(&device->vk, &iview->vk, pCreateInfo);
assert(iview->vk.format != VK_FORMAT_UNDEFINED);
@ -268,6 +270,14 @@ tu_image_view_init(struct tu_device *device,
args.chroma_offsets[1] = (enum fdl_chroma_location) conversion->state.chroma_offsets[1];
}
if (sample_weights) {
args.filter_width = sample_weights->filterSize.width;
args.filter_height = sample_weights->filterSize.height;
args.filter_center_x = sample_weights->filterCenter.x;
args.filter_center_y = sample_weights->filterCenter.y;
args.filter_num_phases = sample_weights->numPhases;
}
TU_CALLX(device, fdl6_view_init)(&iview->view, layouts, &args, device->use_z24uint_s8uint);
if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {

View file

@ -90,6 +90,8 @@ tu6_load_state_size(struct tu_pipeline *pipeline,
case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
case VK_DESCRIPTOR_TYPE_SAMPLE_WEIGHT_IMAGE_QCOM:
case VK_DESCRIPTOR_TYPE_BLOCK_MATCH_IMAGE_QCOM:
/* Textures and UBO's needs a packet for each stage */
count = stage_count;
break;
@ -219,7 +221,8 @@ tu6_emit_load_state(struct tu_device *device,
}
break;
}
case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
case VK_DESCRIPTOR_TYPE_BLOCK_MATCH_IMAGE_QCOM: {
tu_foreach_stage(stage, stages) {
/* TODO: We could emit less CP_LOAD_STATE6 if we used
* struct-of-arrays instead of array-of-structs.

View file

@ -757,30 +757,29 @@ lower_tex_ycbcr(const struct tu_pipeline_layout *layout,
}
static bool
lower_tex(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev,
lower_tex_impl(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev,
struct tu_shader *shader, const struct tu_pipeline_layout *layout,
uint32_t read_only_input_attachments, bool dynamic_renderpass)
uint32_t read_only_input_attachments, bool dynamic_renderpass,
bool ref)
{
lower_tex_ycbcr(layout, b, tex);
int sampler_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref);
int sampler_src_idx = nir_tex_instr_src_index(tex, ref ? nir_tex_src_sampler_2_deref : nir_tex_src_sampler_deref);
if (sampler_src_idx >= 0) {
nir_deref_instr *deref = nir_src_as_deref(tex->src[sampler_src_idx].src);
nir_def *bindless = build_bindless(dev, b, deref, true, shader, layout,
read_only_input_attachments,
dynamic_renderpass);
nir_src_rewrite(&tex->src[sampler_src_idx].src, bindless);
tex->src[sampler_src_idx].src_type = nir_tex_src_sampler_handle;
tex->src[sampler_src_idx].src_type = ref ? nir_tex_src_sampler_2_handle : nir_tex_src_sampler_handle;
}
int tex_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
int tex_src_idx = nir_tex_instr_src_index(tex, ref ? nir_tex_src_texture_2_deref : nir_tex_src_texture_deref);
if (tex_src_idx >= 0) {
nir_deref_instr *deref = nir_src_as_deref(tex->src[tex_src_idx].src);
nir_def *bindless = build_bindless(dev, b, deref, false, shader, layout,
read_only_input_attachments,
dynamic_renderpass);
nir_src_rewrite(&tex->src[tex_src_idx].src, bindless);
tex->src[tex_src_idx].src_type = nir_tex_src_texture_handle;
tex->src[tex_src_idx].src_type = ref ? nir_tex_src_texture_2_handle : nir_tex_src_texture_handle;
/* for the input attachment case: */
if (!nir_def_is_intrinsic(bindless))
@ -790,6 +789,24 @@ lower_tex(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev,
return true;
}
/* Lowers the texture/sampler derefs of a tex instruction via lower_tex_impl.
 *
 * The block-match and weighted-sample ops get a second lower_tex_impl call
 * with ref=true, which selects the texture_2/sampler_2 sources.  Those ops
 * skip lower_tex_ycbcr; every other op runs it first.  Always returns true.
 */
static bool
lower_tex(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev,
          struct tu_shader *shader, const struct tu_pipeline_layout *layout,
          uint32_t read_only_input_attachments, bool dynamic_renderpass)
{
   bool has_second_image;

   switch (tex->op) {
   case nir_texop_block_match_sad_qcom:
   case nir_texop_block_match_ssd_qcom:
   case nir_texop_sample_weighted_qcom:
      has_second_image = true;
      break;
   default:
      has_second_image = false;
      break;
   }

   if (!has_second_image)
      lower_tex_ycbcr(layout, b, tex);

   /* Primary texture/sampler pair. */
   lower_tex_impl(b, tex, dev, shader, layout, read_only_input_attachments,
                  dynamic_renderpass, false);

   /* Secondary texture/sampler pair, only for the image processing ops. */
   if (has_second_image) {
      lower_tex_impl(b, tex, dev, shader, layout, read_only_input_attachments,
                     dynamic_renderpass, true);
   }

   return true;
}
struct lower_instr_params {
struct tu_device *dev;
struct tu_shader *shader;