diff --git a/src/gallium/drivers/crocus/crocus_state.c b/src/gallium/drivers/crocus/crocus_state.c
index 54b9b8c4c75..711d99d708f 100644
--- a/src/gallium/drivers/crocus/crocus_state.c
+++ b/src/gallium/drivers/crocus/crocus_state.c
@@ -4843,8 +4843,9 @@ crocus_populate_fs_key(const struct crocus_context *ice,
    key->flat_shade = rast->cso.flatshade &&
       (info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1));
 
-   key->persample_interp = rast->cso.force_persample_interp;
    key->multisample_fbo = rast->cso.multisample && fb->samples > 1;
+   key->persample_interp =
+      rast->cso.force_persample_interp ? BRW_ALWAYS : BRW_NEVER;
 
    key->ignore_sample_mask_out = !key->multisample_fbo;
    key->coherent_fb_fetch = false; // TODO: needed?
diff --git a/src/gallium/drivers/iris/iris_program.c b/src/gallium/drivers/iris/iris_program.c
index 765bb51ec77..3a1c023b64b 100644
--- a/src/gallium/drivers/iris/iris_program.c
+++ b/src/gallium/drivers/iris/iris_program.c
@@ -159,7 +159,7 @@ iris_to_brw_fs_key(const struct iris_screen *screen,
       .alpha_test_replicate_alpha = key->alpha_test_replicate_alpha,
       .alpha_to_coverage = key->alpha_to_coverage,
       .clamp_fragment_color = key->clamp_fragment_color,
-      .persample_interp = key->persample_interp,
+      .persample_interp = key->persample_interp ? BRW_ALWAYS : BRW_NEVER,
       .multisample_fbo = key->multisample_fbo,
       .force_dual_color_blend = key->force_dual_color_blend,
       .coherent_fb_fetch = key->coherent_fb_fetch,
diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c
index 666763f16a8..5c02c0e2843 100644
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -6523,7 +6523,7 @@ iris_upload_dirty_render_state(struct iris_context *ice,
          wm.StatisticsEnable = ice->state.statistics_counters_enabled;
 
          wm.BarycentricInterpolationMode =
-            wm_prog_data->barycentric_interp_modes;
+            wm_prog_data_barycentric_modes(wm_prog_data, 0);
 
          if (wm_prog_data->early_fragment_tests)
             wm.EarlyDepthStencilControl = EDSC_PREPS;
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 8bcbe429f7a..c45e1a84eb5 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -498,7 +498,7 @@ struct brw_wm_prog_key {
     * us to run per-sample. Even when running per-sample due to gl_SampleID,
     * we may still interpolate unqualified inputs at the pixel center.
     */
-   bool persample_interp:1;
+   enum brw_sometimes persample_interp:2;
 
    bool multisample_fbo:1;
    enum brw_sometimes line_aa:2;
@@ -507,7 +507,7 @@ struct brw_wm_prog_key {
    bool ignore_sample_mask_out:1;
    bool coarse_pixel:1;
 
-   uint64_t padding:58;
+   uint64_t padding:57;
 };
 
 struct brw_cs_prog_key {
@@ -828,6 +828,10 @@ enum brw_barycentric_mode {
    BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE = 5,
    BRW_BARYCENTRIC_MODE_COUNT = 6
 };
+#define BRW_BARYCENTRIC_PERSPECTIVE_BITS \
+   ((1 << BRW_BARYCENTRIC_PERSPECTIVE_PIXEL) | \
+    (1 << BRW_BARYCENTRIC_PERSPECTIVE_CENTROID) | \
+    (1 << BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE))
 #define BRW_BARYCENTRIC_NONPERSPECTIVE_BITS \
    ((1 << BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL) | \
     (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID) | \
@@ -854,6 +858,9 @@ enum brw_wm_msaa_flags {
    /** True if this shader has been dispatched per-sample */
    BRW_WM_MSAA_FLAG_PERSAMPLE_DISPATCH = (1 << 2),
 
+   /** True if inputs should be interpolated per-sample by default */
+   BRW_WM_MSAA_FLAG_PERSAMPLE_INTERP = (1 << 3),
+
    /** True if this shader has been dispatched coarse
     *
     * This is intentionally chose to be bit 18 to correspond to the coarse
@@ -1090,6 +1097,65 @@ brw_wm_prog_data_is_persample(const struct brw_wm_prog_data *prog_data,
    return prog_data->persample_dispatch;
 }
 
+/**
+ * Returns the barycentric modes to program for the given dynamic MSAA flags.
+ *
+ * The result is prog_data->barycentric_interp_modes unless
+ * BRW_WM_MSAA_FLAG_PERSAMPLE_INTERP is set in pushed_msaa_flags. In that
+ * case, each [non]perspective group that has any mode requested but not
+ * SAMPLE gets its highest requested mode replaced by SAMPLE, which keeps
+ * the number of modes in each group unchanged.
+ */
+static inline uint32_t
+wm_prog_data_barycentric_modes(const struct brw_wm_prog_data *prog_data,
+                               enum brw_wm_msaa_flags pushed_msaa_flags)
+{
+   uint32_t modes = prog_data->barycentric_interp_modes;
+
+   if (pushed_msaa_flags & BRW_WM_MSAA_FLAG_PERSAMPLE_INTERP) {
+      assert(pushed_msaa_flags & BRW_WM_MSAA_FLAG_ENABLE_DYNAMIC);
+
+      assert(prog_data->persample_dispatch == BRW_ALWAYS ||
+             (pushed_msaa_flags & BRW_WM_MSAA_FLAG_PERSAMPLE_DISPATCH));
+
+      /* Making dynamic per-sample interpolation work is a bit tricky. The
+       * hardware will hang if SAMPLE is requested but per-sample dispatch is
+       * not enabled. This means we can't preemptively add SAMPLE to the
+       * barycentrics bitfield. Instead, we have to add it late and only
+       * on-demand. Annoyingly, changing the number of barycentrics requested
+       * changes the whole PS shader payload so we very much don't want to do
+       * that. Instead, if the dynamic per-sample interpolation flag is set,
+       * we check to see if SAMPLE was requested and, if not, replace the
+       * highest barycentric bit in the [non]perspective grouping (CENTROID,
+       * if it exists, else PIXEL) with SAMPLE. The shader will stomp all the
+       * barycentrics in the shader with SAMPLE so it really doesn't matter
+       * which one we replace. The important thing is that we keep the number
+       * of barycentrics in each [non]perspective grouping the same.
+       */
+      if ((modes & BRW_BARYCENTRIC_PERSPECTIVE_BITS) &&
+          !(modes & BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE))) {
+         int sample_mode =
+            util_last_bit(modes & BRW_BARYCENTRIC_PERSPECTIVE_BITS) - 1;
+         assert(modes & BITFIELD_BIT(sample_mode));
+
+         modes &= ~BITFIELD_BIT(sample_mode);
+         modes |= BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE);
+      }
+
+      if ((modes & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) &&
+          !(modes & BITFIELD_BIT(BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))) {
+         int sample_mode =
+            util_last_bit(modes & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) - 1;
+         assert(modes & BITFIELD_BIT(sample_mode));
+
+         modes &= ~BITFIELD_BIT(sample_mode);
+         modes |= BITFIELD_BIT(BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE);
+      }
+   }
+
+   return modes;
+}
+
 static inline bool
 brw_wm_prog_data_is_coarse(const struct brw_wm_prog_data *prog_data,
                            enum brw_wm_msaa_flags pushed_msaa_flags)
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 98aab8253ee..85e21560414 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -7284,9 +7284,10 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader,
       shader->info.fs.uses_sample_shading ||
       shader->info.outputs_read;
 
-   prog_data->persample_dispatch = BRW_NEVER;
-   if (key->multisample_fbo &&
-       (key->persample_interp || prog_data->sample_shading))
+   assert(key->multisample_fbo || key->persample_interp == BRW_NEVER);
+
+   prog_data->persample_dispatch = key->persample_interp;
+   if (key->multisample_fbo && prog_data->sample_shading)
       prog_data->persample_dispatch = BRW_ALWAYS;
 
    if (devinfo->ver >= 6) {
diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp
index c9ab1c454de..791ef261c6c 100644
--- a/src/intel/compiler/brw_fs_visitor.cpp
+++ b/src/intel/compiler/brw_fs_visitor.cpp
@@ -289,6 +289,7 @@ fs_visitor::emit_interpolation_setup_gfx6()
    this->pixel_x = vgrf(glsl_type::float_type);
    this->pixel_y = vgrf(glsl_type::float_type);
 
+   const struct brw_wm_prog_key *wm_key = (brw_wm_prog_key*) this->key;
    struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(prog_data);
 
    fs_reg int_sample_offset_x, int_sample_offset_y; /* Used on Gen12HP+ */
@@ -394,48 +395,58 @@ fs_visitor::emit_interpolation_setup_gfx6()
    fs_reg half_int_pixel_offset_x, half_int_pixel_offset_y;
    switch (wm_prog_data->coarse_pixel_dispatch) {
    case BRW_NEVER:
-#define COPY_OFFSET_REG(prefix, suffix) \
-   prefix##_pixel_##suffix = prefix##_sample_##suffix;
-
-      COPY_OFFSET_REG(int, offset_x)
-      COPY_OFFSET_REG(int, offset_y)
-      COPY_OFFSET_REG(int, offset_xy)
-      COPY_OFFSET_REG(half_int, offset_x)
-      COPY_OFFSET_REG(half_int, offset_y)
-
-#undef COPY_OFFSET_REG
+      int_pixel_offset_x = int_sample_offset_x;
+      int_pixel_offset_y = int_sample_offset_y;
+      int_pixel_offset_xy = int_sample_offset_xy;
+      half_int_pixel_offset_x = half_int_sample_offset_x;
+      half_int_pixel_offset_y = half_int_sample_offset_y;
       break;
 
-   case BRW_SOMETIMES:
-      check_dynamic_msaa_flag(bld, wm_prog_data,
+   case BRW_SOMETIMES: {
+      const fs_builder dbld =
+         abld.exec_all().group(MIN2(16, dispatch_width) * 2, 0);
+
+      check_dynamic_msaa_flag(dbld, wm_prog_data,
                               BRW_WM_MSAA_FLAG_COARSE_DISPATCH);
 
-#define COPY_OFFSET_REG(prefix, suffix) \
-   prefix##_pixel_##suffix = bld.vgrf(BRW_REGISTER_TYPE_UW); \
-   bld.SEL(prefix##_pixel_##suffix, \
-           prefix##_coarse_##suffix, \
-           prefix##_pixel_##suffix); \
+      int_pixel_offset_x = dbld.vgrf(BRW_REGISTER_TYPE_UW);
+      set_predicate(BRW_PREDICATE_NORMAL,
+                    dbld.SEL(int_pixel_offset_x,
+                             int_coarse_offset_x,
+                             int_sample_offset_x));
 
-      COPY_OFFSET_REG(int, offset_x)
-      COPY_OFFSET_REG(int, offset_y)
-      COPY_OFFSET_REG(int, offset_xy)
-      COPY_OFFSET_REG(half_int, offset_x)
-      COPY_OFFSET_REG(half_int, offset_y)
+      int_pixel_offset_y = dbld.vgrf(BRW_REGISTER_TYPE_UW);
+      set_predicate(BRW_PREDICATE_NORMAL,
+                    dbld.SEL(int_pixel_offset_y,
+                             int_coarse_offset_y,
+                             int_sample_offset_y));
 
-#undef COPY_OFFSET_REG
+      int_pixel_offset_xy = dbld.vgrf(BRW_REGISTER_TYPE_UW);
+      set_predicate(BRW_PREDICATE_NORMAL,
+                    dbld.SEL(int_pixel_offset_xy,
+                             int_coarse_offset_xy,
+                             int_sample_offset_xy));
+
+      half_int_pixel_offset_x = bld.vgrf(BRW_REGISTER_TYPE_UW);
+      set_predicate(BRW_PREDICATE_NORMAL,
+                    bld.SEL(half_int_pixel_offset_x,
+                            half_int_coarse_offset_x,
+                            half_int_sample_offset_x));
+
+      half_int_pixel_offset_y = bld.vgrf(BRW_REGISTER_TYPE_UW);
+      set_predicate(BRW_PREDICATE_NORMAL,
+                    bld.SEL(half_int_pixel_offset_y,
+                            half_int_coarse_offset_y,
+                            half_int_sample_offset_y));
       break;
+   }
 
    case BRW_ALWAYS:
-#define COPY_OFFSET_REG(prefix, suffix) \
-   prefix##_pixel_##suffix = prefix##_coarse_##suffix;
-
-      COPY_OFFSET_REG(int, offset_x)
-      COPY_OFFSET_REG(int, offset_y)
-      COPY_OFFSET_REG(int, offset_xy)
-      COPY_OFFSET_REG(half_int, offset_x)
-      COPY_OFFSET_REG(half_int, offset_y)
-
-#undef COPY_OFFSET_REG
+      int_pixel_offset_x = int_coarse_offset_x;
+      int_pixel_offset_y = int_coarse_offset_y;
+      int_pixel_offset_xy = int_coarse_offset_xy;
+      half_int_pixel_offset_x = half_int_coarse_offset_x;
+      half_int_pixel_offset_y = half_int_coarse_offset_y;
       break;
    }
 
@@ -605,6 +616,59 @@ fs_visitor::emit_interpolation_setup_gfx6()
       abld.emit(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
    }
 
+   if (wm_key->persample_interp == BRW_SOMETIMES) {
+      assert(!devinfo->needs_unlit_centroid_workaround);
+
+      const fs_builder ubld = bld.exec_all().group(16, 0);
+      bool loaded_flag = false;
+
+      for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
+         if (!(wm_prog_data->barycentric_interp_modes & BITFIELD_BIT(i)))
+            continue;
+
+         /* The sample mode will always be the top bit set in the perspective
+          * or non-perspective section. In the case where no SAMPLE mode was
+          * requested, wm_prog_data_barycentric_modes() will swap out the top
+          * mode for SAMPLE so this works regardless of whether SAMPLE was
+          * requested or not.
+          */
+         int sample_mode;
+         if (BITFIELD_BIT(i) & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) {
+            sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes &
+                                        BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) - 1;
+         } else {
+            sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes &
+                                        BRW_BARYCENTRIC_PERSPECTIVE_BITS) - 1;
+         }
+         assert(wm_prog_data->barycentric_interp_modes &
+                BITFIELD_BIT(sample_mode));
+
+         if (i == sample_mode)
+            continue;
+
+         uint8_t *barys = fs_payload().barycentric_coord_reg[i];
+
+         uint8_t *sample_barys = fs_payload().barycentric_coord_reg[sample_mode];
+         assert(barys[0] && sample_barys[0]);
+
+         if (!loaded_flag) {
+            check_dynamic_msaa_flag(ubld, wm_prog_data,
+                                    BRW_WM_MSAA_FLAG_PERSAMPLE_INTERP);
+            /* The predicated MOVs below only read the flag, so one check
+             * covers every barycentric mode.
+             */
+            loaded_flag = true;
+         }
+
+         for (unsigned j = 0; j < dispatch_width / 8; j++) {
+            fs_inst *mov =
+               ubld.MOV(brw_vec8_grf(barys[j / 2] + (j % 2) * 2, 0),
+                        brw_vec8_grf(sample_barys[j / 2] + (j % 2) * 2, 0));
+            mov->predicate = BRW_PREDICATE_NORMAL;
+         }
+      }
+   }
+
    for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
       this->delta_xy[i] = fetch_barycentric_reg(
          bld, fs_payload().barycentric_coord_reg[i]);
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index a0da8baa387..cd6994cd589 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -555,7 +555,7 @@ brw_nir_lower_fs_inputs(nir_shader *nir,
 
    if (!key->multisample_fbo) {
       nir_lower_single_sampled(nir);
-   } else if (key->persample_interp) {
+   } else if (key->persample_interp == BRW_ALWAYS) {
       nir_shader_instructions_pass(nir, lower_barycentric_per_sample,
                                    nir_metadata_block_index |
                                    nir_metadata_dominance,
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 08132529959..11e952ba0ad 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -556,8 +556,10 @@ populate_wm_prog_key(const struct anv_graphics_pipeline *pipeline,
        * harmless to compute it and then let dead-code take care of it.
        */
       if (ms->rasterization_samples > 1) {
-         key->persample_interp = ms->sample_shading_enable &&
-            (ms->min_sample_shading * ms->rasterization_samples) > 1;
+         key->persample_interp =
+            (ms->sample_shading_enable &&
+             (ms->min_sample_shading * ms->rasterization_samples) > 1) ?
+            BRW_ALWAYS : BRW_NEVER;
          key->multisample_fbo = true;
       }
 
diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
index 983d4f5c904..7b8eae3427e 100644
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -1500,7 +1500,7 @@ emit_3dstate_wm(struct anv_graphics_pipeline *pipeline,
                                          wm_prog_data->uses_kill;
 
       wm.BarycentricInterpolationMode =
-         wm_prog_data->barycentric_interp_modes;
+         wm_prog_data_barycentric_modes(wm_prog_data, 0);
    }
 
    GENX(3DSTATE_WM_pack)(NULL, pipeline->gfx8.wm, &wm);
@@ -1615,7 +1615,8 @@ emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
 #if GFX_VER >= 11
       ps.PixelShaderRequiresSourceDepthandorWPlaneCoefficients =
          wm_prog_data->uses_depth_w_coefficients;
-      ps.PixelShaderIsPerCoarsePixel = wm_prog_data->coarse_pixel_dispatch;
+      ps.PixelShaderIsPerCoarsePixel =
+         brw_wm_prog_data_is_coarse(wm_prog_data, 0);
 #endif
 #if GFX_VERx10 >= 125
       /* TODO: We should only require this when the last geometry shader uses
diff --git a/src/intel/vulkan_hasvk/anv_pipeline.c b/src/intel/vulkan_hasvk/anv_pipeline.c
index c756441967a..4318f9d1658 100644
--- a/src/intel/vulkan_hasvk/anv_pipeline.c
+++ b/src/intel/vulkan_hasvk/anv_pipeline.c
@@ -373,8 +373,10 @@ populate_wm_prog_key(const struct anv_graphics_pipeline *pipeline,
        * harmless to compute it and then let dead-code take care of it.
        */
       if (ms->rasterization_samples > 1) {
-         key->persample_interp = ms->sample_shading_enable &&
-            (ms->min_sample_shading * ms->rasterization_samples) > 1;
+         key->persample_interp =
+            (ms->sample_shading_enable &&
+             (ms->min_sample_shading * ms->rasterization_samples) > 1) ?
+            BRW_ALWAYS : BRW_NEVER;
          key->multisample_fbo = true;
       }
 
diff --git a/src/intel/vulkan_hasvk/genX_pipeline.c b/src/intel/vulkan_hasvk/genX_pipeline.c
index a22df5bda73..39da75d377e 100644
--- a/src/intel/vulkan_hasvk/genX_pipeline.c
+++ b/src/intel/vulkan_hasvk/genX_pipeline.c
@@ -1580,7 +1580,7 @@ emit_3dstate_wm(struct anv_graphics_pipeline *pipeline,
 #endif
 
       wm.BarycentricInterpolationMode =
-         wm_prog_data->barycentric_interp_modes;
+         wm_prog_data_barycentric_modes(wm_prog_data, 0);
 
 #if GFX_VER < 8
       wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;