diff --git a/src/freedreno/ci/freedreno-a618-fails.txt b/src/freedreno/ci/freedreno-a618-fails.txt index d88caea8f84..707870d97a4 100644 --- a/src/freedreno/ci/freedreno-a618-fails.txt +++ b/src/freedreno/ci/freedreno-a618-fails.txt @@ -1,12 +1,6 @@ # https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3505 dEQP-VK.subgroups.multiple_dispatches.uniform_subgroup_size,Fail -# CTS 1.3.1.0 uprev: -dEQP-VK.image.sample_texture.128_bit_compressed_format_cubemap,Fail -dEQP-VK.image.sample_texture.64_bit_compressed_format_cubemap,Fail - -spill-dEQP-VK.graphicsfuzz.cov-function-two-loops-limit-using-arguments-array-element-copies,Fail - # Fails when TU_DEBUG=forcebin is set gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_geom,Fail gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_vert,Fail diff --git a/src/freedreno/ci/freedreno-a618-flakes.txt b/src/freedreno/ci/freedreno-a618-flakes.txt index 3d0853fde5a..2c3087eb3a5 100644 --- a/src/freedreno/ci/freedreno-a618-flakes.txt +++ b/src/freedreno/ci/freedreno-a618-flakes.txt @@ -4,10 +4,3 @@ dEQP-VK.pipeline.multisample.alpha_to_coverage_unused_attachment.samples_2.alpha_opaque dEQP-VK.pipeline.multisample.alpha_to_coverage_unused_attachment.samples_4.alpha_opaque - -# Could trip hangcheck timeout -dEQP-VK.api.command_buffers.record_many_draws_primary_2 -dEQP-VK.api.command_buffers.record_many_draws_secondary_2 - -# Sometimes hangchecks -spill-dEQP-VK.graphicsfuzz.spv-stable-maze-O-dead-code diff --git a/src/freedreno/ci/freedreno-a618-skips.txt b/src/freedreno/ci/freedreno-a618-skips.txt index 2e625669aca..9e2cb4caafe 100644 --- a/src/freedreno/ci/freedreno-a618-skips.txt +++ b/src/freedreno/ci/freedreno-a618-skips.txt @@ -25,11 +25,22 @@ dEQP-VK.ubo.random.all_shared_buffer.48 # Still running after 3 hours, time is spent in batch_draw_tracking(). KHR-GLES31.core.shader_image_load_store.basic-allFormats-store-fs -# causes a hangcheck timeout on a630: +# causes a hangcheck timeout on a618: # msm ae00000.mdss: [drm:hangcheck_handler] *ERROR* A618: hangcheck detected gpu lockup rb 0! +# +# even if they sometimes pass and could be categorized as flakes, we skip them +# because device loss will end up failing the rest of the caselist. +dEQP-VK.api.command_buffers.record_many_draws_primary_2 +dEQP-VK.api.command_buffers.record_many_draws_secondary_2 dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite +spill-dEQP-VK.graphicsfuzz.cov-function-two-loops-limit-using-arguments-array-element-copies spill-dEQP-VK.graphicsfuzz.cov-nested-loop-undefined-smoothstep-never-executed +spill-dEQP-VK.graphicsfuzz.spv-stable-maze-O-dead-code spill-dEQP-VK.graphicsfuzz.spv-stable-maze-O-memory-accesses +# Hangs the GPU, fixed to be a skip in VK-GL-CTS 736eec57dc0c ("Fix checkSupport in compressed texture sampling tests") +dEQP-VK.image.sample_texture.128_bit_compressed_format_cubemap +dEQP-VK.image.sample_texture.64_bit_compressed_format_cubemap + # Crashes in RA, but slow enough to get there that CI times out sometimes dEQP-VK.spirv_assembly.instruction.*.spirv_ids_abuse.lots_ids.* diff --git a/src/freedreno/ci/freedreno-a630-fails.txt b/src/freedreno/ci/freedreno-a630-fails.txt index bfdb31e70de..58080e09381 100644 --- a/src/freedreno/ci/freedreno-a630-fails.txt +++ b/src/freedreno/ci/freedreno-a630-fails.txt @@ -34,9 +34,6 @@ bypass-dEQP-GLES31.functional.blend_equation_advanced.msaa.softlight,Fail # https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3505 dEQP-VK.subgroups.multiple_dispatches.uniform_subgroup_size,Fail -# Showed up with VK-GL-CTS 1.3.1.0: -spill-dEQP-VK.graphicsfuzz.cov-function-two-loops-limit-using-arguments-array-element-copies,Fail - # Fails when TU_DEBUG=forcebin is set gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_geom,Fail gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_vert,Fail diff --git a/src/freedreno/ci/freedreno-a630-flakes.txt b/src/freedreno/ci/freedreno-a630-flakes.txt index 4e7f435402a..69395851673 100644 --- a/src/freedreno/ci/freedreno-a630-flakes.txt +++ b/src/freedreno/ci/freedreno-a630-flakes.txt @@ -91,13 +91,6 @@ dEQP-GLES31.functional.layout_binding.ssbo.fragment_binding_array dEQP-GLES3.functional.fbo.blit.conversion.rg8i_to_r16i dEQP-GLES3.functional.fbo.blit.conversion.rg8_to_r16f -# Could trip hangcheck timeout -dEQP-VK.api.command_buffers.record_many_draws_primary_2 -dEQP-VK.api.command_buffers.record_many_draws_secondary_2 - -# Looks likely to be a hangcheck trigger. -spill-dEQP-VK.graphicsfuzz.cov-nested-loop-large-array-index-using-vector-components - # First noticed Jun 1 2020 on an innocent branch. KHR-GL33.packed_depth_stencil.verify_copy_tex_image.depth32f_stencil8 diff --git a/src/freedreno/ci/freedreno-a630-skips.txt b/src/freedreno/ci/freedreno-a630-skips.txt index a05d7144341..b7d49e80510 100644 --- a/src/freedreno/ci/freedreno-a630-skips.txt +++ b/src/freedreno/ci/freedreno-a630-skips.txt @@ -18,6 +18,15 @@ dEQP-VK.tessellation.invariance.outer_triangle_set.triangles_equal_spacing dEQP-VK.tessellation.invariance.outer_triangle_set.triangles_fractional_even_spacing dEQP-VK.tessellation.invariance.outer_triangle_set.triangles_fractional_odd_spacing +# Can cause a hangcheck. +# +# even if they sometimes pass and could be categorized as flakes, we skip them +# because device loss will end up failing the rest of the caselist. +dEQP-VK.api.command_buffers.record_many_draws_primary_2 +dEQP-VK.api.command_buffers.record_many_draws_secondary_2 +spill-dEQP-VK.graphicsfuzz.cov-function-two-loops-limit-using-arguments-array-element-copies +spill-dEQP-VK.graphicsfuzz.cov-nested-loop-large-array-index-using-vector-components + # timeout, spending all its time in nir_compare_deref_paths() # https://gitlab.freedesktop.org/mesa/mesa/-/issues/5152 dEQP-VK.ubo.random.all_shared_buffer.48 diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c index a1684b848f2..b1ba78dc37c 100644 --- a/src/freedreno/vulkan/tu_device.c +++ b/src/freedreno/vulkan/tu_device.c @@ -1699,6 +1699,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, device->instance = physical_device->instance; device->physical_device = physical_device; device->fd = physical_device->local_fd; + device->vk.check_status = tu_device_check_status; mtx_init(&device->bo_mutex, mtx_plain); u_rwlock_init(&device->dma_bo_lock); diff --git a/src/freedreno/vulkan/tu_drm.c b/src/freedreno/vulkan/tu_drm.c index 92ff8755c1e..9e46b08a1f0 100644 --- a/src/freedreno/vulkan/tu_drm.c +++ b/src/freedreno/vulkan/tu_drm.c @@ -137,6 +137,23 @@ tu_device_get_suspend_count(struct tu_device *dev, uint64_t *suspend_count) return ret; } +VkResult +tu_device_check_status(struct vk_device *vk_device) +{ + struct tu_device *device = container_of(vk_device, struct tu_device, vk); + struct tu_physical_device *physical_device = device->physical_device; + + uint64_t last_fault_count = physical_device->fault_count; + int ret = tu_drm_get_param(physical_device, MSM_PARAM_FAULTS, &physical_device->fault_count); + if (ret != 0) + return vk_device_set_lost(&device->vk, "error getting GPU fault count: %d", ret); + + if (last_fault_count != physical_device->fault_count) + return vk_device_set_lost(&device->vk, "GPU faulted or hung"); + + return VK_SUCCESS; +} + int tu_drm_submitqueue_new(const struct tu_device *dev, int priority, @@ -729,6 +746,13 @@ tu_drm_device_init(struct tu_physical_device *device, goto fail; } + int ret = tu_drm_get_param(device, MSM_PARAM_FAULTS, &device->fault_count); + if (ret != 0) { + result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "Failed to get initial fault count: %d", ret); + goto fail; + } + device->syncobj_type = vk_drm_syncobj_get_type(fd); device->timeline_type = vk_sync_timeline_get_type(&tu_timeline_sync_type); diff --git a/src/freedreno/vulkan/tu_kgsl.c b/src/freedreno/vulkan/tu_kgsl.c index 0caece7ffea..55fae932da3 100644 --- a/src/freedreno/vulkan/tu_kgsl.c +++ b/src/freedreno/vulkan/tu_kgsl.c @@ -706,6 +706,33 @@ tu_device_get_suspend_count(struct tu_device *dev, uint64_t *suspend_count) return 0; } +VkResult +tu_device_check_status(struct vk_device *vk_device) +{ + struct tu_device *device = container_of(vk_device, struct tu_device, vk); + + for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) { + for (unsigned q = 0; q < device->queue_count[i]; q++) { + /* KGSL's KGSL_PROP_GPU_RESET_STAT takes the u32 msm_queue_id and returns a + * KGSL_CTX_STAT_* for the worst reset that happened since the last time it + * was queried on that queue. + */ + uint32_t value = device->queues[i][q].msm_queue_id; + VkResult status = get_kgsl_prop(device->fd, KGSL_PROP_GPU_RESET_STAT, + &value, sizeof(value)); + if (status != VK_SUCCESS) + return vk_device_set_lost(&device->vk, "Failed to get GPU reset status"); + + if (value != KGSL_CTX_STAT_NO_ERROR && + value != KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT) { + return vk_device_set_lost(&device->vk, "GPU faulted or hung"); + } + } + } + + return VK_SUCCESS; +} + #ifdef ANDROID VKAPI_ATTR VkResult VKAPI_CALL tu_QueueSignalReleaseImageANDROID(VkQueue _queue, diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index 50f6a648a4d..2d0e4e756c3 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -236,6 +236,9 @@ struct tu_physical_device int msm_major_version; int msm_minor_version; + /* Address space and global fault count for this local_fd with DRM backend */ + uint64_t fault_count; + /* This is the drivers on-disk cache used as a fallback as opposed to * the pipeline cache defined by apps. */ @@ -538,6 +541,9 @@ tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj uint64_t tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts); +VkResult +tu_device_check_status(struct vk_device *vk_device); + enum tu_bo_alloc_flags { TU_BO_ALLOC_NO_FLAGS = 0,