From 2f25d16653608f69b9dff39678b65dcc67ebed00 Mon Sep 17 00:00:00 2001 From: Emma Anholt Date: Wed, 2 Feb 2022 12:59:54 -0800 Subject: [PATCH] turnip: Use the DRM or KGSL GPU reset status ioctls to report device loss. ANGLE-on-venus-on-turnip and zink-on-turnip want real data here for EGL's reset tests. This required moving the remaining GPU-reset-causing tests from flakes or xfails to skips. Otherwise, the rest of the caselist associated with them ends up being marked as fails as well. The alternative would be to put these tests in their own test groups with tests_per_group = 1, but that didn't seem worth the effort. Or, we could finally do something with https://gitlab.freedesktop.org/anholt/deqp-runner/-/issues/14. Fixes: #5955 Part-of: --- src/freedreno/ci/freedreno-a618-fails.txt | 6 ----- src/freedreno/ci/freedreno-a618-flakes.txt | 7 ------ src/freedreno/ci/freedreno-a618-skips.txt | 13 ++++++++++- src/freedreno/ci/freedreno-a630-fails.txt | 3 --- src/freedreno/ci/freedreno-a630-flakes.txt | 7 ------ src/freedreno/ci/freedreno-a630-skips.txt | 9 ++++++++ src/freedreno/vulkan/tu_device.c | 1 + src/freedreno/vulkan/tu_drm.c | 24 +++++++++++++++++++ src/freedreno/vulkan/tu_kgsl.c | 27 ++++++++++++++++++++++ src/freedreno/vulkan/tu_private.h | 6 +++++ 10 files changed, 79 insertions(+), 24 deletions(-) diff --git a/src/freedreno/ci/freedreno-a618-fails.txt b/src/freedreno/ci/freedreno-a618-fails.txt index d88caea8f84..707870d97a4 100644 --- a/src/freedreno/ci/freedreno-a618-fails.txt +++ b/src/freedreno/ci/freedreno-a618-fails.txt @@ -1,12 +1,6 @@ # https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3505 dEQP-VK.subgroups.multiple_dispatches.uniform_subgroup_size,Fail -# CTS 1.3.1.0 uprev: -dEQP-VK.image.sample_texture.128_bit_compressed_format_cubemap,Fail -dEQP-VK.image.sample_texture.64_bit_compressed_format_cubemap,Fail - -spill-dEQP-VK.graphicsfuzz.cov-function-two-loops-limit-using-arguments-array-element-copies,Fail - # Fails when TU_DEBUG=forcebin is set gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_geom,Fail gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_vert,Fail diff --git a/src/freedreno/ci/freedreno-a618-flakes.txt b/src/freedreno/ci/freedreno-a618-flakes.txt index 3d0853fde5a..2c3087eb3a5 100644 --- a/src/freedreno/ci/freedreno-a618-flakes.txt +++ b/src/freedreno/ci/freedreno-a618-flakes.txt @@ -4,10 +4,3 @@ dEQP-VK.pipeline.multisample.alpha_to_coverage_unused_attachment.samples_2.alpha_opaque dEQP-VK.pipeline.multisample.alpha_to_coverage_unused_attachment.samples_4.alpha_opaque - -# Could trip hangcheck timeout -dEQP-VK.api.command_buffers.record_many_draws_primary_2 -dEQP-VK.api.command_buffers.record_many_draws_secondary_2 - -# Sometimes hangchecks -spill-dEQP-VK.graphicsfuzz.spv-stable-maze-O-dead-code diff --git a/src/freedreno/ci/freedreno-a618-skips.txt b/src/freedreno/ci/freedreno-a618-skips.txt index 2e625669aca..9e2cb4caafe 100644 --- a/src/freedreno/ci/freedreno-a618-skips.txt +++ b/src/freedreno/ci/freedreno-a618-skips.txt @@ -25,11 +25,22 @@ dEQP-VK.ubo.random.all_shared_buffer.48 # Still running after 3 hours, time is spent in batch_draw_tracking(). KHR-GLES31.core.shader_image_load_store.basic-allFormats-store-fs -# causes a hangcheck timeout on a630: +# causes a hangcheck timeout on a618: # msm ae00000.mdss: [drm:hangcheck_handler] *ERROR* A618: hangcheck detected gpu lockup rb 0! +# +# even if they sometimes pass and could be categorized as flakes, we skip them +# because device loss will end up failing the rest of the caselist. +dEQP-VK.api.command_buffers.record_many_draws_primary_2 +dEQP-VK.api.command_buffers.record_many_draws_secondary_2 dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite +spill-dEQP-VK.graphicsfuzz.cov-function-two-loops-limit-using-arguments-array-element-copies spill-dEQP-VK.graphicsfuzz.cov-nested-loop-undefined-smoothstep-never-executed +spill-dEQP-VK.graphicsfuzz.spv-stable-maze-O-dead-code spill-dEQP-VK.graphicsfuzz.spv-stable-maze-O-memory-accesses +# Hangs the GPU, fixed to be a skip in VK-GL-CTS 736eec57dc0c ("Fix checkSupport in compressed texture sampling tests") +dEQP-VK.image.sample_texture.128_bit_compressed_format_cubemap +dEQP-VK.image.sample_texture.64_bit_compressed_format_cubemap + # Crashes in RA, but slow enough to get there that CI times out sometimes dEQP-VK.spirv_assembly.instruction.*.spirv_ids_abuse.lots_ids.* diff --git a/src/freedreno/ci/freedreno-a630-fails.txt b/src/freedreno/ci/freedreno-a630-fails.txt index bfdb31e70de..58080e09381 100644 --- a/src/freedreno/ci/freedreno-a630-fails.txt +++ b/src/freedreno/ci/freedreno-a630-fails.txt @@ -34,9 +34,6 @@ bypass-dEQP-GLES31.functional.blend_equation_advanced.msaa.softlight,Fail # https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3505 dEQP-VK.subgroups.multiple_dispatches.uniform_subgroup_size,Fail -# Showed up with VK-GL-CTS 1.3.1.0: -spill-dEQP-VK.graphicsfuzz.cov-function-two-loops-limit-using-arguments-array-element-copies,Fail - # Fails when TU_DEBUG=forcebin is set gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_geom,Fail gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_vert,Fail diff --git a/src/freedreno/ci/freedreno-a630-flakes.txt b/src/freedreno/ci/freedreno-a630-flakes.txt index 4e7f435402a..69395851673 100644 --- a/src/freedreno/ci/freedreno-a630-flakes.txt +++ b/src/freedreno/ci/freedreno-a630-flakes.txt @@ -91,13 +91,6 @@ dEQP-GLES31.functional.layout_binding.ssbo.fragment_binding_array dEQP-GLES3.functional.fbo.blit.conversion.rg8i_to_r16i dEQP-GLES3.functional.fbo.blit.conversion.rg8_to_r16f -# Could trip hangcheck timeout -dEQP-VK.api.command_buffers.record_many_draws_primary_2 -dEQP-VK.api.command_buffers.record_many_draws_secondary_2 - -# Looks likely to be a hangcheck trigger. -spill-dEQP-VK.graphicsfuzz.cov-nested-loop-large-array-index-using-vector-components - # First noticed Jun 1 2020 on an innocent branch. KHR-GL33.packed_depth_stencil.verify_copy_tex_image.depth32f_stencil8 diff --git a/src/freedreno/ci/freedreno-a630-skips.txt b/src/freedreno/ci/freedreno-a630-skips.txt index a05d7144341..b7d49e80510 100644 --- a/src/freedreno/ci/freedreno-a630-skips.txt +++ b/src/freedreno/ci/freedreno-a630-skips.txt @@ -18,6 +18,15 @@ dEQP-VK.tessellation.invariance.outer_triangle_set.triangles_equal_spacing dEQP-VK.tessellation.invariance.outer_triangle_set.triangles_fractional_even_spacing dEQP-VK.tessellation.invariance.outer_triangle_set.triangles_fractional_odd_spacing +# Can cause a hangcheck. +# +# even if they sometimes pass and could be categorized as flakes, we skip them +# because device loss will end up failing the rest of the caselist. +dEQP-VK.api.command_buffers.record_many_draws_primary_2 +dEQP-VK.api.command_buffers.record_many_draws_secondary_2 +spill-dEQP-VK.graphicsfuzz.cov-function-two-loops-limit-using-arguments-array-element-copies +spill-dEQP-VK.graphicsfuzz.cov-nested-loop-large-array-index-using-vector-components + # timeout, spending all its time in nir_compare_deref_paths() # https://gitlab.freedesktop.org/mesa/mesa/-/issues/5152 dEQP-VK.ubo.random.all_shared_buffer.48 diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c index a1684b848f2..b1ba78dc37c 100644 --- a/src/freedreno/vulkan/tu_device.c +++ b/src/freedreno/vulkan/tu_device.c @@ -1699,6 +1699,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, device->instance = physical_device->instance; device->physical_device = physical_device; device->fd = physical_device->local_fd; + device->vk.check_status = tu_device_check_status; mtx_init(&device->bo_mutex, mtx_plain); u_rwlock_init(&device->dma_bo_lock); diff --git a/src/freedreno/vulkan/tu_drm.c b/src/freedreno/vulkan/tu_drm.c index 92ff8755c1e..9e46b08a1f0 100644 --- a/src/freedreno/vulkan/tu_drm.c +++ b/src/freedreno/vulkan/tu_drm.c @@ -137,6 +137,23 @@ tu_device_get_suspend_count(struct tu_device *dev, uint64_t *suspend_count) return ret; } +VkResult +tu_device_check_status(struct vk_device *vk_device) +{ + struct tu_device *device = container_of(vk_device, struct tu_device, vk); + struct tu_physical_device *physical_device = device->physical_device; + + uint64_t last_fault_count = physical_device->fault_count; + int ret = tu_drm_get_param(physical_device, MSM_PARAM_FAULTS, &physical_device->fault_count); + if (ret != 0) + return vk_device_set_lost(&device->vk, "error getting GPU fault count: %d", ret); + + if (last_fault_count != physical_device->fault_count) + return vk_device_set_lost(&device->vk, "GPU faulted or hung"); + + return VK_SUCCESS; +} + int tu_drm_submitqueue_new(const struct tu_device *dev, int priority, @@ -729,6 +746,13 @@ tu_drm_device_init(struct tu_physical_device *device, goto fail; } + int ret = tu_drm_get_param(device, MSM_PARAM_FAULTS, &device->fault_count); + if (ret != 0) { + result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "Failed to get initial fault count: %d", ret); + goto fail; + } + device->syncobj_type = vk_drm_syncobj_get_type(fd); device->timeline_type = vk_sync_timeline_get_type(&tu_timeline_sync_type); diff --git a/src/freedreno/vulkan/tu_kgsl.c b/src/freedreno/vulkan/tu_kgsl.c index 0caece7ffea..55fae932da3 100644 --- a/src/freedreno/vulkan/tu_kgsl.c +++ b/src/freedreno/vulkan/tu_kgsl.c @@ -706,6 +706,33 @@ tu_device_get_suspend_count(struct tu_device *dev, uint64_t *suspend_count) return 0; } +VkResult +tu_device_check_status(struct vk_device *vk_device) +{ + struct tu_device *device = container_of(vk_device, struct tu_device, vk); + + for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) { + for (unsigned q = 0; q < device->queue_count[i]; q++) { + /* KGSL's KGSL_PROP_GPU_RESET_STAT takes the u32 msm_queue_id and returns a + * KGSL_CTX_STAT_* for the worst reset that happened since the last time it + * was queried on that queue. + */ + uint32_t value = device->queues[i][q].msm_queue_id; + VkResult status = get_kgsl_prop(device->fd, KGSL_PROP_GPU_RESET_STAT, + &value, sizeof(value)); + if (status != VK_SUCCESS) + return vk_device_set_lost(&device->vk, "Failed to get GPU reset status"); + + if (value != KGSL_CTX_STAT_NO_ERROR && + value != KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT) { + return vk_device_set_lost(&device->vk, "GPU faulted or hung"); + } + } + } + + return VK_SUCCESS; +} + #ifdef ANDROID VKAPI_ATTR VkResult VKAPI_CALL tu_QueueSignalReleaseImageANDROID(VkQueue _queue, diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index 50f6a648a4d..2d0e4e756c3 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -236,6 +236,9 @@ struct tu_physical_device int msm_major_version; int msm_minor_version; + /* Address space and global fault count for this local_fd with DRM backend */ + uint64_t fault_count; + /* This is the drivers on-disk cache used as a fallback as opposed to * the pipeline cache defined by apps. */ @@ -538,6 +541,9 @@ tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj uint64_t tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts); +VkResult +tu_device_check_status(struct vk_device *vk_device); + enum tu_bo_alloc_flags { TU_BO_ALLOC_NO_FLAGS = 0,