/*
 * Copyright © 2026 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#ifndef V3DV_DEVICE_H
#define V3DV_DEVICE_H

#include "v3dv_common.h"
#include "v3dv_bo.h"
#include "v3dv_limits.h"
#include "v3dv_pipeline.h"
#include "vk_device.h"
#include "vk_device_memory.h"
#include "vk_instance.h"
#include "vk_physical_device.h"
#include "vk_queue.h"
#include "vk_sync.h"
#include "vk_sync_timeline.h"
#include "common/v3d_device_info.h"
#include "wsi_common.h"
#include "util/sparse_array.h"
#include "util/xmlconfig.h"

struct v3dv_event;
struct v3dv_format;
struct v3dv_format_plane;
struct v3dv_image;
struct v3dv_image_view;
struct v3dv_job;
struct v3d_compiler;
struct v3d_perfcntrs;
struct v3d_simulator_file;

struct v3dv_physical_device {
   struct vk_physical_device vk;

   char *name;

   /* primary node (cardN) of the render device */
   int32_t primary_fd;
   /* render node (renderN) of the render device */
   int32_t render_fd;
   /* primary node (cardN) of the display device, if available */
   int32_t display_fd;

   /* We need these because it is not clear how to detect
    * valid devids in a portable way
     */
   bool has_primary;
   bool has_render;

   dev_t primary_devid;
   dev_t render_devid;

   uint8_t driver_build_sha1[BLAKE3_KEY_LEN];
   uint8_t pipeline_cache_uuid[VK_UUID_SIZE];
   uint8_t device_uuid[VK_UUID_SIZE];
   uint8_t driver_uuid[VK_UUID_SIZE];

   struct vk_sync_type drm_syncobj_type;
   struct vk_sync_timeline_type sync_timeline_type;
   const struct vk_sync_type *sync_types[3];

   struct disk_cache *disk_cache;

   mtx_t mutex;

   struct wsi_device wsi_device;

   VkPhysicalDeviceMemoryProperties memory;

   struct v3d_device_info devinfo;
   struct v3d_perfcntrs *perfcntr;

#if USE_V3D_SIMULATOR
   struct v3d_simulator_file *sim_file;
#endif

   const struct v3d_compiler *compiler;
   uint32_t next_program_id;

   alignas(8) uint64_t heap_used;

   /* This array holds all our 'struct v3dv_bo' allocations. We use this
    * so we can add a refcount to our BOs and check if a particular BO
    * was already allocated in this device using its GEM handle. This is
    * necessary to properly manage BO imports, because the kernel doesn't
    * refcount the underlying BO memory.
    *
    * Specifically, when self-importing (i.e. importing a BO into the same
    * device that created it), the kernel will give us the same BO handle
    * for both BOs and we must only free it once when  both references are
    * freed. Otherwise, if we are not self-importing, we get two different BO
    * handles, and we want to free each one individually.
    *
    * The BOs in this map all have a refcnt with the reference counter and
    * only self-imported BOs will ever have a refcnt > 1.
    */
   struct util_sparse_array bo_map;

   struct {
      bool merge_jobs;
   } options;

   struct {
      bool cpu_queue;
      bool multisync;
      bool perfmon;
   } caps;
};

static inline struct v3dv_bo *
v3dv_device_lookup_bo(struct v3dv_physical_device *device, uint32_t handle)
{
   return (struct v3dv_bo *) util_sparse_array_get(&device->bo_map, handle);
}

VkResult v3dv_wsi_init(struct v3dv_physical_device *physical_device);
void v3dv_wsi_finish(struct v3dv_physical_device *physical_device);

void v3dv_meta_clear_init(struct v3dv_device *device);
void v3dv_meta_clear_finish(struct v3dv_device *device);

void v3dv_meta_blit_init(struct v3dv_device *device);
void v3dv_meta_blit_finish(struct v3dv_device *device);

void v3dv_meta_texel_buffer_copy_init(struct v3dv_device *device);
void v3dv_meta_texel_buffer_copy_finish(struct v3dv_device *device);

bool v3dv_meta_can_use_tlb(struct v3dv_image *image,
                           uint8_t plane,
                           uint8_t miplevel,
                           const VkOffset3D *offset,
                           const VkExtent3D *extent,
                           VkFormat *compat_format);

struct v3dv_instance {
   struct vk_instance vk;

   struct driOptionCache dri_options;
   struct driOptionCache available_dri_options;

   bool pipeline_cache_enabled;
   bool default_pipeline_cache_enabled;
   bool meta_cache_enabled;
};

/* FIXME: In addition to tracking the last job submitted by GPU queue (cl, csd,
 * tfu), we still need a syncobj to track the last overall job submitted
 * (V3DV_QUEUE_ANY) for the case we don't support multisync. Someday we can
 * start expecting multisync to be present and drop the legacy implementation
 * together with this V3DV_QUEUE_ANY tracker.
 */
enum v3dv_queue_type {
   V3DV_QUEUE_CL = 0,
   V3DV_QUEUE_CSD,
   V3DV_QUEUE_TFU,
   V3DV_QUEUE_CPU,
   V3DV_QUEUE_ANY,
   V3DV_QUEUE_COUNT,
};

/* For each GPU queue, we use a syncobj to track the last job submitted. We
 * set the flag `first` to determine when we are starting a new cmd buffer
 * batch and therefore a job submitted to a given queue will be the first in a
 * cmd buf batch.
 */
struct v3dv_last_job_sync {
   /* If the job is the first submitted to a GPU queue in a cmd buffer batch.
    *
    * We use V3DV_QUEUE_{CL,CSD,TFU} both with and without multisync.
    */
   bool first[V3DV_QUEUE_COUNT];
   /* Array of syncobj to track the last job submitted to a GPU queue.
    *
    * With multisync we use V3DV_QUEUE_{CL,CSD,TFU} to track syncobjs for each
    * queue, but without multisync we only track the last job submitted to any
    * queue in V3DV_QUEUE_ANY.
    */
   uint32_t syncs[V3DV_QUEUE_COUNT];
};

struct v3dv_queue {
   struct vk_queue vk;

   struct v3dv_device *device;

   struct v3dv_last_job_sync last_job_syncs;

   struct v3dv_job *noop_job;

   /* The last active perfmon ID to prevent mixing of counter results when a
    * job is submitted with a different perfmon id.
    */
   uint32_t last_perfmon_id;
};

VkResult v3dv_queue_driver_submit(struct vk_queue *vk_queue,
                                  struct vk_queue_submit *submit);

#define V3DV_META_BLIT_CACHE_KEY_SIZE              (4 * sizeof(uint32_t))
#define V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE (3 * sizeof(uint32_t) + \
                                                    sizeof(VkComponentMapping))

struct v3dv_meta_color_clear_pipeline {
   VkPipeline pipeline;
   VkRenderPass pass;
   bool cached;
   uint64_t key;
};

struct v3dv_meta_depth_clear_pipeline {
   VkPipeline pipeline;
   uint64_t key;
};

struct v3dv_meta_blit_pipeline {
   VkPipeline pipeline;
   VkRenderPass pass;
   VkRenderPass pass_no_load;
   uint8_t key[V3DV_META_BLIT_CACHE_KEY_SIZE];
};

struct v3dv_meta_texel_buffer_copy_pipeline {
   VkPipeline pipeline;
   VkRenderPass pass;
   VkRenderPass pass_no_load;
   uint8_t key[V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE];
};

struct v3dv_device {
   struct vk_device vk;

   struct v3dv_instance *instance;
   struct v3dv_physical_device *pdevice;

   struct v3d_device_info devinfo;
   struct v3dv_queue queue;

   /* Guards query->maybe_available and value for timestamps */
   mtx_t query_mutex;

   /* Signaled whenever a query is ended */
   cnd_t query_ended;

   /* Resources used for meta operations */
   struct {
      mtx_t mtx;
      struct {
         VkPipelineLayout p_layout;
         struct hash_table *cache; /* v3dv_meta_color_clear_pipeline */
      } color_clear;
      struct {
         VkPipelineLayout p_layout;
         struct hash_table *cache; /* v3dv_meta_depth_clear_pipeline */
      } depth_clear;
      struct {
         VkDescriptorSetLayout ds_layout;
         VkPipelineLayout p_layout;
         struct hash_table *cache[3]; /* v3dv_meta_blit_pipeline for 1d, 2d, 3d */
      } blit;
      struct {
         VkDescriptorSetLayout ds_layout;
         VkPipelineLayout p_layout;
         struct hash_table *cache[3]; /* v3dv_meta_texel_buffer_copy_pipeline for 1d, 2d, 3d */
      } texel_buffer_copy;
   } meta;

   struct v3dv_bo_cache {
      /** List of struct v3d_bo freed, by age. */
      struct list_head time_list;
      /** List of struct v3d_bo freed, per size, by age. */
      struct list_head *size_list;
      uint32_t size_list_size;

      mtx_t lock;

      uint32_t cache_size;
      uint32_t cache_count;
      uint32_t max_cache_size;
   } bo_cache;

   uint32_t bo_size;
   uint32_t bo_count;

   /* Event handling resources.
    *
    * Our implementation of events uses a BO to store event state (signaled vs
    * reset) and dispatches compute shaders to handle GPU event functions
    * (signal, reset, wait). This struct holds all the resources required
    * by the implementation.
    */
   struct {
      mtx_t lock;

      /* BO for the event states: signaled (1) or reset (0) */
      struct v3dv_bo *bo;

      /* We pre-allocate all the events we can fit for the size of the BO we
       * create to track their states, where each event has an index which is
       * basically the offset of its state in that BO. We keep a free list with
       * the pre-allocated events that are available.
       */
      uint32_t event_count;
      struct v3dv_event *events;
      struct list_head free_list;

      /* Vulkan resources to access the event BO from shaders. We have a
       * pipeline that sets the state of an event and another that waits on
       * a single event. Both pipelines require access to the event state BO,
       * for which we need to allocate a single descripot set.
       */
      VkBuffer buffer;
      VkDeviceMemory mem;
      VkDescriptorSetLayout descriptor_set_layout;
      VkPipelineLayout pipeline_layout;
      VkDescriptorPool descriptor_pool;
      VkDescriptorSet descriptor_set;
      VkPipeline set_event_pipeline;
      VkPipeline wait_event_pipeline;
   } events;

   /* Query handling resources.
    *
    * Our implementation of occlusion queries uses a BO per pool to keep track
    * of the per-query availability state and dispatches compute shaders to
    * handle GPU query functions that read and write that state. This struct
    * holds Vulkan resources that can be shared across all query pools to
    * implement this. This framework may be extended in the future to handle
    * more query types.
    */
   struct {
      VkDescriptorSetLayout buf_descriptor_set_layout;

      /* Set query availability */
      VkPipelineLayout avail_pipeline_layout;
      VkPipeline avail_pipeline;

      /* Reset query availability and clear occlusion counters */
      VkPipelineLayout reset_occlusion_pipeline_layout;
      VkPipeline reset_occlusion_pipeline;

      /* Copy query results */
      VkPipelineLayout copy_pipeline_layout;
      VkPipeline copy_pipeline[8];
   } queries;

   struct v3dv_pipeline_cache default_pipeline_cache;

   /* GL_SHADER_STATE_RECORD needs to specify default attribute values. The
    * following covers the most common case, that is all attributes format
    * being float being float, allowing us to reuse the same BO for all
    * pipelines matching this requirement. Pipelines that need integer
    * attributes will create their own BO.
    *
    * Note that since v71 the default attribute values are not needed, so this
    * can be NULL.
    */
   struct v3dv_bo *default_attribute_float;

   void *device_address_mem_ctx;
   struct util_dynarray device_address_bo_list; /* Array of struct v3dv_bo * */
};

struct v3dv_device_memory {
   struct vk_device_memory vk;

   struct v3dv_bo *bo;
   const VkMemoryType *type;
   bool is_for_wsi;
   bool is_for_device_address;
};

uint32_t v3dv_physical_device_vendor_id(const struct v3dv_physical_device *dev);
uint32_t v3dv_physical_device_device_id(const struct v3dv_physical_device *dev);

static inline bool
v3dv_texture_shader_state_has_rb_swap_reverse_bits(const struct v3dv_device *device)
{
   return device->devinfo.ver > 71 ||
          (device->devinfo.ver == 71 && device->devinfo.rev >= 5);
}

VK_DEFINE_HANDLE_CASTS(v3dv_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
VK_DEFINE_HANDLE_CASTS(v3dv_instance, vk.base, VkInstance,
                       VK_OBJECT_TYPE_INSTANCE)
VK_DEFINE_HANDLE_CASTS(v3dv_physical_device, vk.base, VkPhysicalDevice,
                       VK_OBJECT_TYPE_PHYSICAL_DEVICE)
VK_DEFINE_HANDLE_CASTS(v3dv_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)
VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_device_memory, vk.base, VkDeviceMemory,
                               VK_OBJECT_TYPE_DEVICE_MEMORY)

#endif /* V3DV_DEVICE_H */