From 34494f6c5b10ee8d4e3eeb78482cf4fd6b1669f6 Mon Sep 17 00:00:00 2001
From: Yogesh Mohan Marimuthu <yogesh.mohanmarimuthu@amd.com>
Date: Tue, 14 Jun 2022 18:48:53 +0530
Subject: [PATCH] amdgpu: add amdgpu_cs_submit_gang api

The amdgpu_cs_submit_gang API can be used to submit IBs
from different HW IPs as a single entity.

Signed-off-by: Yogesh Mohan Marimuthu <yogesh.mohanmarimuthu@amd.com>
Reviewed-by: Vitaly Prosyak <vitaly.prosyak@amd.com>
---
 amdgpu/amdgpu-symbols.txt |  1 +
 amdgpu/amdgpu.h           | 75 +++++++++++++++++++++++++++++++++++++++
 amdgpu/amdgpu_cs.c        | 45 ++++++++++++++++++-----
 3 files changed, 113 insertions(+), 8 deletions(-)

diff --git a/amdgpu/amdgpu-symbols.txt b/amdgpu/amdgpu-symbols.txt
index 530b343b..9b95badd 100644
--- a/amdgpu/amdgpu-symbols.txt
+++ b/amdgpu/amdgpu-symbols.txt
@@ -77,3 +77,4 @@ amdgpu_va_get_start_addr
 amdgpu_va_range_query
 amdgpu_vm_reserve_vmid
 amdgpu_vm_unreserve_vmid
+amdgpu_cs_submit_gang
\ No newline at end of file
diff --git a/amdgpu/amdgpu.h b/amdgpu/amdgpu.h
index 9bdbf366..50c23ef8 100644
--- a/amdgpu/amdgpu.h
+++ b/amdgpu/amdgpu.h
@@ -306,6 +306,36 @@ struct amdgpu_cs_ib_info {
 	uint32_t size;
 };
 
+/**
+ * Structure describing a gang IB, used for submitting IBs to multiple HW IPs.
+ * amdgpu_cs_submit_one() also reads plain amdgpu_cs_ib_info entries through a
+ * pointer to this type (flags, ib_mc_address, size): keep both layouts in sync.
+ * \sa amdgpu_cs_request, amdgpu_cs_submit_gang()
+*/
+struct amdgpu_cs_ib_info_gang {
+	/** Special flags */
+	uint64_t flags;
+
+	/** Virtual MC address of the command buffer */
+	uint64_t ib_mc_address;
+
+	/**
+	 * Size of Command Buffer to be submitted.
+	 *   - The size is in units of dwords (4 bytes).
+	 *   - Could be 0
+	 */
+	uint32_t size;
+
+	/** HW IP type this IB is submitted to */
+	uint32_t ip_type;
+
+	/** IP instance index if there are several IPs of the same type. */
+	uint32_t ip_instance;
+
+	/** Ring index of the HW IP */
+	uint32_t ring;
+};
+
 /**
  * Structure describing fence information
  *
@@ -377,6 +407,12 @@ struct amdgpu_cs_request {
 	 * The fence information
 	 */
 	struct amdgpu_cs_fence_info fence_info;
+
+	/**
+	 * Use *ibs_gang below instead of *ibs for gang submission. Gang submission
+	 * allows IBs from different HW IPs to be submitted as a single entity.
+	 */
+	struct amdgpu_cs_ib_info_gang *ibs_gang;
 };
 
 /**
@@ -1031,6 +1067,45 @@ int amdgpu_cs_submit(amdgpu_context_handle context,
 		     struct amdgpu_cs_request *ibs_request,
 		     uint32_t number_of_requests);
 
+/**
+ * Send request to submit a gang of command buffers (IBs for different HW IPs).
+ *
+ * The kernel driver may use the GPU scheduler to decide when to physically
+ * send this request to the hardware. Accordingly, this request could be put
+ * in a queue and sent for execution later. The only guarantee is that requests
+ * from the same GPU context will be executed in order.
+ *
+ * The caller can specify the user fence buffer/location with the fence_info in the
+ * cs_request. The sequence number is returned via the 'seq_no' parameter
+ * in the ibs_request structure.
+ *
+ *
+ * \param   context            - \c [in]  GPU Context
+ * \param   flags              - \c [in]  Global submission flags
+ * \param   ibs_request        - \c [in/out] Pointer to submission requests.
+ *					  Each request's IBs are read from
+ *					  ibs_gang (not ibs), so one request
+ *					  may target several engines/rings
+ *					  simultaneously as an 'atomic'
+ *					  operation
+ * \param   number_of_requests - \c [in]  Number of submission requests
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \note It is required to pass a correct resource list with the buffer handles
+ *	 that will be accessed by the submitted command buffers.
+ *	 This allows the kernel driver to correctly implement "paging".
+ *	 Failure to do so will have unpredictable results.
+ *
+ * \sa amdgpu_cs_query_fence_status()
+ *
+*/
+int amdgpu_cs_submit_gang(amdgpu_context_handle context,
+		     uint64_t flags,
+		     struct amdgpu_cs_request *ibs_request,
+		     uint32_t number_of_requests);
+
 /**
  *  Query status of Command Buffer Submission
  *
diff --git a/amdgpu/amdgpu_cs.c b/amdgpu/amdgpu_cs.c
index 49fc16c3..13b7e0ec 100644
--- a/amdgpu/amdgpu_cs.c
+++ b/amdgpu/amdgpu_cs.c
@@ -246,13 +246,13 @@ drm_public int amdgpu_cs_query_reset_state2(amdgpu_context_handle context,
  * \param   dev - \c [in]  Device handle
  * \param   context - \c [in]  GPU Context
  * \param   ibs_request - \c [in]  Pointer to submission requests
- * \param   fence - \c [out] return fence for this submission
+ * \param   gang  - \c [in] if non-zero, read IBs (and their HW IP/ring) from ibs_gang
  *
  * \return  0 on success otherwise POSIX Error code
  * \sa amdgpu_cs_submit()
 */
 static int amdgpu_cs_submit_one(amdgpu_context_handle context,
-				struct amdgpu_cs_request *ibs_request)
+                                struct amdgpu_cs_request *ibs_request, int gang)
 {
 	struct drm_amdgpu_cs_chunk *chunks;
 	struct drm_amdgpu_cs_chunk_data *chunk_data;
@@ -289,19 +289,26 @@ static int amdgpu_cs_submit_one(amdgpu_context_handle context,
 	num_chunks = ibs_request->number_of_ibs;
 	/* IB chunks */
 	for (i = 0; i < ibs_request->number_of_ibs; i++) {
-		struct amdgpu_cs_ib_info *ib;
+		struct amdgpu_cs_ib_info_gang *ib;
 		chunks[i].chunk_id = AMDGPU_CHUNK_ID_IB;
 		chunks[i].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
 		chunks[i].chunk_data = (uint64_t)(uintptr_t)&chunk_data[i];
 
-		ib = &ibs_request->ibs[i];
+		if (gang) {
+			ib = &ibs_request->ibs_gang[i];
+			chunk_data[i].ib_data.ip_type = ib->ip_type;
+			chunk_data[i].ib_data.ip_instance = ib->ip_instance;
+			chunk_data[i].ib_data.ring = ib->ring;
+		} else {
+			ib = (struct amdgpu_cs_ib_info_gang*)&ibs_request->ibs[i];
+			chunk_data[i].ib_data.ip_type = ibs_request->ip_type;
+			chunk_data[i].ib_data.ip_instance = ibs_request->ip_instance;
+			chunk_data[i].ib_data.ring = ibs_request->ring;
+		}
 
 		chunk_data[i].ib_data._pad = 0;
 		chunk_data[i].ib_data.va_start = ib->ib_mc_address;
 		chunk_data[i].ib_data.ib_bytes = ib->size * 4;
-		chunk_data[i].ib_data.ip_type = ibs_request->ip_type;
-		chunk_data[i].ib_data.ip_instance = ibs_request->ip_instance;
-		chunk_data[i].ib_data.ring = ibs_request->ring;
 		chunk_data[i].ib_data.flags = ib->flags;
 	}
 
@@ -405,7 +412,29 @@ drm_public int amdgpu_cs_submit(amdgpu_context_handle context,
 
 	r = 0;
 	for (i = 0; i < number_of_requests; i++) {
-		r = amdgpu_cs_submit_one(context, ibs_request);
+		r = amdgpu_cs_submit_one(context, ibs_request, false);
+		if (r)
+			break;
+		ibs_request++;
+	}
+
+	return r;
+}
+
+drm_public int amdgpu_cs_submit_gang(amdgpu_context_handle context,
+				uint64_t flags,
+				struct amdgpu_cs_request *ibs_request,
+				uint32_t number_of_requests)
+{
+	uint32_t i;
+	int r;
+
+	if (!context || !ibs_request)
+		return -EINVAL;
+
+	r = 0;
+	for (i = 0; i < number_of_requests; i++) {
+		r = amdgpu_cs_submit_one(context, ibs_request, true);
 		if (r)
 			break;
 		ibs_request++;