@@ -324,7 +324,13 @@ simulate_gpu_task(struct rte_gpu_comm_list *comm_list_item, int num_pkts)
* consume(comm_list_item->pkt_list[idx].addr);
*/
}
- comm_list_item->status = RTE_GPU_COMM_LIST_DONE;
+ /*
+ * A real GPU workload can't call rte_gpu_comm_set_status() directly,
+ * because it is a CPU-only function. A real GPU workload should instead
+ * implement the equivalent of rte_gpu_comm_set_status() in GPU-specific code.
+ */
+ rte_gpu_comm_set_status(comm_list_item, RTE_GPU_COMM_LIST_DONE);
return 0;
}
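For reference, the GPU-side equivalent reduces to a fenced store through the device-visible status pointer. A minimal CUDA sketch of such a helper; the name gpu_comm_set_status is hypothetical and not part of this patch:

    #include <rte_gpudev.h>

    /* Hypothetical device-side counterpart of rte_gpu_comm_set_status(). */
    __device__ static inline void
    gpu_comm_set_status(struct rte_gpu_comm_list *comm_list_item,
            enum rte_gpu_comm_list_status status)
    {
        /* Make prior packet writes visible to the CPU, then flip the
         * flag through the device-visible pointer. */
        __threadfence_system();
        *(volatile enum rte_gpu_comm_list_status *)comm_list_item->status_d =
                status;
    }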
@@ -216,7 +216,7 @@ about how to use functions in this library in case of a CUDA application.
/* GPU kernel keeps checking this flag to know if it has to quit or wait for more packets. */
while (*quit_flag_ptr == 0) {
- if (comm_list[comm_list_index]->status != RTE_GPU_COMM_LIST_READY)
+ if (comm_list[comm_list_index]->status_d[0] != RTE_GPU_COMM_LIST_READY)
continue;
if (threadIdx.x < comm_list[comm_list_index]->num_pkts)
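For context, a persistent CUDA kernel built on this pattern could run the whole handshake as sketched below; the kernel name, the num_items parameter, and the processing body are illustrative and not part of this patch:

    #include <rte_gpudev.h>

    __global__ void
    persistent_kernel(struct rte_gpu_comm_list *comm_list, uint32_t num_items,
            uint32_t *quit_flag_ptr)
    {
        uint32_t idx = 0;

        while (*quit_flag_ptr == 0) {
            /* Spin on the device-visible flag; production code would
             * read it through a volatile cast to prevent caching. */
            if (comm_list[idx].status_d[0] != RTE_GPU_COMM_LIST_READY)
                continue;

            if (threadIdx.x < comm_list[idx].num_pkts) {
                /* ... process comm_list[idx].pkt_list[threadIdx.x].addr ... */
            }
            __syncthreads();

            /* GPU-side equivalent of rte_gpu_comm_set_status(). */
            if (threadIdx.x == 0) {
                __threadfence_system();
                comm_list[idx].status_d[0] = RTE_GPU_COMM_LIST_DONE;
            }
            __syncthreads();
            idx = (idx + 1) % num_items;
        }
    }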
@@ -10,6 +10,7 @@
#include <rte_malloc.h>
#include <rte_errno.h>
#include <rte_log.h>
+#include <rte_eal_paging.h>
#include "rte_gpudev.h"
#include "gpudev_driver.h"
@@ -847,6 +848,46 @@ rte_gpu_comm_create_list(uint16_t dev_id,
return NULL;
}
+ /*
+ * Use the GPU memory CPU map feature, if enabled in the driver,
+ * to allocate the status flags of the list in GPU memory.
+ * Allocating these flags in GPU memory reduces the latency
+ * when the GPU workload polls them.
+ */
+ comm_list[0].status_d = rte_gpu_mem_alloc(dev_id,
+ sizeof(enum rte_gpu_comm_list_status) * num_comm_items,
+ rte_mem_page_size());
+ if (comm_list[0].status_d == NULL) {
+ rte_errno = ENOMEM;
+ return NULL;
+ }
+
+ comm_list[0].status_h = rte_gpu_mem_cpu_map(dev_id,
+ sizeof(enum rte_gpu_comm_list_status) * num_comm_items,
+ comm_list[0].status_d);
+ if (comm_list[0].status_h == NULL) {
+ /*
+ * If CPU mapping is not supported by driver
+ * use regular CPU registered memory.
+ */
+ comm_list[0].status_h = rte_zmalloc(NULL,
+ sizeof(enum rte_gpu_comm_list_status) * num_comm_items, 0);
+ if (comm_list[0].status_h == NULL) {
+ rte_errno = ENOMEM;
+ return NULL;
+ }
+
+ ret = rte_gpu_mem_register(dev_id,
+ sizeof(enum rte_gpu_comm_list_status) * num_comm_items,
+ comm_list[0].status_h);
+ if (ret < 0) {
+ rte_errno = ENOMEM;
+ return NULL;
+ }
+
+ comm_list[0].status_d = comm_list[0].status_h;
+ }
+
for (idx_l = 0; idx_l < num_comm_items; idx_l++) {
comm_list[idx_l].pkt_list = rte_zmalloc(NULL,
sizeof(struct rte_gpu_comm_pkt) * RTE_GPU_COMM_LIST_PKTS_MAX, 0);
@@ -862,8 +903,7 @@ rte_gpu_comm_create_list(uint16_t dev_id,
rte_errno = ENOMEM;
return NULL;
}

- RTE_GPU_VOLATILE(comm_list[idx_l].status) = RTE_GPU_COMM_LIST_FREE;
comm_list[idx_l].num_pkts = 0;
comm_list[idx_l].dev_id = dev_id;
@@ -873,6 +913,17 @@ rte_gpu_comm_create_list(uint16_t dev_id,
rte_errno = ENOMEM;
return NULL;
}
+
+ if (idx_l > 0) {
+ comm_list[idx_l].status_h = &(comm_list[0].status_h[idx_l]);
+ comm_list[idx_l].status_d = &(comm_list[0].status_d[idx_l]);
+ }
+
+ /*
+ * Initialize the status flag of every item, including item 0,
+ * whose pointers already reference the base of the flag arrays.
+ */
+ ret = rte_gpu_comm_set_status(&comm_list[idx_l], RTE_GPU_COMM_LIST_FREE);
+ if (ret < 0) {
+ rte_errno = ENOMEM;
+ return NULL;
+ }
}
return comm_list;
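From the application's point of view the new allocation strategy is transparent: the list is still created and destroyed through the same two calls. A minimal sketch, assuming dev_id is a valid GPU ID; NUM_COMM_ITEMS and clist are illustrative names:

    #define NUM_COMM_ITEMS 16

    struct rte_gpu_comm_list *clist;

    clist = rte_gpu_comm_create_list(dev_id, NUM_COMM_ITEMS);
    if (clist == NULL)
        rte_exit(EXIT_FAILURE, "comm list creation failed, rte_errno %d\n",
                rte_errno);

    /* ... populate items, poll their status, clean them up ... */

    rte_gpu_comm_destroy_list(clist, NUM_COMM_ITEMS);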
@@ -910,6 +961,14 @@ rte_gpu_comm_destroy_list(struct rte_gpu_comm_list *comm_list,
return -1;
}
+ ret = rte_gpu_mem_cpu_unmap(dev_id, comm_list[0].status_d);
+ if (ret == 0) {
+ rte_gpu_mem_free(dev_id, comm_list[0].status_d);
+ } else {
+ rte_gpu_mem_unregister(dev_id, comm_list[0].status_h);
+ rte_free(comm_list[0].status_h);
+ }
+
rte_free(comm_list);
return 0;
@@ -920,6 +979,7 @@ rte_gpu_comm_populate_list_pkts(struct rte_gpu_comm_list *comm_list_item,
struct rte_mbuf **mbufs, uint32_t num_mbufs)
{
uint32_t idx;
+ int ret;
if (comm_list_item == NULL || comm_list_item->pkt_list == NULL ||
mbufs == NULL || num_mbufs > RTE_GPU_COMM_LIST_PKTS_MAX) {
@@ -943,7 +1003,39 @@ rte_gpu_comm_populate_list_pkts(struct rte_gpu_comm_list *comm_list_item,
RTE_GPU_VOLATILE(comm_list_item->num_pkts) = num_mbufs;
rte_gpu_wmb(comm_list_item->dev_id);
- RTE_GPU_VOLATILE(comm_list_item->status) = RTE_GPU_COMM_LIST_READY;
+ ret = rte_gpu_comm_set_status(comm_list_item, RTE_GPU_COMM_LIST_READY);
+ if (ret < 0) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+
+ return 0;
+}
+
+int
+rte_gpu_comm_set_status(struct rte_gpu_comm_list *comm_list_item,
+ enum rte_gpu_comm_list_status status)
+{
+ if (comm_list_item == NULL) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+
+ RTE_GPU_VOLATILE(comm_list_item->status_h[0]) = status;
+
+ return 0;
+}
+
+int
+rte_gpu_comm_get_status(struct rte_gpu_comm_list *comm_list_item,
+ enum rte_gpu_comm_list_status *status)
+{
+ if (comm_list_item == NULL || status == NULL) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+
+ *status = RTE_GPU_VOLATILE(comm_list_item->status_h[0]);
return 0;
}
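Together with rte_gpu_comm_populate_list_pkts(), these helpers give the CPU side the full handshake without touching the status pointers directly. A minimal sketch of the round trip, assuming a list from rte_gpu_comm_create_list(), a received burst in mbufs/nb_rx, and a GPU workload that eventually flips the flag to DONE; idx is an illustrative index:

    enum rte_gpu_comm_list_status status;

    /* Hand a burst of mbufs to the GPU: the flag goes to READY. */
    if (rte_gpu_comm_populate_list_pkts(&comm_list[idx], mbufs, nb_rx) < 0)
        return -rte_errno;

    /* Wait for the GPU workload to flip the flag to DONE. */
    do {
        if (rte_gpu_comm_get_status(&comm_list[idx], &status) < 0)
            return -rte_errno;
    } while (status != RTE_GPU_COMM_LIST_DONE);

    /* Recycle the item: the flag goes back to FREE. */
    rte_gpu_comm_cleanup_list(&comm_list[idx]);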
@@ -952,14 +1044,21 @@ int
rte_gpu_comm_cleanup_list(struct rte_gpu_comm_list *comm_list_item)
{
uint32_t idx = 0;
+ enum rte_gpu_comm_list_status status;
+ int ret;
if (comm_list_item == NULL) {
rte_errno = EINVAL;
return -rte_errno;
}
- if (RTE_GPU_VOLATILE(comm_list_item->status) ==
- RTE_GPU_COMM_LIST_READY) {
+ ret = rte_gpu_comm_get_status(comm_list_item, &status);
+ if (ret < 0) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+
+ if (status == RTE_GPU_COMM_LIST_READY) {
GPU_LOG(ERR, "packet list is still in progress");
rte_errno = EINVAL;
return -rte_errno;
@@ -974,9 +1073,14 @@ rte_gpu_comm_cleanup_list(struct rte_gpu_comm_list *comm_list_item)
comm_list_item->mbufs[idx] = NULL;
}
- RTE_GPU_VOLATILE(comm_list_item->status) = RTE_GPU_COMM_LIST_FREE;
+ ret = rte_gpu_comm_set_status(comm_list_item, RTE_GPU_COMM_LIST_FREE);
+ if (ret < 0) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
RTE_GPU_VOLATILE(comm_list_item->num_pkts) = 0;
rte_mb();
return 0;
}
@@ -124,8 +124,10 @@ struct rte_gpu_comm_list {
struct rte_gpu_comm_pkt *pkt_list;
/** Number of packets in the list. */
uint32_t num_pkts;
- /** Status of the list. */
- enum rte_gpu_comm_list_status status;
+ /** Status of the list. CPU pointer. */
+ enum rte_gpu_comm_list_status *status_h;
+ /** Status of the list. GPU pointer. */
+ enum rte_gpu_comm_list_status *status_d;
};
/**
@@ -489,7 +491,7 @@ void *rte_gpu_mem_cpu_map(int16_t dev_id, size_t size, void *ptr);
* @param dev_id
* Reference device ID.
* @param ptr
- * Pointer to the GPU memory area to be unmapped.
+ * Pointer to the memory area to be unmapped.
* NULL is a no-op accepted value.
*
* @return
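To make the pointer contract concrete: rte_gpu_mem_cpu_map() takes the GPU pointer and returns the CPU view, and rte_gpu_comm_destroy_list() above passes that same GPU pointer back to rte_gpu_mem_cpu_unmap(). A minimal sketch of the full cycle, assuming dev_id names a device whose driver implements the map callbacks; variable names are illustrative:

    size_t len = rte_mem_page_size();
    void *gpu_ptr, *cpu_ptr;

    gpu_ptr = rte_gpu_mem_alloc(dev_id, len, rte_mem_page_size());
    if (gpu_ptr == NULL)
        return -rte_errno;

    cpu_ptr = rte_gpu_mem_cpu_map(dev_id, len, gpu_ptr);
    if (cpu_ptr != NULL) {
        /* The CPU can now access GPU memory directly through cpu_ptr. */
        ((volatile uint32_t *)cpu_ptr)[0] = 1;
        rte_gpu_mem_cpu_unmap(dev_id, gpu_ptr);
    }

    rte_gpu_mem_free(dev_id, gpu_ptr);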
@@ -679,6 +681,46 @@ __rte_experimental
int rte_gpu_comm_populate_list_pkts(struct rte_gpu_comm_list *comm_list_item,
struct rte_mbuf **mbufs, uint32_t num_mbufs);
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Set status flag value of a communication list item.
+ *
+ * @param comm_list_item
+ * Communication list item to update.
+ * @param status
+ * Status value to set.
+ *
+ * @return
+ * 0 on success, -rte_errno otherwise:
+ * - EINVAL if invalid input params
+ */
+__rte_experimental
+int rte_gpu_comm_set_status(struct rte_gpu_comm_list *comm_list_item,
+ enum rte_gpu_comm_list_status status);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Get status flag value of a communication list item.
+ *
+ * @param comm_list_item
+ * Communication list item to query.
+ * Input parameter.
+ * @param status
+ * Communication list item status flag value.
+ * Output parameter.
+ *
+ * @return
+ * 0 on success, -rte_errno otherwise:
+ * - EINVAL if invalid input params
+ */
+__rte_experimental
+int rte_gpu_comm_get_status(struct rte_gpu_comm_list *comm_list_item,
+ enum rte_gpu_comm_list_status *status);
+
/**
* @warning
* @b EXPERIMENTAL: this API may change without prior notice.
@@ -701,3 +743,4 @@ int rte_gpu_comm_cleanup_list(struct rte_gpu_comm_list *comm_list_item);
#endif
#endif /* RTE_GPUDEV_H */
@@ -12,8 +12,10 @@ EXPERIMENTAL {
rte_gpu_comm_destroy_flag;
rte_gpu_comm_destroy_list;
rte_gpu_comm_get_flag_value;
+ rte_gpu_comm_get_status;
rte_gpu_comm_populate_list_pkts;
rte_gpu_comm_set_flag;
+ rte_gpu_comm_set_status;
rte_gpu_count_avail;
rte_gpu_find_next;
rte_gpu_info_get;