[dpdk-dev] [RFC 6/6] virtio1.1: introduce the DESC_WB flag

Tiwei Bie tiwei.bie at intel.com
Tue Jul 18 17:40:21 CEST 2017


This patch introduces the DESC_WB flag, and adopts it in the virtio
Tx path. This flag can help to reduce the memory write operations on
the descriptor ring from the backend. So it could help to reduce the
PCIe transactions in a hardware backend case.

Normally the DESC_WB like mechanism is used in a head/tail design.
But in this prototype, head/tail isn't used. So backend still needs
to judge whether a desc is available by checking the DESC_HW flag.
Then some tricks are needed by the frontend when setting the DESC_WB
flags. Otherwise backend may see some descriptors' DESC_HW are set,
but actually they are not available to the backend.

Signed-off-by: Tiwei Bie <tiwei.bie at intel.com>
---
 drivers/net/virtio/virtio-1.1.h      |  1 +
 drivers/net/virtio/virtio_rxtx.c     | 14 +++++++++
 drivers/net/virtio/virtio_rxtx_1.1.c | 55 +++++++++++++++++++++++++-----------
 drivers/net/virtio/virtqueue.h       |  2 ++
 lib/librte_vhost/virtio-1.1.h        |  1 +
 lib/librte_vhost/virtio_net.c        | 37 +++++++++++++++---------
 6 files changed, 79 insertions(+), 31 deletions(-)

diff --git a/drivers/net/virtio/virtio-1.1.h b/drivers/net/virtio/virtio-1.1.h
index 48cbb18..bd76761 100644
--- a/drivers/net/virtio/virtio-1.1.h
+++ b/drivers/net/virtio/virtio-1.1.h
@@ -8,6 +8,7 @@
 #define BATCH_NOT_FIRST 0x0010
 #define BATCH_NOT_LAST  0x0020
 #define DESC_HW		0x0080
+#define DESC_WB		0x0100
 
 struct vring_desc_1_1 {
         uint64_t addr;
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index 7be9aae..0248638 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -153,6 +153,10 @@ virtqueue_dequeue_burst_rx(struct virtqueue *vq, struct rte_mbuf **rx_pkts,
 #define DEFAULT_TX_FREE_THRESH 32
 #endif
 
+#ifndef DEFAULT_TX_RS_THRESH
+#define DEFAULT_TX_RS_THRESH 32
+#endif
+
 /* Cleanup from completed transmits. */
 static void
 virtio_xmit_cleanup(struct virtqueue *vq, uint16_t num)
@@ -434,6 +438,7 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
 	struct virtqueue *vq = hw->vqs[vtpci_queue_idx];
 	struct virtnet_tx *txvq;
 	uint16_t tx_free_thresh;
+	uint16_t tx_rs_thresh;
 	uint16_t desc_idx;
 
 	PMD_INIT_FUNC_TRACE();
@@ -451,6 +456,9 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
 	if (tx_free_thresh == 0)
 		tx_free_thresh =
 			RTE_MIN(vq->vq_nentries / 4, DEFAULT_TX_FREE_THRESH);
+	tx_rs_thresh = tx_conf->tx_rs_thresh;
+	if (tx_rs_thresh == 0)
+		tx_rs_thresh = DEFAULT_TX_RS_THRESH;
 
 	if (tx_free_thresh >= (vq->vq_nentries - 3)) {
 		RTE_LOG(ERR, PMD, "tx_free_thresh must be less than the "
@@ -461,7 +469,13 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
 		return -EINVAL;
 	}
 
+	if (!rte_is_power_of_2(tx_free_thresh)) {
+		RTE_LOG(ERR, PMD, "tx_free_thresh is not powerof 2");
+		return -EINVAL;
+	}
+
 	vq->vq_free_thresh = tx_free_thresh;
+	vq->vq_rs_thresh = tx_rs_thresh;
 
 	if (hw->use_simple_rxtx) {
 		uint16_t mid_idx  = vq->vq_nentries >> 1;
diff --git a/drivers/net/virtio/virtio_rxtx_1.1.c b/drivers/net/virtio/virtio_rxtx_1.1.c
index 883a027..38f6a4a 100644
--- a/drivers/net/virtio/virtio_rxtx_1.1.c
+++ b/drivers/net/virtio/virtio_rxtx_1.1.c
@@ -63,21 +63,28 @@
 #include "virtio_rxtx.h"
 
 /* Cleanup from completed transmits. */
-static void
+static inline int
 virtio_xmit_cleanup(struct virtqueue *vq)
 {
-	uint16_t idx;
-	uint16_t size = vq->vq_nentries;
+	uint16_t clean_to, idx;
+	uint16_t mask = vq->vq_nentries - 1;
 	struct vring_desc_1_1 *desc = vq->vq_ring.desc_1_1;
+	uint16_t last_cleaned = vq->vq_used_cons_idx - 1;
 
-	idx = vq->vq_used_cons_idx & (size - 1);
-	while ((desc[idx].flags & DESC_HW) == 0) {
-		idx = (++vq->vq_used_cons_idx) & (size - 1);
-		vq->vq_free_cnt++;
-
-		if (vq->vq_free_cnt >= size)
-			break;
+	clean_to = last_cleaned + vq->vq_rs_thresh;
+	if ((desc[clean_to & mask].flags & DESC_HW) != 0) {
+		PMD_TX_LOG(DEBUG, "TX descriptor %d is not done",
+			clean_to & mask);
+		return -1;
 	}
+
+	for (idx = last_cleaned + 2; idx < clean_to; idx++)
+		desc[idx & mask].flags &= ~DESC_HW;
+
+	vq->vq_used_cons_idx = clean_to + 1;
+	vq->vq_free_cnt += vq->vq_rs_thresh;
+
+	return 0;
 }
 
 uint16_t
@@ -90,6 +97,7 @@ virtio_xmit_pkts_1_1(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts
 	struct vring_desc_1_1 *desc = vq->vq_ring.desc_1_1;
 	uint16_t idx;
 	struct vq_desc_extra *dxp;
+	uint16_t nb_needed, nb_used = vq->vq_nb_used;
 
 	if (unlikely(nb_pkts < 1))
 		return nb_pkts;
@@ -103,19 +111,23 @@ virtio_xmit_pkts_1_1(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts
 		struct rte_mbuf *txm = tx_pkts[i];
 		struct virtio_tx_region *txr = txvq->virtio_net_hdr_mz->addr;
 
-		if (unlikely(txm->nb_segs + 1 > vq->vq_free_cnt)) {
-			virtio_xmit_cleanup(vq);
+		nb_needed = txm->nb_segs + 1;
 
-			if (unlikely(txm->nb_segs + 1 > vq->vq_free_cnt)) {
-				PMD_TX_LOG(ERR,
-					   "No free tx descriptors to transmit");
-				break;
+		if (unlikely(nb_needed > vq->vq_free_cnt)) {
+			if (unlikely(virtio_xmit_cleanup(vq) != 0))
+				goto end_of_tx;
+
+			if (unlikely(nb_needed > vq->vq_rs_thresh)) {
+				while (nb_needed > vq->vq_free_cnt) {
+					if (virtio_xmit_cleanup(vq) != 0)
+						goto end_of_tx;
+				}
 			}
 		}
 
 		txvq->stats.bytes += txm->pkt_len;
 
-		vq->vq_free_cnt -= txm->nb_segs + 1;
+		vq->vq_free_cnt -= nb_needed;
 
 		idx = (vq->vq_avail_idx++) & (vq->vq_nentries - 1);
 		dxp = &vq->vq_descx[idx];
@@ -129,22 +141,31 @@ virtio_xmit_pkts_1_1(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts
 		desc[idx].flags = VRING_DESC_F_NEXT;
 		if (i != 0)
 			desc[idx].flags |= DESC_HW;
+		nb_used = (nb_used + 1) & ~vq->vq_rs_thresh;
+		if (nb_used == 0 || nb_used == 1)
+			desc[idx].flags |= DESC_WB;
 
 		do {
 			idx = (vq->vq_avail_idx++) & (vq->vq_nentries - 1);
 			desc[idx].addr  = VIRTIO_MBUF_DATA_DMA_ADDR(txm, vq);
 			desc[idx].len   = txm->data_len;
 			desc[idx].flags = DESC_HW | VRING_DESC_F_NEXT;
+			nb_used = (nb_used + 1) & ~vq->vq_rs_thresh;
+			if (nb_used == 0 || nb_used == 1)
+				desc[idx].flags |= DESC_WB;
 		} while ((txm = txm->next) != NULL);
 
 		desc[idx].flags &= ~VRING_DESC_F_NEXT;
 	}
 
+end_of_tx:
 	if (likely(i)) {
 		rte_smp_wmb();
 		vq->vq_ring.desc_1_1[head_idx & (vq->vq_nentries - 1)].flags |= DESC_HW;
 	}
 
+	vq->vq_nb_used = nb_used;
+
 	txvq->stats.packets += i;
 	txvq->stats.errors  += nb_pkts - i;
 
diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
index 45f49d7..b104afa 100644
--- a/drivers/net/virtio/virtqueue.h
+++ b/drivers/net/virtio/virtqueue.h
@@ -194,6 +194,8 @@ struct virtqueue {
 	uint16_t vq_free_cnt;  /**< num of desc available */
 	uint16_t vq_avail_idx; /**< sync until needed */
 	uint16_t vq_free_thresh; /**< free threshold */
+	uint16_t vq_rs_thresh; /**< RS threshold */
+	uint16_t vq_nb_used ;  /**< num of unreported desc */
 
 	void *vq_ring_virt_mem;  /**< linear address of vring*/
 	unsigned int vq_ring_size;
diff --git a/lib/librte_vhost/virtio-1.1.h b/lib/librte_vhost/virtio-1.1.h
index 4241d0a..91e21f1 100644
--- a/lib/librte_vhost/virtio-1.1.h
+++ b/lib/librte_vhost/virtio-1.1.h
@@ -12,6 +12,7 @@
 #define BATCH_NOT_FIRST 0x0010
 #define BATCH_NOT_LAST  0x0020
 #define DESC_HW		0x0080
+#define DESC_WB		0x0100
 
 struct vring_desc_1_1 {
         __le64 addr;
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 08d53d9..96b84e5 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1100,7 +1100,8 @@ mbuf_is_consumed(struct rte_mbuf *m)
 static inline uint16_t __attribute__((always_inline))
 dequeue_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	     struct rte_mempool *mbuf_pool, struct rte_mbuf *m,
-	     struct vring_desc_1_1 *descs, uint16_t *desc_idx)
+	     struct vring_desc_1_1 *descs, uint16_t *desc_idx,
+	     struct vring_desc_1_1 **dps, uint16_t *_cnt)
 {
 	struct vring_desc_1_1 *desc;
 	uint64_t desc_addr;
@@ -1110,6 +1111,7 @@ dequeue_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	struct rte_mbuf *cur = m, *prev = m;
 	struct virtio_net_hdr *hdr = NULL;
 	uint16_t head_idx = *desc_idx;
+	uint16_t cnt = *_cnt;
 
 	desc = &descs[(head_idx++) & (vq->size - 1)];
 	if (unlikely((desc->len < dev->vhost_hlen)) ||
@@ -1132,6 +1134,9 @@ dequeue_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	 */
 	if (likely((desc->len == dev->vhost_hlen) &&
 		   (desc->flags & VRING_DESC_F_NEXT) != 0)) {
+		if (desc->flags & DESC_WB)
+			dps[cnt++] = desc;
+
 		desc = &descs[(head_idx++) & (vq->size - 1)];
 		if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
 			return -1;
@@ -1172,6 +1177,9 @@ dequeue_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			if ((desc->flags & VRING_DESC_F_NEXT) == 0)
 				break;
 
+			if (desc->flags & DESC_WB)
+				dps[cnt++] = desc;
+
 			desc = &descs[(head_idx++) & (vq->size - 1)];
 			if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
 				return -1;
@@ -1217,7 +1225,11 @@ dequeue_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	if (hdr)
 		vhost_dequeue_offload(hdr, m);
 
+	if (desc->flags & DESC_WB)
+		dps[cnt++] = desc;
+
 	*desc_idx = head_idx;
+	*_cnt = cnt;
 
 	return 0;
 }
@@ -1229,7 +1241,8 @@ vhost_dequeue_burst_1_1(struct virtio_net *dev, struct vhost_virtqueue *vq,
 {
 	uint16_t i;
 	struct vring_desc_1_1 *desc = vq->desc_1_1;
-	uint16_t head_idx = vq->last_used_idx;
+	struct vring_desc_1_1 *dps[64];
+	uint16_t cnt = 0;
 	uint16_t desc_idx;
 	int err;
 
@@ -1250,26 +1263,22 @@ vhost_dequeue_burst_1_1(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			break;
 		}
 
-		err = dequeue_desc(dev, vq, mbuf_pool, pkts[i], desc, &desc_idx);
+		err = dequeue_desc(dev, vq, mbuf_pool, pkts[i], desc,
+				   &desc_idx, dps, &cnt);
 		if (unlikely(err)) {
 			rte_pktmbuf_free(pkts[i]);
 			break;
 		}
 	}
+	count = i;
 
 	vq->last_used_idx = desc_idx;
-	if (likely(i)) {
-		rte_prefetch0(&desc[head_idx & (vq->size - 1)]);
-		for (desc_idx = head_idx + 1;
-		     desc_idx != vq->last_used_idx;
-		     desc_idx++) {
-			desc[desc_idx & (vq->size - 1)].flags = 0;
-		}
-		rte_smp_wmb();
-		desc[head_idx & (vq->size - 1)].flags = 0;
-	}
 
-	return i;
+	rte_smp_wmb();
+	for (i = 0; i < cnt; i++)
+		dps[i]->flags = 0;
+
+	return count;
 }
 
 uint16_t
-- 
2.7.4



More information about the dev mailing list