[dpdk-dev] [PATCH v3] virtio: Support mergeable buffer in virtio pmd

Fu, JingguoX jingguox.fu at intel.com
Wed Sep 24 11:22:09 CEST 2014


Tested-by: Jingguo Fu <jingguox.fu at intel.com>

This patch includes 1 files, and has been tested by Intel.
Please see information as the following:

Host:
Fedora 19 x86_64, Linux Kernel 3.9.0, GCC 4.8.2  Intel Xeon CPU E5-2680 v2 @ 2.80GHz
 NIC: Intel Niantic 82599, Intel i350, Intel 82580 and Intel 82576

Guest:
Fedora 16 x86_64, Linux Kernel 3.4.2, GCC 4.6.3 Qemu emulator 1.4.2

This patch tests with user space vhost driver library patch.
We verified zero copy and one copy test cases for functional and performance.

This patch depend on the two patches:
[dpdk-dev] [PATCH] virtio: Update max RX packet length http://www.dpdk.org/ml/archives/dev/2014-September/005107.html
[dpdk-dev] [PATCH] virtio: Fix vring entry number issue http://www.dpdk.org/ml/archives/dev/2014-September/005170.html


-----Original Message-----
From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Ouyang Changchun
Sent: Thursday, August 14, 2014 16:55
To: dev at dpdk.org
Subject: [dpdk-dev] [PATCH v3] virtio: Support mergeable buffer in virtio pmd

v3 change:
- Investigate the comments from Huawei and fix one potential issue of wrong offset to
  the number of descriptor in buffer; also fix other tiny comments.

v2 change:
- Resolve conflicts with the tip code;
- And resolve 2 issues:
   -- fix mbuf leak when discard an uncompleted packet.
   -- refine pkt.data to point to actual payload data start point.
 
v1 change:
- This patch supports mergeable buffer feature in DPDK based virtio PMD, which can
  receive jumbo frame with larger size, like 3K, 4K or even 9K.

Signed-off-by: Changchun Ouyang <changchun.ouyang at intel.com>
Acked-by: Huawei Xie <huawei.xie at intel.com>
---
 lib/librte_pmd_virtio/virtio_ethdev.c |  20 +--
 lib/librte_pmd_virtio/virtio_ethdev.h |   3 +
 lib/librte_pmd_virtio/virtio_rxtx.c   | 221 +++++++++++++++++++++++++++++-----
 3 files changed, 207 insertions(+), 37 deletions(-)

diff --git a/lib/librte_pmd_virtio/virtio_ethdev.c b/lib/librte_pmd_virtio/virtio_ethdev.c
index b9f5529..535d798 100644
--- a/lib/librte_pmd_virtio/virtio_ethdev.c
+++ b/lib/librte_pmd_virtio/virtio_ethdev.c
@@ -337,7 +337,7 @@ int virtio_dev_queue_setup(struct rte_eth_dev *dev,
 		snprintf(vq_name, sizeof(vq_name), "port%d_tvq%d_hdrzone",
 			dev->data->port_id, queue_idx);
 		vq->virtio_net_hdr_mz = rte_memzone_reserve_aligned(vq_name,
-			vq_size * sizeof(struct virtio_net_hdr),
+			vq_size * hw->vtnet_hdr_size,
 			socket_id, 0, CACHE_LINE_SIZE);
 		if (vq->virtio_net_hdr_mz == NULL) {
 			rte_free(vq);
@@ -346,7 +346,7 @@ int virtio_dev_queue_setup(struct rte_eth_dev *dev,
 		vq->virtio_net_hdr_mem =
 			vq->virtio_net_hdr_mz->phys_addr;
 		memset(vq->virtio_net_hdr_mz->addr, 0,
-			vq_size * sizeof(struct virtio_net_hdr));
+			vq_size * hw->vtnet_hdr_size);
 	} else if (queue_type == VTNET_CQ) {
 		/* Allocate a page for control vq command, data and status */
 		snprintf(vq_name, sizeof(vq_name), "port%d_cvq_hdrzone",
@@ -571,9 +571,6 @@ virtio_negotiate_features(struct virtio_hw *hw)
 	mask |= VIRTIO_NET_F_GUEST_TSO4 | VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_ECN;
 	mask |= VTNET_LRO_FEATURES;
 
-	/* rx_mbuf should not be in multiple merged segments */
-	mask |= VIRTIO_NET_F_MRG_RXBUF;
-
 	/* not negotiating INDIRECT descriptor table support */
 	mask |= VIRTIO_RING_F_INDIRECT_DESC;
 
@@ -746,7 +743,6 @@ eth_virtio_dev_init(__rte_unused struct eth_driver *eth_drv,
 	}
 
 	eth_dev->dev_ops = &virtio_eth_dev_ops;
-	eth_dev->rx_pkt_burst = &virtio_recv_pkts;
 	eth_dev->tx_pkt_burst = &virtio_xmit_pkts;
 
 	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
@@ -801,10 +797,13 @@ eth_virtio_dev_init(__rte_unused struct eth_driver *eth_drv,
 	virtio_negotiate_features(hw);
 
 	/* Setting up rx_header size for the device */
-	if (vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF))
+	if (vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
+		eth_dev->rx_pkt_burst = &virtio_recv_mergeable_pkts;
 		hw->vtnet_hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
-	else
+	} else {
+		eth_dev->rx_pkt_burst = &virtio_recv_pkts;
 		hw->vtnet_hdr_size = sizeof(struct virtio_net_hdr);
+	}
 
 	/* Allocate memory for storing MAC addresses */
 	eth_dev->data->mac_addrs = rte_zmalloc("virtio", ETHER_ADDR_LEN, 0);
@@ -1009,7 +1008,7 @@ static void virtio_dev_free_mbufs(struct rte_eth_dev *dev)
 
 		while ((buf = (struct rte_mbuf *)virtqueue_detatch_unused(
 					dev->data->rx_queues[i])) != NULL) {
-			rte_pktmbuf_free_seg(buf);
+			rte_pktmbuf_free(buf);
 			mbuf_num++;
 		}
 
@@ -1028,7 +1027,8 @@ static void virtio_dev_free_mbufs(struct rte_eth_dev *dev)
 		mbuf_num = 0;
 		while ((buf = (struct rte_mbuf *)virtqueue_detatch_unused(
 					dev->data->tx_queues[i])) != NULL) {
-			rte_pktmbuf_free_seg(buf);
+			rte_pktmbuf_free(buf);
+
 			mbuf_num++;
 		}
 
diff --git a/lib/librte_pmd_virtio/virtio_ethdev.h b/lib/librte_pmd_virtio/virtio_ethdev.h
index 858e644..d2e1eed 100644
--- a/lib/librte_pmd_virtio/virtio_ethdev.h
+++ b/lib/librte_pmd_virtio/virtio_ethdev.h
@@ -104,6 +104,9 @@ int  virtio_dev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
 uint16_t virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 		uint16_t nb_pkts);
 
+uint16_t virtio_recv_mergeable_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts);
+
 uint16_t virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 		uint16_t nb_pkts);
 
diff --git a/lib/librte_pmd_virtio/virtio_rxtx.c b/lib/librte_pmd_virtio/virtio_rxtx.c
index fcd8bd1..0b10108 100644
--- a/lib/librte_pmd_virtio/virtio_rxtx.c
+++ b/lib/librte_pmd_virtio/virtio_rxtx.c
@@ -146,6 +146,7 @@ static inline int
 virtqueue_enqueue_recv_refill(struct virtqueue *vq, struct rte_mbuf *cookie)
 {
 	struct vq_desc_extra *dxp;
+	struct virtio_hw *hw = vq->hw;
 	struct vring_desc *start_dp;
 	uint16_t needed = 1;
 	uint16_t head_idx, idx;
@@ -165,9 +166,11 @@ virtqueue_enqueue_recv_refill(struct virtqueue *vq, struct rte_mbuf *cookie)
 	dxp->ndescs = needed;
 
 	start_dp = vq->vq_ring.desc;
-	start_dp[idx].addr  =
-		(uint64_t) (cookie->buf_physaddr + RTE_PKTMBUF_HEADROOM - sizeof(struct virtio_net_hdr));
-	start_dp[idx].len   = cookie->buf_len - RTE_PKTMBUF_HEADROOM + sizeof(struct virtio_net_hdr);
+	start_dp[idx].addr =
+		(uint64_t)(cookie->buf_physaddr + RTE_PKTMBUF_HEADROOM
+		- hw->vtnet_hdr_size);
+	start_dp[idx].len =
+		cookie->buf_len - RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size;
 	start_dp[idx].flags =  VRING_DESC_F_WRITE;
 	idx = start_dp[idx].next;
 	vq->vq_desc_head_idx = idx;
@@ -184,8 +187,10 @@ virtqueue_enqueue_xmit(struct virtqueue *txvq, struct rte_mbuf *cookie)
 {
 	struct vq_desc_extra *dxp;
 	struct vring_desc *start_dp;
-	uint16_t needed = 2;
+	uint16_t seg_num = cookie->pkt.nb_segs;
+	uint16_t needed = 1 + seg_num;
 	uint16_t head_idx, idx;
+	uint16_t head_size = txvq->hw->vtnet_hdr_size;
 
 	if (unlikely(txvq->vq_free_cnt == 0))
 		return -ENOSPC;
@@ -198,19 +203,25 @@ virtqueue_enqueue_xmit(struct virtqueue *txvq, struct rte_mbuf *cookie)
 	idx = head_idx;
 	dxp = &txvq->vq_descx[idx];
 	if (dxp->cookie != NULL)
-		rte_pktmbuf_free_seg(dxp->cookie);
+		rte_pktmbuf_free(dxp->cookie);
 	dxp->cookie = (void *)cookie;
 	dxp->ndescs = needed;
 
 	start_dp = txvq->vq_ring.desc;
-	start_dp[idx].addr  =
-		txvq->virtio_net_hdr_mem + idx * sizeof(struct virtio_net_hdr);
-	start_dp[idx].len   = sizeof(struct virtio_net_hdr);
+	start_dp[idx].addr =
+		txvq->virtio_net_hdr_mem + idx * head_size;
+	start_dp[idx].len = (uint32_t)head_size;
 	start_dp[idx].flags = VRING_DESC_F_NEXT;
-	idx = start_dp[idx].next;
-	start_dp[idx].addr  = RTE_MBUF_DATA_DMA_ADDR(cookie);
-	start_dp[idx].len   = cookie->pkt.data_len;
-	start_dp[idx].flags = 0;
+
+	for (; ((seg_num > 0) && (cookie != NULL)); seg_num--) {
+		idx = start_dp[idx].next;
+		start_dp[idx].addr  = RTE_MBUF_DATA_DMA_ADDR(cookie);
+		start_dp[idx].len   = cookie->pkt.data_len;
+		start_dp[idx].flags = VRING_DESC_F_NEXT;
+		cookie = cookie->pkt.next;
+	}
+
+	start_dp[idx].flags &= ~VRING_DESC_F_NEXT;
 	idx = start_dp[idx].next;
 	txvq->vq_desc_head_idx = idx;
 	if (txvq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
@@ -284,7 +295,7 @@ virtio_dev_vring_start(struct virtqueue *vq, int queue_type)
 			error = virtqueue_enqueue_recv_refill(vq, m);
 
 			if (error) {
-				rte_pktmbuf_free_seg(m);
+				rte_pktmbuf_free(m);
 				break;
 			}
 			nbufs++;
@@ -423,7 +434,7 @@ virtio_discard_rxbuf(struct virtqueue *vq, struct rte_mbuf *m)
 	error = virtqueue_enqueue_recv_refill(vq, m);
 	if (unlikely(error)) {
 		RTE_LOG(ERR, PMD, "cannot requeue discarded mbuf");
-		rte_pktmbuf_free_seg(m);
+		rte_pktmbuf_free(m);
 	}
 }
 
@@ -433,13 +444,13 @@ uint16_t
 virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 {
 	struct virtqueue *rxvq = rx_queue;
-	struct virtio_hw *hw = rxvq->hw;
 	struct rte_mbuf *rxm, *new_mbuf;
 	uint16_t nb_used, num, nb_rx = 0;
 	uint32_t len[VIRTIO_MBUF_BURST_SZ];
 	struct rte_mbuf *rcv_pkts[VIRTIO_MBUF_BURST_SZ];
 	int error;
 	uint32_t i, nb_enqueued = 0;
+	const uint32_t hdr_size = sizeof(struct virtio_net_hdr);
 
 	nb_used = VIRTQUEUE_NUSED(rxvq);
 
@@ -460,8 +471,7 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 
 		PMD_RX_LOG(DEBUG, "packet len:%d", len[i]);
 
-		if (unlikely(len[i]
-			     < (uint32_t)hw->vtnet_hdr_size + ETHER_HDR_LEN)) {
+		if (unlikely(len[i] < hdr_size + ETHER_HDR_LEN)) {
 			PMD_RX_LOG(ERR, "Packet drop");
 			nb_enqueued++;
 			virtio_discard_rxbuf(rxvq, rxm);
@@ -471,17 +481,16 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 
 		rxm->pkt.in_port = rxvq->port_id;
 		rxm->pkt.data = (char *)rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
+
 		rxm->pkt.nb_segs = 1;
 		rxm->pkt.next = NULL;
-		rxm->pkt.pkt_len  = (uint32_t)(len[i]
-					       - sizeof(struct virtio_net_hdr));
-		rxm->pkt.data_len = (uint16_t)(len[i]
-					       - sizeof(struct virtio_net_hdr));
+		rxm->pkt.pkt_len = (uint32_t)(len[i] - hdr_size);
+		rxm->pkt.data_len = (uint16_t)(len[i] - hdr_size);
 
 		VIRTIO_DUMP_PACKET(rxm, rxm->pkt.data_len);
 
 		rx_pkts[nb_rx++] = rxm;
-		rxvq->bytes += len[i] - sizeof(struct virtio_net_hdr);
+		rxvq->bytes += rx_pkts[nb_rx - 1]->pkt.pkt_len;
 	}
 
 	rxvq->packets += nb_rx;
@@ -498,11 +507,165 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		}
 		error = virtqueue_enqueue_recv_refill(rxvq, new_mbuf);
 		if (unlikely(error)) {
-			rte_pktmbuf_free_seg(new_mbuf);
+			rte_pktmbuf_free(new_mbuf);
 			break;
 		}
 		nb_enqueued++;
 	}
+
+	if (likely(nb_enqueued)) {
+		if (unlikely(virtqueue_kick_prepare(rxvq))) {
+			virtqueue_notify(rxvq);
+			PMD_RX_LOG(DEBUG, "Notified\n");
+		}
+	}
+
+	vq_update_avail_idx(rxvq);
+
+	return nb_rx;
+}
+
+uint16_t
+virtio_recv_mergeable_pkts(void *rx_queue,
+			struct rte_mbuf **rx_pkts,
+			uint16_t nb_pkts)
+{
+	struct virtqueue *rxvq = rx_queue;
+	struct rte_mbuf *rxm, *new_mbuf;
+	uint16_t nb_used, num, nb_rx = 0;
+	uint32_t len[VIRTIO_MBUF_BURST_SZ];
+	struct rte_mbuf *rcv_pkts[VIRTIO_MBUF_BURST_SZ];
+	struct rte_mbuf *prev;
+	int error;
+	uint32_t i = 0, nb_enqueued = 0;
+	uint32_t seg_num = 0;
+	uint16_t extra_idx = 0;
+	uint32_t seg_res = 0;
+	const uint32_t hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+
+	nb_used = VIRTQUEUE_NUSED(rxvq);
+
+	rmb();
+
+	if (nb_used == 0)
+		return 0;
+
+	PMD_RX_LOG(DEBUG, "used:%d\n", nb_used);
+
+	while (i < nb_used) {
+		struct virtio_net_hdr_mrg_rxbuf *header;
+
+		if (nb_rx == nb_pkts)
+			break;
+
+		num = virtqueue_dequeue_burst_rx(rxvq, rcv_pkts, len, 1);
+		if (num != 1)
+			continue;
+
+		i++;
+
+		PMD_RX_LOG(DEBUG, "dequeue:%d\n", num);
+		PMD_RX_LOG(DEBUG, "packet len:%d\n", len[0]);
+
+		rxm = rcv_pkts[0];
+
+		if (unlikely(len[0] < hdr_size + ETHER_HDR_LEN)) {
+			PMD_RX_LOG(ERR, "Packet drop\n");
+			nb_enqueued++;
+			virtio_discard_rxbuf(rxvq, rxm);
+			rxvq->errors++;
+			continue;
+		}
+
+		header = (struct virtio_net_hdr_mrg_rxbuf *)((char *)rxm->buf_addr +
+			RTE_PKTMBUF_HEADROOM - hdr_size);
+		seg_num = header->num_buffers;
+
+		if (seg_num == 0)
+			seg_num = 1;
+
+		rxm->pkt.data = (char *)rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
+		rxm->pkt.nb_segs = seg_num;
+		rxm->pkt.next = NULL;
+		rxm->pkt.pkt_len = (uint32_t)(len[0] - hdr_size);
+		rxm->pkt.data_len = (uint16_t)(len[0] - hdr_size);
+
+		rxm->pkt.in_port = rxvq->port_id;
+		rx_pkts[nb_rx] = rxm;
+		prev = rxm;
+
+		seg_res = seg_num - 1;
+
+		while (seg_res != 0) {
+			/*
+			 * Get extra segments for current uncompleted packet.
+			 */
+			uint32_t  rcv_cnt =
+				RTE_MIN(seg_res, RTE_DIM(rcv_pkts));
+			if (likely(VIRTQUEUE_NUSED(rxvq) >= rcv_cnt)) {
+				uint32_t rx_num =
+					virtqueue_dequeue_burst_rx(rxvq,
+					rcv_pkts, len, rcv_cnt);
+				i += rx_num;
+				rcv_cnt = rx_num;
+			} else {
+				PMD_RX_LOG(ERR,
+					"No enough segments for packet.\n");
+				nb_enqueued++;
+				virtio_discard_rxbuf(rxvq, rxm);
+				rxvq->errors++;
+				break;
+			}
+
+			extra_idx = 0;
+
+			while (extra_idx < rcv_cnt) {
+				rxm = rcv_pkts[extra_idx];
+
+				rxm->pkt.data =
+					(char *)rxm->buf_addr +
+					RTE_PKTMBUF_HEADROOM - hdr_size;
+				rxm->pkt.next = NULL;
+				rxm->pkt.pkt_len = (uint32_t)(len[extra_idx]);
+				rxm->pkt.data_len = (uint16_t)(len[extra_idx]);
+
+				if (prev)
+					prev->pkt.next = rxm;
+
+				prev = rxm;
+				rx_pkts[nb_rx]->pkt.pkt_len += rxm->pkt.pkt_len;
+				extra_idx++;
+			};
+			seg_res -= rcv_cnt;
+		}
+
+		VIRTIO_DUMP_PACKET(rx_pkts[nb_rx],
+			rx_pkts[nb_rx]->pkt.data_len);
+
+		rxvq->bytes += rx_pkts[nb_rx]->pkt.pkt_len;
+		nb_rx++;
+	}
+
+	rxvq->packets += nb_rx;
+
+	/* Allocate new mbuf for the used descriptor */
+	error = ENOSPC;
+	while (likely(!virtqueue_full(rxvq))) {
+		new_mbuf = rte_rxmbuf_alloc(rxvq->mpool);
+		if (unlikely(new_mbuf == NULL)) {
+			struct rte_eth_dev *dev
+				= &rte_eth_devices[rxvq->port_id];
+			dev->data->rx_mbuf_alloc_failed++;
+			break;
+		}
+		error = virtqueue_enqueue_recv_refill(rxvq, new_mbuf);
+		if (unlikely(error)) {
+			rte_pktmbuf_free(new_mbuf);
+			break;
+		}
+		nb_enqueued++;
+	}
+
 	if (likely(nb_enqueued)) {
 		if (unlikely(virtqueue_kick_prepare(rxvq))) {
 			virtqueue_notify(rxvq);
@@ -536,12 +699,16 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	num = (uint16_t)(likely(nb_used < VIRTIO_MBUF_BURST_SZ) ? nb_used : VIRTIO_MBUF_BURST_SZ);
 
 	while (nb_tx < nb_pkts) {
-		if (virtqueue_full(txvq) && num) {
+		int need = tx_pkts[nb_tx]->pkt.nb_segs - txvq->vq_free_cnt;
+		int deq_cnt = RTE_MIN(need, (int)num);
+
+		num -= (deq_cnt > 0) ? deq_cnt : 0;
+		while (deq_cnt > 0) {
 			virtqueue_dequeue_pkt_tx(txvq);
-			num--;
+			deq_cnt--;
 		}
 
-		if (!virtqueue_full(txvq)) {
+		if (tx_pkts[nb_tx]->pkt.nb_segs <= txvq->vq_free_cnt) {
 			txm = tx_pkts[nb_tx];
 			/* Enqueue Packet buffers */
 			error = virtqueue_enqueue_xmit(txvq, txm);
@@ -555,7 +722,7 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 				break;
 			}
 			nb_tx++;
-			txvq->bytes += txm->pkt.data_len;
+			txvq->bytes += txm->pkt.pkt_len;
 		} else {
 			PMD_TX_LOG(ERR, "No free tx descriptors to transmit");
 			break;
-- 
1.8.4.2



More information about the dev mailing list