[dpdk-dev] [PATCH v2 1/7] vhost: refactor rte_vhost_dequeue_burst

Xie, Huawei huawei.xie at intel.com
Thu Mar 3 17:21:19 CET 2016


On 2/18/2016 9:48 PM, Yuanhan Liu wrote:
> The current rte_vhost_dequeue_burst() implementation is a bit messy
> and its logic is twisted. You can also see repeated code here and there:
> it invokes rte_pktmbuf_alloc() three times at three different places!
>
> However, rte_vhost_dequeue_burst() actually does a simple job: copy
> the packet data from vring desc to mbuf. What's tricky here is:
>
> - desc buff could be chained (by the desc->next field), so you need
>   to fetch the next one once the current one is wholly drained.
>
> - One mbuf might not be big enough to hold all desc buff, hence you
>   need to chain the mbuf as well, by the mbuf->next field.
>
> Even so, the logic can be simple. Here is the pseudo code.
>
> 	while (this_desc_is_not_drained_totally || has_next_desc) {
> 		if (this_desc_has_drained_totally) {
> 			this_desc = next_desc();
> 		}
>
> 		if (mbuf_has_no_room) {
> 			mbuf = allocate_a_new_mbuf();
> 		}
>
> 		COPY(mbuf, desc);
> 	}
>
> And this is how I refactored rte_vhost_dequeue_burst.
>
> Note that the old patch does a special handling for skipping virtio
> header. However, that could be simply done by adjusting desc_avail
> and desc_offset var:
>
> 	desc_avail  = desc->len - vq->vhost_hlen;
> 	desc_offset = vq->vhost_hlen;
>
> This refactor makes the code much more readable (IMO), yet it reduces
> binary code size (nearly 2K).
>
> Signed-off-by: Yuanhan Liu <yuanhan.liu at linux.intel.com>
> ---
>
> v2: - fix potential NULL dereference bug of var "prev" and "head"
> ---
>  lib/librte_vhost/vhost_rxtx.c | 297 +++++++++++++++++-------------------------
>  1 file changed, 116 insertions(+), 181 deletions(-)
>
> diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> index 5e7e5b1..d5cd0fa 100644
> --- a/lib/librte_vhost/vhost_rxtx.c
> +++ b/lib/librte_vhost/vhost_rxtx.c
> @@ -702,21 +702,104 @@ vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
>  	}
>  }
>  
> +static inline struct rte_mbuf *
> +copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +		  uint16_t desc_idx, struct rte_mempool *mbuf_pool)
> +{
> +	struct vring_desc *desc;
> +	uint64_t desc_addr;
> +	uint32_t desc_avail, desc_offset;
> +	uint32_t mbuf_avail, mbuf_offset;
> +	uint32_t cpy_len;
> +	struct rte_mbuf *head = NULL;
> +	struct rte_mbuf *cur = NULL, *prev = NULL;
> +	struct virtio_net_hdr *hdr;
> +
> +	desc = &vq->desc[desc_idx];
> +	desc_addr = gpa_to_vva(dev, desc->addr);
> +	rte_prefetch0((void *)(uintptr_t)desc_addr);
> +
> +	/* Retrieve virtio net header */
> +	hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
> +	desc_avail  = desc->len - vq->vhost_hlen;

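There is a serious bug here: desc->len - vq->vhost_hlen could wrap around
(unsigned underflow) when desc->len is smaller than vq->vhost_hlen, leaving
desc_avail with a huge value. A VM could easily create this case. Let us
fix it here.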

> +	desc_offset = vq->vhost_hlen;
> +
> +	mbuf_avail  = 0;
> +	mbuf_offset = 0;
> +	while (desc_avail || (desc->flags & VRING_DESC_F_NEXT) != 0) {
> +		/* This desc reachs to its end, get the next one */
> +		if (desc_avail == 0) {
> +			desc = &vq->desc[desc->next];
> +
> +			desc_addr = gpa_to_vva(dev, desc->addr);
> +			rte_prefetch0((void *)(uintptr_t)desc_addr);
> +
> +			desc_offset = 0;
> +			desc_avail  = desc->len;
> +
> +			PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
> +		}
> +
> +		/*
> +		 * This mbuf reachs to its end, get a new one
> +		 * to hold more data.
> +		 */
> +		if (mbuf_avail == 0) {
> +			cur = rte_pktmbuf_alloc(mbuf_pool);
> +			if (unlikely(!cur)) {
> +				RTE_LOG(ERR, VHOST_DATA, "Failed to "
> +					"allocate memory for mbuf.\n");
> +				if (head)
> +					rte_pktmbuf_free(head);
> +				return NULL;
> +			}
> +			if (!head) {
> +				head = cur;
> +			} else {
> +				prev->next = cur;
> +				prev->data_len = mbuf_offset;
> +				head->nb_segs += 1;
> +			}
> +			head->pkt_len += mbuf_offset;
> +			prev = cur;
> +
> +			mbuf_offset = 0;
> +			mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
> +		}
> +
> +		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
> +		rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset),
> +			(void *)((uintptr_t)(desc_addr + desc_offset)),
> +			cpy_len);
> +
> +		mbuf_avail  -= cpy_len;
> +		mbuf_offset += cpy_len;
> +		desc_avail  -= cpy_len;
> +		desc_offset += cpy_len;
> +	}
> +
> +	if (prev) {
> +		prev->data_len = mbuf_offset;
> +		head->pkt_len += mbuf_offset;
> +
> +		if (hdr->flags != 0 || hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE)
> +			vhost_dequeue_offload(hdr, head);
> +	}
> +
> +	return head;
> +}
> +
>  uint16_t
>  rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
>  	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
>  {
> -	struct rte_mbuf *m, *prev;
>  	struct vhost_virtqueue *vq;
> -	struct vring_desc *desc;
> -	uint64_t vb_addr = 0;
> -	uint64_t vb_net_hdr_addr = 0;
> -	uint32_t head[MAX_PKT_BURST];
> +	uint32_t desc_indexes[MAX_PKT_BURST];
>  	uint32_t used_idx;
>  	uint32_t i;
> -	uint16_t free_entries, entry_success = 0;
> +	uint16_t free_entries;
>  	uint16_t avail_idx;
> -	struct virtio_net_hdr *hdr = NULL;
> +	struct rte_mbuf *m;
>  
>  	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
>  		RTE_LOG(ERR, VHOST_DATA,
> @@ -730,197 +813,49 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
>  		return 0;
>  
>  	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
> -
> -	/* If there are no available buffers then return. */
> -	if (vq->last_used_idx == avail_idx)
> +	free_entries = avail_idx - vq->last_used_idx;
> +	if (free_entries == 0)
>  		return 0;
>  
> -	LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__,
> -		dev->device_fh);
> +	LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__, dev->device_fh);
>  
> -	/* Prefetch available ring to retrieve head indexes. */
> -	rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
> +	used_idx = vq->last_used_idx & (vq->size -1);
>  
> -	/*get the number of free entries in the ring*/
> -	free_entries = (avail_idx - vq->last_used_idx);
> +	/* Prefetch available ring to retrieve head indexes. */
> +	rte_prefetch0(&vq->avail->ring[used_idx]);
>  
> -	free_entries = RTE_MIN(free_entries, count);
> -	/* Limit to MAX_PKT_BURST. */
> -	free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);
> +	count = RTE_MIN(count, MAX_PKT_BURST);
> +	count = RTE_MIN(count, free_entries);
> +	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") about to dequeue %u buffers\n",
> +			dev->device_fh, count);
>  
> -	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
> -			dev->device_fh, free_entries);
>  	/* Retrieve all of the head indexes first to avoid caching issues. */
> -	for (i = 0; i < free_entries; i++)
> -		head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
> +	for (i = 0; i < count; i++) {
> +		desc_indexes[i] = vq->avail->ring[(vq->last_used_idx + i) &
> +					(vq->size - 1)];
> +	}
>  
>  	/* Prefetch descriptor index. */
> -	rte_prefetch0(&vq->desc[head[entry_success]]);
> +	rte_prefetch0(&vq->desc[desc_indexes[0]]);
>  	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
>  
> -	while (entry_success < free_entries) {
> -		uint32_t vb_avail, vb_offset;
> -		uint32_t seg_avail, seg_offset;
> -		uint32_t cpy_len;
> -		uint32_t seg_num = 0;
> -		struct rte_mbuf *cur;
> -		uint8_t alloc_err = 0;
> -
> -		desc = &vq->desc[head[entry_success]];
> -
> -		vb_net_hdr_addr = gpa_to_vva(dev, desc->addr);
> -		hdr = (struct virtio_net_hdr *)((uintptr_t)vb_net_hdr_addr);
> -
> -		/* Discard first buffer as it is the virtio header */
> -		if (desc->flags & VRING_DESC_F_NEXT) {
> -			desc = &vq->desc[desc->next];
> -			vb_offset = 0;
> -			vb_avail = desc->len;
> -		} else {
> -			vb_offset = vq->vhost_hlen;
> -			vb_avail = desc->len - vb_offset;
> -		}
> -
> -		/* Buffer address translation. */
> -		vb_addr = gpa_to_vva(dev, desc->addr);
> -		/* Prefetch buffer address. */
> -		rte_prefetch0((void *)(uintptr_t)vb_addr);
> -
> -		used_idx = vq->last_used_idx & (vq->size - 1);
> -
> -		if (entry_success < (free_entries - 1)) {
> -			/* Prefetch descriptor index. */
> -			rte_prefetch0(&vq->desc[head[entry_success+1]]);
> -			rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
> -		}
> -
> -		/* Update used index buffer information. */
> -		vq->used->ring[used_idx].id = head[entry_success];
> -		vq->used->ring[used_idx].len = 0;
> -
> -		/* Allocate an mbuf and populate the structure. */
> -		m = rte_pktmbuf_alloc(mbuf_pool);
> -		if (unlikely(m == NULL)) {
> -			RTE_LOG(ERR, VHOST_DATA,
> -				"Failed to allocate memory for mbuf.\n");
> -			break;
> -		}
> -		seg_offset = 0;
> -		seg_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
> -		cpy_len = RTE_MIN(vb_avail, seg_avail);
> -
> -		PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);
> -
> -		seg_num++;
> -		cur = m;
> -		prev = m;
> -		while (cpy_len != 0) {
> -			rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, seg_offset),
> -				(void *)((uintptr_t)(vb_addr + vb_offset)),
> -				cpy_len);
> -
> -			seg_offset += cpy_len;
> -			vb_offset += cpy_len;
> -			vb_avail -= cpy_len;
> -			seg_avail -= cpy_len;
> -
> -			if (vb_avail != 0) {
> -				/*
> -				 * The segment reachs to its end,
> -				 * while the virtio buffer in TX vring has
> -				 * more data to be copied.
> -				 */
> -				cur->data_len = seg_offset;
> -				m->pkt_len += seg_offset;
> -				/* Allocate mbuf and populate the structure. */
> -				cur = rte_pktmbuf_alloc(mbuf_pool);
> -				if (unlikely(cur == NULL)) {
> -					RTE_LOG(ERR, VHOST_DATA, "Failed to "
> -						"allocate memory for mbuf.\n");
> -					rte_pktmbuf_free(m);
> -					alloc_err = 1;
> -					break;
> -				}
> -
> -				seg_num++;
> -				prev->next = cur;
> -				prev = cur;
> -				seg_offset = 0;
> -				seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
> -			} else {
> -				if (desc->flags & VRING_DESC_F_NEXT) {
> -					/*
> -					 * There are more virtio buffers in
> -					 * same vring entry need to be copied.
> -					 */
> -					if (seg_avail == 0) {
> -						/*
> -						 * The current segment hasn't
> -						 * room to accomodate more
> -						 * data.
> -						 */
> -						cur->data_len = seg_offset;
> -						m->pkt_len += seg_offset;
> -						/*
> -						 * Allocate an mbuf and
> -						 * populate the structure.
> -						 */
> -						cur = rte_pktmbuf_alloc(mbuf_pool);
> -						if (unlikely(cur == NULL)) {
> -							RTE_LOG(ERR,
> -								VHOST_DATA,
> -								"Failed to "
> -								"allocate memory "
> -								"for mbuf\n");
> -							rte_pktmbuf_free(m);
> -							alloc_err = 1;
> -							break;
> -						}
> -						seg_num++;
> -						prev->next = cur;
> -						prev = cur;
> -						seg_offset = 0;
> -						seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
> -					}
> -
> -					desc = &vq->desc[desc->next];
> -
> -					/* Buffer address translation. */
> -					vb_addr = gpa_to_vva(dev, desc->addr);
> -					/* Prefetch buffer address. */
> -					rte_prefetch0((void *)(uintptr_t)vb_addr);
> -					vb_offset = 0;
> -					vb_avail = desc->len;
> -
> -					PRINT_PACKET(dev, (uintptr_t)vb_addr,
> -						desc->len, 0);
> -				} else {
> -					/* The whole packet completes. */
> -					cur->data_len = seg_offset;
> -					m->pkt_len += seg_offset;
> -					vb_avail = 0;
> -				}
> -			}
> -
> -			cpy_len = RTE_MIN(vb_avail, seg_avail);
> -		}
> -
> -		if (unlikely(alloc_err == 1))
> +	for (i = 0; i < count; i++) {
> +		m = copy_desc_to_mbuf(dev, vq, desc_indexes[i], mbuf_pool);
> +		if (m == NULL)
>  			break;
> +		pkts[i] = m;
>  
> -		m->nb_segs = seg_num;
> -		if ((hdr->flags != 0) || (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE))
> -			vhost_dequeue_offload(hdr, m);
> -
> -		pkts[entry_success] = m;
> -		vq->last_used_idx++;
> -		entry_success++;
> +		used_idx = vq->last_used_idx++ & (vq->size - 1);
> +		vq->used->ring[used_idx].id  = desc_indexes[i];
> +		vq->used->ring[used_idx].len = 0;
>  	}
>  
>  	rte_compiler_barrier();
> -	vq->used->idx += entry_success;
> +	vq->used->idx += i;
> +
>  	/* Kick guest if required. */
>  	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
>  		eventfd_write(vq->callfd, (eventfd_t)1);
> -	return entry_success;
> +
> +	return i;
>  }



More information about the dev mailing list