[dpdk-dev] [PATCH 2/2] net/virtio: support GUEST ANNOUNCE

Tiwei Bie tiwei.bie at intel.com
Fri Nov 24 07:04:32 CET 2017


Hi,

Some quick comments. Will go through the whole patch later.

On Fri, Nov 24, 2017 at 03:04:00AM -0800, Xiao Wang wrote:
> When live migration is done, either the virtio frontend or the vhost
> backend of the backup VM needs to send out a gratuitous RARP packet
> to announce its new network location.
> 
> This patch enables the VIRTIO_NET_F_GUEST_ANNOUNCE feature to support
> the live migration scenario where the vhost backend doesn't have the
> ability to generate a RARP packet.
> 
> Brief introduction of the workflow:
> 1. QEMU finishes live migration and pokes the backup VM with an
>    interrupt.
> 2. The virtio interrupt handler reads out the interrupt status value,
>    and realizes it needs to send out a RARP packet to announce its
>    location.
> 3. Pause the device to stop the worker thread from touching the queues.
> 4. Inject a RARP packet into a Tx queue.
> 5. Ack the interrupt via the control queue.
> 6. Resume the device to continue packet processing.
> 
> Signed-off-by: Xiao Wang <xiao.w.wang at intel.com>
> ---
>  drivers/net/virtio/virtio_ethdev.c | 131 ++++++++++++++++++++++++++++++++++++-
>  drivers/net/virtio/virtio_ethdev.h |   4 ++
>  drivers/net/virtio/virtio_pci.h    |   1 +
>  drivers/net/virtio/virtio_rxtx.c   |  81 +++++++++++++++++++++++
>  drivers/net/virtio/virtqueue.h     |  11 ++++
>  5 files changed, 226 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
> index 1959b11..6eaea0e 100644
> --- a/drivers/net/virtio/virtio_ethdev.c
> +++ b/drivers/net/virtio/virtio_ethdev.c
> @@ -48,6 +48,8 @@
>  #include <rte_pci.h>
>  #include <rte_bus_pci.h>
>  #include <rte_ether.h>
> +#include <rte_ip.h>
> +#include <rte_arp.h>
>  #include <rte_common.h>
>  #include <rte_errno.h>
>  #include <rte_cpuflags.h>
> @@ -55,6 +57,7 @@
>  #include <rte_memory.h>
>  #include <rte_eal.h>
>  #include <rte_dev.h>
> +#include <rte_cycles.h>
>  
>  #include "virtio_ethdev.h"
>  #include "virtio_pci.h"
> @@ -106,6 +109,13 @@ static int virtio_dev_queue_stats_mapping_set(
>  	uint8_t stat_idx,
>  	uint8_t is_rx);
>  
> +static int make_rarp_packet(struct rte_mbuf *rarp_mbuf,
> +		const struct ether_addr *mac);
> +static int virtio_dev_pause(struct rte_eth_dev *dev);
> +static void virtio_dev_resume(struct rte_eth_dev *dev);
> +static void generate_rarp(struct rte_eth_dev *dev);
> +static void virtnet_ack_link_announce(struct rte_eth_dev *dev);
> +
>  /*
>   * The set of PCI devices this driver supports
>   */
> @@ -1249,9 +1259,116 @@ static int virtio_dev_xstats_get_names(struct rte_eth_dev *dev,
>  	return 0;
>  }
>  
> +#define RARP_PKT_SIZE	64
> +
> +static int
> +make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
> +{
> +	struct ether_hdr *eth_hdr;
> +	struct arp_hdr  *rarp;
> +
> +	if (rarp_mbuf->buf_len < RARP_PKT_SIZE) {
> +		PMD_DRV_LOG(ERR, "mbuf size too small %u (< %d)",
> +				rarp_mbuf->buf_len, RARP_PKT_SIZE);
> +		return -1;
> +	}
> +
> +	/* Ethernet header. */
> +	eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);

You can use rte_pktmbuf_mtod() directly.
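I.e.:

	eth_hdr = rte_pktmbuf_mtod(rarp_mbuf, struct ether_hdr *);

It's equivalent to mtod_offset() with a zero offset.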

> +	memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
> +	ether_addr_copy(mac, &eth_hdr->s_addr);
> +	eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
> +
> +	/* RARP header. */
> +	rarp = (struct arp_hdr *)(eth_hdr + 1);
> +	rarp->arp_hrd = htons(ARP_HRD_ETHER);
> +	rarp->arp_pro = htons(ETHER_TYPE_IPv4);
> +	rarp->arp_hln = ETHER_ADDR_LEN;
> +	rarp->arp_pln = 4;
> +	rarp->arp_op  = htons(ARP_OP_REVREQUEST);
> +
> +	ether_addr_copy(mac, &rarp->arp_data.arp_sha);
> +	ether_addr_copy(mac, &rarp->arp_data.arp_tha);
> +	memset(&rarp->arp_data.arp_sip, 0x00, 4);
> +	memset(&rarp->arp_data.arp_tip, 0x00, 4);
> +
> +	rarp_mbuf->data_len = RARP_PKT_SIZE;
> +	rarp_mbuf->pkt_len = RARP_PKT_SIZE;
> +
> +	return 0;
> +}
> +
> +static int
> +virtio_dev_pause(struct rte_eth_dev *dev)
> +{
> +	struct virtio_hw *hw = dev->data->dev_private;
> +
> +	if (hw->started == 0)
> +		return -1;
> +	hw->started = 0;
> +	/*
> +	 * Prevent the worker thread from touching queues to avoid races;
> +	 * 1 ms should be enough for the ongoing Tx function to finish.
> +	 */
> +	rte_delay_ms(1);
> +	return 0;
> +}
> +
> +static void
> +virtio_dev_resume(struct rte_eth_dev *dev)
> +{
> +	struct virtio_hw *hw = dev->data->dev_private;
> +
> +	hw->started = 1;
> +}
> +
> +static void
> +generate_rarp(struct rte_eth_dev *dev)
> +{
> +	struct virtio_hw *hw = dev->data->dev_private;
> +	struct rte_mbuf *rarp_mbuf = NULL;
> +	struct virtnet_tx *txvq = dev->data->tx_queues[0];
> +	struct virtnet_rx *rxvq = dev->data->rx_queues[0];
> +
> +	rarp_mbuf = rte_mbuf_raw_alloc(rxvq->mpool);
> +	if (rarp_mbuf == NULL) {
> +		PMD_DRV_LOG(ERR, "mbuf allocate failed");
> +		return;
> +	}
> +
> +	if (make_rarp_packet(rarp_mbuf, (struct ether_addr *)hw->mac_addr)) {
> +		rte_pktmbuf_free(rarp_mbuf);
> +		rarp_mbuf = NULL;
> +		return;
> +	}
> +
> +	/* If virtio port just stopped, no need to send RARP */
> +	if (virtio_dev_pause(dev) < -1)

You mean < 0? virtio_dev_pause() only returns -1 or 0, so this
condition can never be true, and the RARP packet would be injected
even when the port has already been stopped.
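I.e.:

	if (virtio_dev_pause(dev) < 0) {
		rte_pktmbuf_free(rarp_mbuf);
		return;
	}

Note that rarp_mbuf also needs to be freed on this path, otherwise
it's leaked when the device has just been stopped.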

> +		return;
> +
> +	virtio_inject_pkts(txvq, &rarp_mbuf, 1);
> +	/* Recover the stored hw status to let worker thread continue */
> +	virtio_dev_resume(dev);
> +}
> +
> +static void
> +virtnet_ack_link_announce(struct rte_eth_dev *dev)
> +{
> +	struct virtio_hw *hw = dev->data->dev_private;
> +	struct virtio_pmd_ctrl ctrl;
> +	int len;
> +
> +	ctrl.hdr.class = VIRTIO_NET_CTRL_ANNOUNCE;
> +	ctrl.hdr.cmd = VIRTIO_NET_CTRL_ANNOUNCE_ACK;
> +	len = 0;
> +
> +	virtio_send_command(hw->cvq, &ctrl, &len, 0);
> +}
> +
>  /*
> - * Process Virtio Config changed interrupt and call the callback
> - * if link state changed.
> + * Process virtio config changed interrupt. Call the callback
> + * if link state changed; generate gratuitous RARP packet if
> + * the status indicates an ANNOUNCE.
>   */
>  void
>  virtio_interrupt_handler(void *param)
> @@ -1274,6 +1391,12 @@ static int virtio_dev_xstats_get_names(struct rte_eth_dev *dev,
>  						      NULL, NULL);
>  	}
>  
> +	if (isr & VIRTIO_NET_S_ANNOUNCE) {
> +		rte_spinlock_lock(&hw->sl);
> +		generate_rarp(dev);
> +		virtnet_ack_link_announce(dev);
> +		rte_spinlock_unlock(&hw->sl);
> +	}
>  }
>  
>  /* set rx and tx handlers according to what is supported */
> @@ -1786,6 +1909,8 @@ static int eth_virtio_pci_remove(struct rte_pci_device *pci_dev)
>  			return -EBUSY;
>  		}
>  
> +	rte_spinlock_init(&hw->sl);
> +
>  	hw->use_simple_rx = 1;
>  	hw->use_simple_tx = 1;
>  
> @@ -1952,12 +2077,14 @@ static void virtio_dev_free_mbufs(struct rte_eth_dev *dev)
>  
>  	PMD_INIT_LOG(DEBUG, "stop");
>  
> +	rte_spinlock_lock(&hw->sl);
>  	if (intr_conf->lsc || intr_conf->rxq)
>  		virtio_intr_disable(dev);
>  
>  	hw->started = 0;
>  	memset(&link, 0, sizeof(link));
>  	virtio_dev_atomic_write_link_status(dev, &link);
> +	rte_spinlock_unlock(&hw->sl);
>  }
>  
>  static int
> diff --git a/drivers/net/virtio/virtio_ethdev.h b/drivers/net/virtio/virtio_ethdev.h
> index 2039bc5..24271cb 100644
> --- a/drivers/net/virtio/virtio_ethdev.h
> +++ b/drivers/net/virtio/virtio_ethdev.h
> @@ -67,6 +67,7 @@
>  	 1u << VIRTIO_NET_F_HOST_TSO6	  |	\
>  	 1u << VIRTIO_NET_F_MRG_RXBUF	  |	\
>  	 1u << VIRTIO_NET_F_MTU	| \
> +	 1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE  | \
>  	 1u << VIRTIO_RING_F_INDIRECT_DESC |    \
>  	 1ULL << VIRTIO_F_VERSION_1       |	\
>  	 1ULL << VIRTIO_F_IOMMU_PLATFORM)
> @@ -111,6 +112,9 @@ uint16_t virtio_recv_mergeable_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
>  uint16_t virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
>  		uint16_t nb_pkts);
>  
> +uint16_t virtio_inject_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
> +		uint16_t nb_pkts);
> +
>  uint16_t virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
>  		uint16_t nb_pkts);
>  
> diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h
> index 3c5ce66..3cd367e 100644
> --- a/drivers/net/virtio/virtio_pci.h
> +++ b/drivers/net/virtio/virtio_pci.h
> @@ -270,6 +270,7 @@ struct virtio_hw {
>  	struct virtio_pci_common_cfg *common_cfg;
>  	struct virtio_net_config *dev_cfg;
>  	void	    *virtio_user_dev;
> +	rte_spinlock_t sl;

Need to add some detailed comments to describe what's
protected by this lock.
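Something like this, perhaps (just a sketch, feel free to reword):

	/*
	 * App management thread and virtio interrupt handler thread
	 * both can change the device state (e.g. started), this lock
	 * keeps the state changes and the RARP injection from racing
	 * with each other.
	 */
	rte_spinlock_t sl;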

>  
>  	struct virtqueue **vqs;
>  };
> diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
> index 6a24fde..7313bdd 100644
> --- a/drivers/net/virtio/virtio_rxtx.c
> +++ b/drivers/net/virtio/virtio_rxtx.c
> @@ -1100,3 +1100,84 @@
>  
>  	return nb_tx;
>  }
> +
> +uint16_t
> +virtio_inject_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
> +{
> +	struct virtnet_tx *txvq = tx_queue;
> +	struct virtqueue *vq = txvq->vq;
> +	struct virtio_hw *hw = vq->hw;
> +	uint16_t hdr_size = hw->vtnet_hdr_size;
> +	uint16_t nb_used, nb_tx = 0;
> +
> +	if (unlikely(nb_pkts < 1))
> +		return nb_pkts;
> +
> +	PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
> +	nb_used = VIRTQUEUE_NUSED(vq);
> +
> +	virtio_rmb();
> +	if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
> +		virtio_xmit_cleanup(vq, nb_used);
> +
> +	for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
> +		struct rte_mbuf *txm = tx_pkts[nb_tx];
> +		int can_push = 0, use_indirect = 0, slots, need;
> +
> +		/* optimize ring usage */
> +		if ((vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) ||
> +					vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) &&
> +			rte_mbuf_refcnt_read(txm) == 1 &&
> +			RTE_MBUF_DIRECT(txm) &&
> +			txm->nb_segs == 1 &&
> +			rte_pktmbuf_headroom(txm) >= hdr_size &&
> +			rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
> +				__alignof__(struct virtio_net_hdr_mrg_rxbuf)))
> +			can_push = 1;
> +		else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
> +			 txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
> +			use_indirect = 1;
> +
> +		/* How many main ring entries are needed for this Tx?
> +		 * any_layout => number of segments
> +		 * indirect   => 1
> +		 * default    => number of segments + 1
> +		 */
> +		slots = use_indirect ? 1 : (txm->nb_segs + !can_push);
> +		need = slots - vq->vq_free_cnt;
> +
> +		/* A positive value indicates it needs free vring descriptors */
> +		if (unlikely(need > 0)) {
> +			nb_used = VIRTQUEUE_NUSED(vq);
> +			virtio_rmb();
> +			need = RTE_MIN(need, (int)nb_used);
> +
> +			virtio_xmit_cleanup(vq, need);
> +			need = slots - vq->vq_free_cnt;
> +			if (unlikely(need > 0)) {
> +				PMD_TX_LOG(ERR,
> +						"No free tx descriptors to transmit");
> +				break;
> +			}
> +		}
> +
> +		/* Enqueue Packet buffers */
> +		virtqueue_enqueue_xmit(txvq, txm, slots, use_indirect, can_push);
> +
> +		txvq->stats.bytes += txm->pkt_len;
> +		virtio_update_packet_stats(&txvq->stats, txm);
> +	}
> +
> +	txvq->stats.packets += nb_tx;
> +
> +	if (likely(nb_tx)) {
> +		vq_update_avail_idx(vq);
> +
> +		if (unlikely(virtqueue_kick_prepare(vq))) {
> +			virtqueue_notify(vq);
> +			PMD_TX_LOG(DEBUG, "Notified backend after xmit");
> +		}
> +	}
> +
> +	return nb_tx;
> +}

What's the difference between virtio_inject_pkts() and
virtio_xmit_pkts(), except that the latter checks hw->started?
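If that's the only difference, maybe virtio_xmit_pkts() could become a
thin wrapper around it to avoid duplicating the whole Tx path, e.g.
(untested):

	uint16_t
	virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
			uint16_t nb_pkts)
	{
		struct virtnet_tx *txvq = tx_queue;
		struct virtio_hw *hw = txvq->vq->hw;

		/* Don't touch the queues while the device is paused. */
		if (unlikely(hw->started == 0))
			return 0;

		return virtio_inject_pkts(tx_queue, tx_pkts, nb_pkts);
	}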

Best regards,
Tiwei Bie

