[dpdk-dev] [PATCH] vhost: broadcast RARP pkt by injecting it to receiving mbuf array

Qiu, Michael michael.qiu at intel.com
Wed Feb 24 09:15:36 CET 2016


On 2/22/2016 10:35 PM, Yuanhan Liu wrote:
> Broadcast RARP packet by injecting it to receiving mbuf array at
> rte_vhost_dequeue_burst().
>
> Commit 33226236a35e ("vhost: handle request to send RARP") iterates
> over all host interfaces and then broadcasts the RARP packet through
> each of them.  It did notify the switches about the new location of the
> migrated VM; however, the mac learning table in the target host is wrong
> (at least in my test with OVS):
>
>     $ ovs-appctl fdb/show ovsbr0
>      port  VLAN  MAC                Age
>         1     0  b6:3c:72:71:cd:4d   10
>     LOCAL     0  b6:3c:72:71:cd:4e   10
>     LOCAL     0  52:54:00:12:34:68    9
>         1     0  56:f6:64:2c:bc:c0    1
>
> Where 52:54:00:12:34:68 is the mac of the VM. As you can see from the
> above, the port learned is "LOCAL", which is the "ovsbr0" port. That
> is reasonable, since we indeed send the pkt by the "ovsbr0" interface.
>
> The wrong mac table leads all the packets destined for the VM to go to
> "ovsbr0" in the end, which ends up with all packets being lost, until the
> guest sends an ARP request (or reply) to refresh the mac learning table.
>
> Jianfeng then came up with a solution I had thought of at first but NAKed

Is it suitable to mention someone in the commit log?

Thanks,
Michael
> by myself, out of concern that it has potential issues [0]. The solution is
> as the title states: broadcast the RARP packet by injecting it into the
> receiving mbuf arrays at rte_vhost_dequeue_burst(). Having that idea brought
> up again made me think twice; it now looks like a false concern to me. And I
> have done a rough verification: it worked as expected.
>
> [0]: http://dpdk.org/ml/archives/dev/2016-February/033527.html
>
> Another note is that while preparing this version, I found that DPDK has
> some ARP related structures and macros defined. So, use them instead of
> the ones from the standard header files here.
>
> Cc: Thibaut Collet <thibaut.collet at 6wind.com>
> Suggested-by: Jianfeng Tan <jianfeng.tan at intel.com>
> Signed-off-by: Yuanhan Liu <yuanhan.liu at linux.intel.com>
> ---
>  lib/librte_vhost/rte_virtio_net.h             |   5 +-
>  lib/librte_vhost/vhost_rxtx.c                 |  80 +++++++++++++++-
>  lib/librte_vhost/vhost_user/vhost-net-user.c  |   2 +-
>  lib/librte_vhost/vhost_user/virtio-net-user.c | 128 ++++----------------------
>  lib/librte_vhost/vhost_user/virtio-net-user.h |   2 +-
>  5 files changed, 104 insertions(+), 113 deletions(-)
>
> diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h
> index 4a2303a..7d1fde2 100644
> --- a/lib/librte_vhost/rte_virtio_net.h
> +++ b/lib/librte_vhost/rte_virtio_net.h
> @@ -49,6 +49,7 @@
>  
>  #include <rte_memory.h>
>  #include <rte_mempool.h>
> +#include <rte_ether.h>
>  
>  struct rte_mbuf;
>  
> @@ -133,7 +134,9 @@ struct virtio_net {
>  	void			*priv;		/**< private context */
>  	uint64_t		log_size;	/**< Size of log area */
>  	uint64_t		log_base;	/**< Where dirty pages are logged */
> -	uint64_t		reserved[62];	/**< Reserve some spaces for future extension. */
> +	struct ether_addr	mac;		/**< MAC address */
> +	rte_atomic16_t		broadcast_rarp;	/**< A flag to tell if we need broadcast rarp packet */
> +	uint64_t		reserved[61];	/**< Reserve some spaces for future extension. */
>  	struct vhost_virtqueue	*virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];	/**< Contains all virtqueue information. */
>  } __rte_cache_aligned;
>  
> diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> index 12ce0cc..9d23eb1 100644
> --- a/lib/librte_vhost/vhost_rxtx.c
> +++ b/lib/librte_vhost/vhost_rxtx.c
> @@ -43,6 +43,7 @@
>  #include <rte_tcp.h>
>  #include <rte_udp.h>
>  #include <rte_sctp.h>
> +#include <rte_arp.h>
>  
>  #include "vhost-net.h"
>  
> @@ -761,11 +762,50 @@ vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
>  	}
>  }
>  
> +#define RARP_PKT_SIZE	64
> +
> +static int
> +make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
> +{
> +	struct ether_hdr *eth_hdr;
> +	struct arp_hdr  *rarp;
> +
> +	if (rarp_mbuf->buf_len < 64) {
> +		RTE_LOG(WARNING, VHOST_DATA,
> +			"failed to make RARP; mbuf size too small %u (< %d)\n",
> +			rarp_mbuf->buf_len, RARP_PKT_SIZE);
> +		return -1;
> +	}
> +
> +	/* Ethernet header. */
> +	eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
> +	memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
> +	ether_addr_copy(mac, &eth_hdr->s_addr);
> +	eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
> +
> +	/* RARP header. */
> +	rarp = (struct arp_hdr *)(eth_hdr + 1);
> +	rarp->arp_hrd = htons(ARP_HRD_ETHER);
> +	rarp->arp_pro = htons(ETHER_TYPE_IPv4);
> +	rarp->arp_hln = ETHER_ADDR_LEN;
> +	rarp->arp_pln = 4;
> +	rarp->arp_op  = htons(ARP_OP_REVREQUEST);
> +
> +	ether_addr_copy(mac, &rarp->arp_data.arp_sha);
> +	ether_addr_copy(mac, &rarp->arp_data.arp_tha);
> +	memset(&rarp->arp_data.arp_sip, 0x00, 4);
> +	memset(&rarp->arp_data.arp_tip, 0x00, 4);
> +
> +	rarp_mbuf->pkt_len  = rarp_mbuf->data_len = RARP_PKT_SIZE;
> +
> +	return 0;
> +}
> +
>  uint16_t
>  rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
>  	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
>  {
> -	struct rte_mbuf *m, *prev;
> +	struct rte_mbuf *m, *prev, *rarp_mbuf = NULL;
>  	struct vhost_virtqueue *vq;
>  	struct vring_desc *desc;
>  	uint64_t vb_addr = 0;
> @@ -788,11 +828,34 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
>  	if (unlikely(vq->enabled == 0))
>  		return 0;
>  
> +	/*
> +	 * Construct a RARP broadcast packet, and inject it into the "pkts"
> +	 * array, to make it look like the guest actually sent such a packet.
> +	 *
> +	 * Check user_send_rarp() for more information.
> +	 */
> +	if (unlikely(rte_atomic16_cmpset((volatile uint16_t *)
> +					 &dev->broadcast_rarp.cnt, 1, 0))) {
> +		rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
> +		if (rarp_mbuf == NULL) {
> +			RTE_LOG(ERR, VHOST_DATA,
> +				"Failed to allocate memory for mbuf.\n");
> +			return 0;
> +		}
> +
> +		if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
> +			rte_pktmbuf_free(rarp_mbuf);
> +			rarp_mbuf = NULL;
> +		} else {
> +			count -= 1;
> +		}
> +	}
> +
>  	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
>  
>  	/* If there are no available buffers then return. */
>  	if (vq->last_used_idx == avail_idx)
> -		return 0;
> +		goto out;
>  
>  	LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__,
>  		dev->device_fh);
> @@ -983,8 +1046,21 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
>  	vq->used->idx += entry_success;
>  	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
>  			sizeof(vq->used->idx));
> +
>  	/* Kick guest if required. */
>  	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
>  		eventfd_write(vq->callfd, (eventfd_t)1);
> +
> +out:
> +	if (unlikely(rarp_mbuf != NULL)) {
> +		/*
> +		 * Inject it to the head of "pkts" array, so that switch's mac
> +		 * learning table will get updated first.
> +		 */
> +		memmove(&pkts[1], pkts, entry_success * sizeof(m));
> +		pkts[0] = rarp_mbuf;
> +		entry_success += 1;
> +	}
> +
>  	return entry_success;
>  }
> diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.c b/lib/librte_vhost/vhost_user/vhost-net-user.c
> index de7eecb..df2bd64 100644
> --- a/lib/librte_vhost/vhost_user/vhost-net-user.c
> +++ b/lib/librte_vhost/vhost_user/vhost-net-user.c
> @@ -437,7 +437,7 @@ vserver_message_handler(int connfd, void *dat, int *remove)
>  		user_set_vring_enable(ctx, &msg.payload.state);
>  		break;
>  	case VHOST_USER_SEND_RARP:
> -		user_send_rarp(&msg);
> +		user_send_rarp(ctx, &msg);
>  		break;
>  
>  	default:
> diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c b/lib/librte_vhost/vhost_user/virtio-net-user.c
> index 68b24f4..65b5652 100644
> --- a/lib/librte_vhost/vhost_user/virtio-net-user.c
> +++ b/lib/librte_vhost/vhost_user/virtio-net-user.c
> @@ -39,12 +39,6 @@
>  #include <sys/mman.h>
>  #include <sys/types.h>
>  #include <sys/stat.h>
> -#include <sys/ioctl.h>
> -#include <sys/socket.h>
> -#include <net/ethernet.h>
> -#include <netinet/in.h>
> -#include <netinet/if_ether.h>
> -#include <linux/if_packet.h>
>  
>  #include <rte_common.h>
>  #include <rte_log.h>
> @@ -415,120 +409,38 @@ user_set_log_base(struct vhost_device_ctx ctx,
>  	return 0;
>  }
>  
> -#define RARP_BUF_SIZE	64
> -
> -static void
> -make_rarp_packet(uint8_t *buf, uint8_t *mac)
> -{
> -	struct ether_header *eth_hdr;
> -	struct ether_arp *rarp;
> -
> -	/* Ethernet header. */
> -	eth_hdr = (struct ether_header *)buf;
> -	memset(&eth_hdr->ether_dhost, 0xff, ETH_ALEN);
> -	memcpy(&eth_hdr->ether_shost, mac,  ETH_ALEN);
> -	eth_hdr->ether_type = htons(ETH_P_RARP);
> -
> -	/* RARP header. */
> -	rarp = (struct ether_arp *)(eth_hdr + 1);
> -	rarp->ea_hdr.ar_hrd = htons(ARPHRD_ETHER);
> -	rarp->ea_hdr.ar_pro = htons(ETHERTYPE_IP);
> -	rarp->ea_hdr.ar_hln = ETH_ALEN;
> -	rarp->ea_hdr.ar_pln = 4;
> -	rarp->ea_hdr.ar_op  = htons(ARPOP_RREQUEST);
> -
> -	memcpy(&rarp->arp_sha, mac, ETH_ALEN);
> -	memset(&rarp->arp_spa, 0x00, 4);
> -	memcpy(&rarp->arp_tha, mac, 6);
> -	memset(&rarp->arp_tpa, 0x00, 4);
> -}
> -
> -
> -static void
> -send_rarp(const char *ifname, uint8_t *rarp)
> -{
> -	int fd;
> -	struct ifreq ifr;
> -	struct sockaddr_ll addr;
> -
> -	fd = socket(AF_PACKET, SOCK_RAW, 0);
> -	if (fd < 0) {
> -		perror("socket failed");
> -		return;
> -	}
> -
> -	memset(&ifr, 0, sizeof(struct ifreq));
> -	strncpy(ifr.ifr_name, ifname, IFNAMSIZ);
> -	if (ioctl(fd, SIOCGIFINDEX, &ifr) < 0) {
> -		perror("failed to get interface index");
> -		close(fd);
> -		return;
> -	}
> -
> -	addr.sll_ifindex = ifr.ifr_ifindex;
> -	addr.sll_halen   = ETH_ALEN;
> -
> -	if (sendto(fd, rarp, RARP_BUF_SIZE, 0,
> -		   (const struct sockaddr*)&addr, sizeof(addr)) < 0) {
> -		perror("send rarp packet failed");
> -	}
> -}
> -
> -
>  /*
> - * Broadcast a RARP message to all interfaces, to update
> - * switch's mac table
> + * A RARP packet is constructed and broadcast to notify switches about
> + * the new location of the migrated VM, so that packets from outside will
> + * not be lost after migration.
> + *
> + * However, we don't actually "send" a rarp packet here, instead, we set
> + * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it.
>   */
>  int
> -user_send_rarp(struct VhostUserMsg *msg)
> +user_send_rarp(struct vhost_device_ctx ctx, struct VhostUserMsg *msg)
>  {
> +	struct virtio_net *dev;
>  	uint8_t *mac = (uint8_t *)&msg->payload.u64;
> -	uint8_t rarp[RARP_BUF_SIZE];
> -	struct ifconf ifc = {0, };
> -	struct ifreq *ifr;
> -	int nr = 16;
> -	int fd;
> -	uint32_t i;
> +
> +	dev = get_device(ctx);
> +	if (!dev)
> +		return -1;
>  
>  	RTE_LOG(DEBUG, VHOST_CONFIG,
>  		":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
>  		mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
> -
> -	make_rarp_packet(rarp, mac);
> +	memcpy(dev->mac.addr_bytes, mac, 6);
>  
>  	/*
> -	 * Get all interfaces
> +	 * Set the flag to inject a RARP broadcast packet at
> +	 * rte_vhost_dequeue_burst().
> +	 *
> +	 * rte_smp_wmb() is for making sure the mac is copied
> +	 * before the flag is set.
>  	 */
> -	fd = socket(AF_INET, SOCK_DGRAM, 0);
> -	if (fd < 0) {
> -		perror("failed to create AF_INET socket");
> -		return -1;
> -	}
> -
> -again:
> -	ifc.ifc_len = sizeof(*ifr) * nr;
> -	ifc.ifc_buf = realloc(ifc.ifc_buf, ifc.ifc_len);
> -
> -	if (ioctl(fd, SIOCGIFCONF, &ifc) < 0) {
> -		perror("failed at SIOCGIFCONF");
> -		close(fd);
> -		return -1;
> -	}
> -
> -	if (ifc.ifc_len == (int)sizeof(struct ifreq) * nr) {
> -		/*
> -		 * current ifc_buf is not big enough to hold
> -		 * all interfaces; double it and try again.
> -		 */
> -		nr *= 2;
> -		goto again;
> -	}
> -
> -	ifr = (struct ifreq *)ifc.ifc_buf;
> -	for (i = 0; i < ifc.ifc_len / sizeof(struct ifreq); i++)
> -		send_rarp(ifr[i].ifr_name, rarp);
> -
> -	close(fd);
> +	rte_smp_wmb();
> +	rte_atomic16_set(&dev->broadcast_rarp, 1);
>  
>  	return 0;
>  }
> diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.h b/lib/librte_vhost/vhost_user/virtio-net-user.h
> index 559bb46..cefec16 100644
> --- a/lib/librte_vhost/vhost_user/virtio-net-user.h
> +++ b/lib/librte_vhost/vhost_user/virtio-net-user.h
> @@ -54,7 +54,7 @@ void user_set_vring_kick(struct vhost_device_ctx, struct VhostUserMsg *);
>  void user_set_protocol_features(struct vhost_device_ctx ctx,
>  				uint64_t protocol_features);
>  int user_set_log_base(struct vhost_device_ctx ctx, struct VhostUserMsg *);
> -int user_send_rarp(struct VhostUserMsg *);
> +int user_send_rarp(struct vhost_device_ctx ctx, struct VhostUserMsg *);
>  
>  int user_get_vring_base(struct vhost_device_ctx, struct vhost_vring_state *);
>  



More information about the dev mailing list