[dpdk-dev] [PATCH] vhost: broadcast RARP pkt by injecting it to receiving mbuf array
Qiu, Michael
michael.qiu at intel.com
Wed Feb 24 09:15:36 CET 2016
On 2/22/2016 10:35 PM, Yuanhan Liu wrote:
> Broadcast RARP packet by injecting it to receiving mbuf array at
> rte_vhost_dequeue_burst().
>
> Commit 33226236a35e ("vhost: handle request to send RARP") iterates
> all host interfaces and then broadcast it by all of them. It did
> notify the switches about the new location of the migrated VM, however,
> the mac learning table in the target host is wrong (at least in my
> test with OVS):
>
> $ ovs-appctl fdb/show ovsbr0
> port VLAN MAC Age
> 1 0 b6:3c:72:71:cd:4d 10
> LOCAL 0 b6:3c:72:71:cd:4e 10
> LOCAL 0 52:54:00:12:34:68 9
> 1 0 56:f6:64:2c:bc:c0 1
>
> Where 52:54:00:12:34:68 is the mac of the VM. As you can see from the
> above, the port learned is "LOCAL", which is the "ovsbr0" port. That
> is reasonable, since we indeed send the pkt by the "ovsbr0" interface.
>
> The wrong mac table leads all the packets destined for the VM to go to "ovsbr0"
> in the end, which ends up with all packets being lost, until the guest
> sends an ARP request (or reply) to refresh the mac learning table.
>
> Jianfeng then came up with a solution I have thought of firstly but NAKed
Is it suitable to mention someone in the commit log?
Thanks,
Michael
> by myself, concerning it has potential issues [0]. The solution is as title
> stated: broadcast the RARP packet by injecting it to the receiving mbuf
> arrays at rte_vhost_dequeue_burst(). The re-bring of that idea made me
> think it twice; it looked like a false concern to me then. And I had done
> a rough verification: it worked as expected.
>
> [0]: http://dpdk.org/ml/archives/dev/2016-February/033527.html
>
> Another note is that while preparing this version, I found that DPDK has
> some ARP related structures and macros defined. So, use them instead of
> the one from standard header files here.
>
> Cc: Thibaut Collet <thibaut.collet at 6wind.com>
> Suggested-by: Jianfeng Tan <jianfeng.tan at intel.com>
> Signed-off-by: Yuanhan Liu <yuanhan.liu at linux.intel.com>
> ---
> lib/librte_vhost/rte_virtio_net.h | 5 +-
> lib/librte_vhost/vhost_rxtx.c | 80 +++++++++++++++-
> lib/librte_vhost/vhost_user/vhost-net-user.c | 2 +-
> lib/librte_vhost/vhost_user/virtio-net-user.c | 128 ++++----------------------
> lib/librte_vhost/vhost_user/virtio-net-user.h | 2 +-
> 5 files changed, 104 insertions(+), 113 deletions(-)
>
> diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h
> index 4a2303a..7d1fde2 100644
> --- a/lib/librte_vhost/rte_virtio_net.h
> +++ b/lib/librte_vhost/rte_virtio_net.h
> @@ -49,6 +49,7 @@
>
> #include <rte_memory.h>
> #include <rte_mempool.h>
> +#include <rte_ether.h>
>
> struct rte_mbuf;
>
> @@ -133,7 +134,9 @@ struct virtio_net {
> void *priv; /**< private context */
> uint64_t log_size; /**< Size of log area */
> uint64_t log_base; /**< Where dirty pages are logged */
> - uint64_t reserved[62]; /**< Reserve some spaces for future extension. */
> + struct ether_addr mac; /**< MAC address */
> + rte_atomic16_t broadcast_rarp; /**< A flag to tell if we need broadcast rarp packet */
> + uint64_t reserved[61]; /**< Reserve some spaces for future extension. */
> struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2]; /**< Contains all virtqueue information. */
> } __rte_cache_aligned;
>
> diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> index 12ce0cc..9d23eb1 100644
> --- a/lib/librte_vhost/vhost_rxtx.c
> +++ b/lib/librte_vhost/vhost_rxtx.c
> @@ -43,6 +43,7 @@
> #include <rte_tcp.h>
> #include <rte_udp.h>
> #include <rte_sctp.h>
> +#include <rte_arp.h>
>
> #include "vhost-net.h"
>
> @@ -761,11 +762,50 @@ vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
> }
> }
>
> +#define RARP_PKT_SIZE 64
> +
> +static int
> +make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
> +{
> + struct ether_hdr *eth_hdr;
> + struct arp_hdr *rarp;
> +
> + if (rarp_mbuf->buf_len < 64) {
> + RTE_LOG(WARNING, VHOST_DATA,
> + "failed to make RARP; mbuf size too small %u (< %d)\n",
> + rarp_mbuf->buf_len, RARP_PKT_SIZE);
> + return -1;
> + }
> +
> + /* Ethernet header. */
> + eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
> + memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
> + ether_addr_copy(mac, &eth_hdr->s_addr);
> + eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
> +
> + /* RARP header. */
> + rarp = (struct arp_hdr *)(eth_hdr + 1);
> + rarp->arp_hrd = htons(ARP_HRD_ETHER);
> + rarp->arp_pro = htons(ETHER_TYPE_IPv4);
> + rarp->arp_hln = ETHER_ADDR_LEN;
> + rarp->arp_pln = 4;
> + rarp->arp_op = htons(ARP_OP_REVREQUEST);
> +
> + ether_addr_copy(mac, &rarp->arp_data.arp_sha);
> + ether_addr_copy(mac, &rarp->arp_data.arp_tha);
> + memset(&rarp->arp_data.arp_sip, 0x00, 4);
> + memset(&rarp->arp_data.arp_tip, 0x00, 4);
> +
> + rarp_mbuf->pkt_len = rarp_mbuf->data_len = RARP_PKT_SIZE;
> +
> + return 0;
> +}
> +
> uint16_t
> rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
> struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
> {
> - struct rte_mbuf *m, *prev;
> + struct rte_mbuf *m, *prev, *rarp_mbuf = NULL;
> struct vhost_virtqueue *vq;
> struct vring_desc *desc;
> uint64_t vb_addr = 0;
> @@ -788,11 +828,34 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
> if (unlikely(vq->enabled == 0))
> return 0;
>
> + /*
> + * Construct a RARP broadcast packet, and inject it to the "pkts"
> + * array, to looks like that guest actually send such packet.
> + *
> + * Check user_send_rarp() for more information.
> + */
> + if (unlikely(rte_atomic16_cmpset((volatile uint16_t *)
> + &dev->broadcast_rarp.cnt, 1, 0))) {
> + rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
> + if (rarp_mbuf == NULL) {
> + RTE_LOG(ERR, VHOST_DATA,
> + "Failed to allocate memory for mbuf.\n");
> + return 0;
> + }
> +
> + if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
> + rte_pktmbuf_free(rarp_mbuf);
> + rarp_mbuf = NULL;
> + } else {
> + count -= 1;
> + }
> + }
> +
> avail_idx = *((volatile uint16_t *)&vq->avail->idx);
>
> /* If there are no available buffers then return. */
> if (vq->last_used_idx == avail_idx)
> - return 0;
> + goto out;
>
> LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__,
> dev->device_fh);
> @@ -983,8 +1046,21 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
> vq->used->idx += entry_success;
> vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
> sizeof(vq->used->idx));
> +
> /* Kick guest if required. */
> if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
> eventfd_write(vq->callfd, (eventfd_t)1);
> +
> +out:
> + if (unlikely(rarp_mbuf != NULL)) {
> + /*
> + * Inject it to the head of "pkts" array, so that switch's mac
> + * learning table will get updated first.
> + */
> + memmove(&pkts[1], pkts, entry_success * sizeof(m));
> + pkts[0] = rarp_mbuf;
> + entry_success += 1;
> + }
> +
> return entry_success;
> }
> diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.c b/lib/librte_vhost/vhost_user/vhost-net-user.c
> index de7eecb..df2bd64 100644
> --- a/lib/librte_vhost/vhost_user/vhost-net-user.c
> +++ b/lib/librte_vhost/vhost_user/vhost-net-user.c
> @@ -437,7 +437,7 @@ vserver_message_handler(int connfd, void *dat, int *remove)
> user_set_vring_enable(ctx, &msg.payload.state);
> break;
> case VHOST_USER_SEND_RARP:
> - user_send_rarp(&msg);
> + user_send_rarp(ctx, &msg);
> break;
>
> default:
> diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c b/lib/librte_vhost/vhost_user/virtio-net-user.c
> index 68b24f4..65b5652 100644
> --- a/lib/librte_vhost/vhost_user/virtio-net-user.c
> +++ b/lib/librte_vhost/vhost_user/virtio-net-user.c
> @@ -39,12 +39,6 @@
> #include <sys/mman.h>
> #include <sys/types.h>
> #include <sys/stat.h>
> -#include <sys/ioctl.h>
> -#include <sys/socket.h>
> -#include <net/ethernet.h>
> -#include <netinet/in.h>
> -#include <netinet/if_ether.h>
> -#include <linux/if_packet.h>
>
> #include <rte_common.h>
> #include <rte_log.h>
> @@ -415,120 +409,38 @@ user_set_log_base(struct vhost_device_ctx ctx,
> return 0;
> }
>
> -#define RARP_BUF_SIZE 64
> -
> -static void
> -make_rarp_packet(uint8_t *buf, uint8_t *mac)
> -{
> - struct ether_header *eth_hdr;
> - struct ether_arp *rarp;
> -
> - /* Ethernet header. */
> - eth_hdr = (struct ether_header *)buf;
> - memset(&eth_hdr->ether_dhost, 0xff, ETH_ALEN);
> - memcpy(&eth_hdr->ether_shost, mac, ETH_ALEN);
> - eth_hdr->ether_type = htons(ETH_P_RARP);
> -
> - /* RARP header. */
> - rarp = (struct ether_arp *)(eth_hdr + 1);
> - rarp->ea_hdr.ar_hrd = htons(ARPHRD_ETHER);
> - rarp->ea_hdr.ar_pro = htons(ETHERTYPE_IP);
> - rarp->ea_hdr.ar_hln = ETH_ALEN;
> - rarp->ea_hdr.ar_pln = 4;
> - rarp->ea_hdr.ar_op = htons(ARPOP_RREQUEST);
> -
> - memcpy(&rarp->arp_sha, mac, ETH_ALEN);
> - memset(&rarp->arp_spa, 0x00, 4);
> - memcpy(&rarp->arp_tha, mac, 6);
> - memset(&rarp->arp_tpa, 0x00, 4);
> -}
> -
> -
> -static void
> -send_rarp(const char *ifname, uint8_t *rarp)
> -{
> - int fd;
> - struct ifreq ifr;
> - struct sockaddr_ll addr;
> -
> - fd = socket(AF_PACKET, SOCK_RAW, 0);
> - if (fd < 0) {
> - perror("socket failed");
> - return;
> - }
> -
> - memset(&ifr, 0, sizeof(struct ifreq));
> - strncpy(ifr.ifr_name, ifname, IFNAMSIZ);
> - if (ioctl(fd, SIOCGIFINDEX, &ifr) < 0) {
> - perror("failed to get interface index");
> - close(fd);
> - return;
> - }
> -
> - addr.sll_ifindex = ifr.ifr_ifindex;
> - addr.sll_halen = ETH_ALEN;
> -
> - if (sendto(fd, rarp, RARP_BUF_SIZE, 0,
> - (const struct sockaddr*)&addr, sizeof(addr)) < 0) {
> - perror("send rarp packet failed");
> - }
> -}
> -
> -
> /*
> - * Broadcast a RARP message to all interfaces, to update
> - * switch's mac table
> + * An rarp packet is constructed and broadcasted to notify switches about
> + * the new location of the migrated VM, so that packets from outside will
> + * not be lost after migration.
> + *
> + * However, we don't actually "send" a rarp packet here, instead, we set
> + * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it.
> */
> int
> -user_send_rarp(struct VhostUserMsg *msg)
> +user_send_rarp(struct vhost_device_ctx ctx, struct VhostUserMsg *msg)
> {
> + struct virtio_net *dev;
> uint8_t *mac = (uint8_t *)&msg->payload.u64;
> - uint8_t rarp[RARP_BUF_SIZE];
> - struct ifconf ifc = {0, };
> - struct ifreq *ifr;
> - int nr = 16;
> - int fd;
> - uint32_t i;
> +
> + dev = get_device(ctx);
> + if (!dev)
> + return -1;
>
> RTE_LOG(DEBUG, VHOST_CONFIG,
> ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
> mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
> -
> - make_rarp_packet(rarp, mac);
> + memcpy(dev->mac.addr_bytes, mac, 6);
>
> /*
> - * Get all interfaces
> + * Set the flag to inject a RARP broadcast packet at
> + * rte_vhost_dequeue_burst().
> + *
> + * rte_smp_wmb() is for making sure the mac is copied
> + * before the flag is set.
> */
> - fd = socket(AF_INET, SOCK_DGRAM, 0);
> - if (fd < 0) {
> - perror("failed to create AF_INET socket");
> - return -1;
> - }
> -
> -again:
> - ifc.ifc_len = sizeof(*ifr) * nr;
> - ifc.ifc_buf = realloc(ifc.ifc_buf, ifc.ifc_len);
> -
> - if (ioctl(fd, SIOCGIFCONF, &ifc) < 0) {
> - perror("failed at SIOCGIFCONF");
> - close(fd);
> - return -1;
> - }
> -
> - if (ifc.ifc_len == (int)sizeof(struct ifreq) * nr) {
> - /*
> - * current ifc_buf is not big enough to hold
> - * all interfaces; double it and try again.
> - */
> - nr *= 2;
> - goto again;
> - }
> -
> - ifr = (struct ifreq *)ifc.ifc_buf;
> - for (i = 0; i < ifc.ifc_len / sizeof(struct ifreq); i++)
> - send_rarp(ifr[i].ifr_name, rarp);
> -
> - close(fd);
> + rte_smp_wmb();
> + rte_atomic16_set(&dev->broadcast_rarp, 1);
>
> return 0;
> }
> diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.h b/lib/librte_vhost/vhost_user/virtio-net-user.h
> index 559bb46..cefec16 100644
> --- a/lib/librte_vhost/vhost_user/virtio-net-user.h
> +++ b/lib/librte_vhost/vhost_user/virtio-net-user.h
> @@ -54,7 +54,7 @@ void user_set_vring_kick(struct vhost_device_ctx, struct VhostUserMsg *);
> void user_set_protocol_features(struct vhost_device_ctx ctx,
> uint64_t protocol_features);
> int user_set_log_base(struct vhost_device_ctx ctx, struct VhostUserMsg *);
> -int user_send_rarp(struct VhostUserMsg *);
> +int user_send_rarp(struct vhost_device_ctx ctx, struct VhostUserMsg *);
>
> int user_get_vring_base(struct vhost_device_ctx, struct vhost_vring_state *);
>
More information about the dev
mailing list