[dpdk-dev] [ovs-dev] [PATCH RFC] dpif-netdev: Add support Intel DPDK based ports.

Thomas Graf tgraf at redhat.com
Wed Jan 29 11:01:32 CET 2014


On 01/28/2014 02:48 AM, pshelar at nicira.com wrote:
> From: Pravin B Shelar <pshelar at nicira.com>
>
> Following patch adds DPDK netdev-class to userspace datapath.
> Approach taken in this patch differs from Intel® DPDK vSwitch
> where DPDK datapath switching is done in saparate process.  This
> patch adds support for DPDK type port and uses OVS userspace
> datapath for switching.  Therefore all DPDK processing and flow
> miss handling is done in single process.  This also avoids code
> duplication by reusing OVS userspace datapath switching and
> therefore it supports all flow matching and actions that
> user-space datapath supports.  Refer to INSTALL.DPDK doc for
> further info.
>
> With this patch I got similar performance for netperf TCP_STREAM
> tests compared to kernel datapath.
>
> This is based a patch from Gerald Rogers.
>
> Signed-off-by: Pravin B Shelar <pshelar at nicira.com>
> CC: "Gerald Rogers" <gerald.rogers at intel.com>

Pravin,

Some initial comments below. I will provide more after deeper
digging.

Do you have any ideas on how to implement the TX batching yet?

> +
> +static int
> +netdev_dpdk_rx_drain(struct netdev_rx *rx_)
> +{
> +    struct netdev_rx_dpdk *rx = netdev_rx_dpdk_cast(rx_);
> +    int pending;
> +    int i;
> +
> +    pending = rx->ofpbuf_cnt;
> +    if (pending) {

This conditional seems unneeded.

> +        for (i = 0; i < pending; i++) {
> +             build_ofpbuf(rx, &rx->ofpbuf[i], NULL);
> +        }
> +        rx->ofpbuf_cnt = 0;
> +        return 0;
> +    }
> +
> +    return 0;
> +}
> +
> +/* Tx function. Transmit packets indefinitely */
> +static int
> +dpdk_do_tx_copy(struct netdev *netdev, char *buf, int size)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +    struct rte_mbuf *pkt;
> +    uint32_t nb_tx = 0;
> +
> +    pkt = rte_pktmbuf_alloc(dev->dpdk_mp->mp);
> +    if (!pkt) {
> +        return 0;

Silent drop? ;-) Shouldn't these drops be accounted for somehow?

> +    }
> +
> +    /* We have to do a copy for now */
> +    memcpy(pkt->pkt.data, buf, size);
> +
> +    rte_pktmbuf_data_len(pkt) = size;
> +    rte_pktmbuf_pkt_len(pkt) = size;
> +
> +    rte_spinlock_lock(&dev->tx_lock);

What is the purpose of tx_lock here? Multiple threads writing to
the same Q? The lock is not acquired for the zerocopy path below.

> +    nb_tx = rte_eth_tx_burst(dev->port_id, NR_QUEUE, &pkt, 1);
> +    rte_spinlock_unlock(&dev->tx_lock);
> +
> +    if (nb_tx != 1) {
> +        /* free buffers if we couldn't transmit packets */
> +        rte_mempool_put_bulk(dev->dpdk_mp->mp, (void **)&pkt, 1);
> +    }
> +    return nb_tx;
> +}
> +
> +static int
> +netdev_dpdk_send(struct netdev *netdev,
> +                 struct ofpbuf *ofpbuf, bool may_steal)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +
> +    if (ofpbuf->size > dev->max_packet_len) {
> +        VLOG_ERR("2big size %d max_packet_len %d",
> +                  (int)ofpbuf->size , dev->max_packet_len);

Should probably use VLOG_RATE_LIMIT_INIT

> +        return E2BIG;
> +    }
> +
> +    rte_prefetch0(&ofpbuf->private_p);
> +    if (!may_steal ||
> +        !ofpbuf->private_p || ofpbuf->source != OFPBUF_DPDK) {
> +        dpdk_do_tx_copy(netdev, (char *) ofpbuf->data, ofpbuf->size);
> +    } else {
> +        struct rte_mbuf *pkt;
> +        uint32_t nb_tx;
> +        int qid;
> +
> +        pkt = ofpbuf->private_p;
> +        ofpbuf->private_p = NULL;
> +        rte_pktmbuf_data_len(pkt) = ofpbuf->size;
> +        rte_pktmbuf_pkt_len(pkt) = ofpbuf->size;
> +
> +        /* TODO: TX batching. */
> +        qid = rte_lcore_id() % NR_QUEUE;
> +        nb_tx = rte_eth_tx_burst(dev->port_id, qid, &pkt, 1);
> +        if (nb_tx != 1) {
> +            struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +
> +            rte_mempool_put_bulk(dev->dpdk_mp->mp, (void **)&pkt, 1);
> +            VLOG_ERR("TX error, zero packets sent");

Same here

> +       }
> +    }
> +    return 0;
> +}

> +static int
> +netdev_dpdk_set_mtu(const struct netdev *netdev, int mtu)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +    int old_mtu, err;
> +    struct dpdk_mp *old_mp;
> +    struct dpdk_mp *mp;
> +
> +    ovs_mutex_lock(&dpdk_mutex);
> +    ovs_mutex_lock(&dev->mutex);
> +    if (dev->mtu == mtu) {
> +        err = 0;
> +        goto out;
> +    }
> +
> +    mp = dpdk_mp_get(dev->socket_id, dev->mtu);
> +    if (!mp) {
> +        err = ENOMEM;
> +        goto out;
> +    }
> +
> +    rte_eth_dev_stop(dev->port_id);
> +
> +    old_mtu = dev->mtu;
> +    old_mp = dev->dpdk_mp;
> +    dev->dpdk_mp = mp;
> +    dev->mtu = mtu;
> +    dev->max_packet_len = MTU_TO_MAX_LEN(dev->mtu);
> +
> +    err = dpdk_eth_dev_init(dev);
> +    if (err) {
> +
> +        dpdk_mp_put(mp);
> +        dev->mtu = old_mtu;
> +        dev->dpdk_mp = old_mp;
> +        dev->max_packet_len = MTU_TO_MAX_LEN(dev->mtu);
> +        dpdk_eth_dev_init(dev);

Would be nice if we don't need these constructs and DPDK would
provide an all or nothing init method.

> +        goto out;
> +    }
> +
> +    dpdk_mp_put(old_mp);
> +out:
> +    ovs_mutex_unlock(&dev->mutex);
> +    ovs_mutex_unlock(&dpdk_mutex);
> +    return err;
> +}
> +

> +static int
> +netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
> +                           enum netdev_flags off, enum netdev_flags on,
> +                           enum netdev_flags *old_flagsp)
> +    OVS_REQUIRES(dev->mutex)
> +{
> +    int err;
> +
> +    if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
> +        return EINVAL;
> +    }
> +
> +    *old_flagsp = dev->flags;
> +    dev->flags |= on;
> +    dev->flags &= ~off;
> +
> +    if (dev->flags == *old_flagsp) {
> +        return 0;
> +    }
> +
> +    rte_eth_dev_stop(dev->port_id);
> +
> +    if (dev->flags & NETDEV_UP) {
> +        err = rte_eth_dev_start(dev->port_id);
> +        if (err)
> +            return err;
> +    }

I'm not a DPDK expert but is it required to restart the device
to change promisc settings or could we conditionally start and
stop based on the previous flags state?

> +
> +    if (dev->flags & NETDEV_PROMISC) {
> +        rte_eth_promiscuous_enable(dev->port_id);
> +        rte_eth_allmulticast_enable(dev->port_id);
> +    }
> +
> +    return 0;
> +}
>
> +
> +static void
> +netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
> +                            const char *argv[], void *aux OVS_UNUSED)
> +{
> +    bool up;
> +
> +    if (!strcasecmp(argv[argc - 1], "up")) {
> +        up = true;
> +    } else if ( !strcasecmp(argv[argc - 1], "down")) {
> +        up = false;
> +    } else {
> +        unixctl_command_reply_error(conn, "Invalid Admin State");
> +        return;
> +    }
> +
> +    if (argc > 2) {
> +        struct netdev *netdev = netdev_from_name(argv[1]);

For future refinement: Usability would be increased if either a
strict one interface argument is enforced or multiple interface
names could be passed in, e.g. set-admin-state dpdk0 dpdk1 up
or set-admin-state dpdk0 up dpdk1 up

As of now, dpdk1 is silently ignored which is not nice.

> +        if (netdev && is_dpdk_class(netdev->netdev_class)) {
> +            struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev);
> +
> +            ovs_mutex_lock(&dpdk_dev->mutex);
> +            netdev_dpdk_set_admin_state__(dpdk_dev, up);
> +            ovs_mutex_unlock(&dpdk_dev->mutex);
> +
> +            netdev_close(netdev);
> +        } else {
> +            unixctl_command_reply_error(conn, "Unknown Dummy Interface");

I think this should read "Not a DPDK Interface" or something similar.


> +            netdev_close(netdev);
> +            return;
> +        }
> +    } else {
> +        struct netdev_dpdk *netdev;
> +
> +        ovs_mutex_lock(&dpdk_mutex);
> +        LIST_FOR_EACH (netdev, list_node, &dpdk_list) {
> +            ovs_mutex_lock(&netdev->mutex);
> +            netdev_dpdk_set_admin_state__(netdev, up);
> +            ovs_mutex_unlock(&netdev->mutex);
> +        }
> +        ovs_mutex_unlock(&dpdk_mutex);
> +    }
> +    unixctl_command_reply(conn, "OK");
> +}
> +
> +
> -    retval = rx->netdev->netdev_class->rx_recv(rx, buffer);
> -    if (!retval) {
> -        COVERAGE_INC(netdev_received);

Are you removing the netdev_receive counter on purpose here?


More information about the dev mailing list