[dpdk-dev] [ovs-dev] [PATCH RFC] dpif-netdev: Add support Intel DPDK based ports.
Thomas Graf
tgraf at redhat.com
Wed Jan 29 11:01:32 CET 2014
On 01/28/2014 02:48 AM, pshelar at nicira.com wrote:
> From: Pravin B Shelar <pshelar at nicira.com>
>
> Following patch adds DPDK netdev-class to userspace datapath.
> Approach taken in this patch differs from Intel® DPDK vSwitch
> where DPDK datapath switching is done in saparate process. This
> patch adds support for DPDK type port and uses OVS userspace
> datapath for switching. Therefore all DPDK processing and flow
> miss handling is done in single process. This also avoids code
> duplication by reusing OVS userspace datapath switching and
> therefore it supports all flow matching and actions that
> user-space datapath supports. Refer to INSTALL.DPDK doc for
> further info.
>
> With this patch I got similar performance for netperf TCP_STREAM
> tests compared to kernel datapath.
>
> This is based a patch from Gerald Rogers.
>
> Signed-off-by: Pravin B Shelar <pshelar at nicira.com>
> CC: "Gerald Rogers" <gerald.rogers at intel.com>
Pravin,
Some initial comments below. I will provide more after deeper
digging.
Do you have any ideas on how to implement the TX batching yet?
> +
> +static int
> +netdev_dpdk_rx_drain(struct netdev_rx *rx_)
> +{
> + struct netdev_rx_dpdk *rx = netdev_rx_dpdk_cast(rx_);
> + int pending;
> + int i;
> +
> + pending = rx->ofpbuf_cnt;
> + if (pending) {
This conditional seems unneeded.
> + for (i = 0; i < pending; i++) {
> + build_ofpbuf(rx, &rx->ofpbuf[i], NULL);
> + }
> + rx->ofpbuf_cnt = 0;
> + return 0;
> + }
> +
> + return 0;
> +}
> +
> +/* Tx function. Transmit packets indefinitely */
> +static int
> +dpdk_do_tx_copy(struct netdev *netdev, char *buf, int size)
> +{
> + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> + struct rte_mbuf *pkt;
> + uint32_t nb_tx = 0;
> +
> + pkt = rte_pktmbuf_alloc(dev->dpdk_mp->mp);
> + if (!pkt) {
> + return 0;
Silent drop? ;-) Shouldn't these drops be accounted for somehow?
> + }
> +
> + /* We have to do a copy for now */
> + memcpy(pkt->pkt.data, buf, size);
> +
> + rte_pktmbuf_data_len(pkt) = size;
> + rte_pktmbuf_pkt_len(pkt) = size;
> +
> + rte_spinlock_lock(&dev->tx_lock);
What is the purpose of tx_lock here? Multiple threads writing to
the same Q? The lock is not acquired for the zerocopy path below.
> + nb_tx = rte_eth_tx_burst(dev->port_id, NR_QUEUE, &pkt, 1);
> + rte_spinlock_unlock(&dev->tx_lock);
> +
> + if (nb_tx != 1) {
> + /* free buffers if we couldn't transmit packets */
> + rte_mempool_put_bulk(dev->dpdk_mp->mp, (void **)&pkt, 1);
> + }
> + return nb_tx;
> +}
> +
> +static int
> +netdev_dpdk_send(struct netdev *netdev,
> + struct ofpbuf *ofpbuf, bool may_steal)
> +{
> + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +
> + if (ofpbuf->size > dev->max_packet_len) {
> + VLOG_ERR("2big size %d max_packet_len %d",
> + (int)ofpbuf->size , dev->max_packet_len);
Should probably use VLOG_RATE_LIMIT_INIT
> + return E2BIG;
> + }
> +
> + rte_prefetch0(&ofpbuf->private_p);
> + if (!may_steal ||
> + !ofpbuf->private_p || ofpbuf->source != OFPBUF_DPDK) {
> + dpdk_do_tx_copy(netdev, (char *) ofpbuf->data, ofpbuf->size);
> + } else {
> + struct rte_mbuf *pkt;
> + uint32_t nb_tx;
> + int qid;
> +
> + pkt = ofpbuf->private_p;
> + ofpbuf->private_p = NULL;
> + rte_pktmbuf_data_len(pkt) = ofpbuf->size;
> + rte_pktmbuf_pkt_len(pkt) = ofpbuf->size;
> +
> + /* TODO: TX batching. */
> + qid = rte_lcore_id() % NR_QUEUE;
> + nb_tx = rte_eth_tx_burst(dev->port_id, qid, &pkt, 1);
> + if (nb_tx != 1) {
> + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +
> + rte_mempool_put_bulk(dev->dpdk_mp->mp, (void **)&pkt, 1);
> + VLOG_ERR("TX error, zero packets sent");
Same here
> + }
> + }
> + return 0;
> +}
> +static int
> +netdev_dpdk_set_mtu(const struct netdev *netdev, int mtu)
> +{
> + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> + int old_mtu, err;
> + struct dpdk_mp *old_mp;
> + struct dpdk_mp *mp;
> +
> + ovs_mutex_lock(&dpdk_mutex);
> + ovs_mutex_lock(&dev->mutex);
> + if (dev->mtu == mtu) {
> + err = 0;
> + goto out;
> + }
> +
> + mp = dpdk_mp_get(dev->socket_id, dev->mtu);
> + if (!mp) {
> + err = ENOMEM;
> + goto out;
> + }
> +
> + rte_eth_dev_stop(dev->port_id);
> +
> + old_mtu = dev->mtu;
> + old_mp = dev->dpdk_mp;
> + dev->dpdk_mp = mp;
> + dev->mtu = mtu;
> + dev->max_packet_len = MTU_TO_MAX_LEN(dev->mtu);
> +
> + err = dpdk_eth_dev_init(dev);
> + if (err) {
> +
> + dpdk_mp_put(mp);
> + dev->mtu = old_mtu;
> + dev->dpdk_mp = old_mp;
> + dev->max_packet_len = MTU_TO_MAX_LEN(dev->mtu);
> + dpdk_eth_dev_init(dev);
Would be nice if we don't need these constructs and DPDK would
provide an all or nothing init method.
> + goto out;
> + }
> +
> + dpdk_mp_put(old_mp);
> +out:
> + ovs_mutex_unlock(&dev->mutex);
> + ovs_mutex_unlock(&dpdk_mutex);
> + return err;
> +}
> +
> +static int
> +netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
> + enum netdev_flags off, enum netdev_flags on,
> + enum netdev_flags *old_flagsp)
> + OVS_REQUIRES(dev->mutex)
> +{
> + int err;
> +
> + if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
> + return EINVAL;
> + }
> +
> + *old_flagsp = dev->flags;
> + dev->flags |= on;
> + dev->flags &= ~off;
> +
> + if (dev->flags == *old_flagsp) {
> + return 0;
> + }
> +
> + rte_eth_dev_stop(dev->port_id);
> +
> + if (dev->flags & NETDEV_UP) {
> + err = rte_eth_dev_start(dev->port_id);
> + if (err)
> + return err;
> + }
I'm not a DPDK expert but is it required to restart the device
to change promisc settings or could we conditionally start and
stop based on the previous flags state?
> +
> + if (dev->flags & NETDEV_PROMISC) {
> + rte_eth_promiscuous_enable(dev->port_id);
> + rte_eth_allmulticast_enable(dev->port_id);
> + }
> +
> + return 0;
> +}
>
> +
> +static void
> +netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
> + const char *argv[], void *aux OVS_UNUSED)
> +{
> + bool up;
> +
> + if (!strcasecmp(argv[argc - 1], "up")) {
> + up = true;
> + } else if ( !strcasecmp(argv[argc - 1], "down")) {
> + up = false;
> + } else {
> + unixctl_command_reply_error(conn, "Invalid Admin State");
> + return;
> + }
> +
> + if (argc > 2) {
> + struct netdev *netdev = netdev_from_name(argv[1]);
For future refinement: Usability would be increased if either a
strict one interface argument is enforced or multiple interface
names could be passed in, e.g. set-admin-state dpdk0 dpdk1 up
or set-admin-state dpdk0 up dpdk1 up
As of now, dpdk1 is silently ignored which is not nice.
> + if (netdev && is_dpdk_class(netdev->netdev_class)) {
> + struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev);
> +
> + ovs_mutex_lock(&dpdk_dev->mutex);
> + netdev_dpdk_set_admin_state__(dpdk_dev, up);
> + ovs_mutex_unlock(&dpdk_dev->mutex);
> +
> + netdev_close(netdev);
> + } else {
> + unixctl_command_reply_error(conn, "Unknown Dummy Interface");
I think this should read "Not a DPDK Interface" or something similar.
> + netdev_close(netdev);
> + return;
> + }
> + } else {
> + struct netdev_dpdk *netdev;
> +
> + ovs_mutex_lock(&dpdk_mutex);
> + LIST_FOR_EACH (netdev, list_node, &dpdk_list) {
> + ovs_mutex_lock(&netdev->mutex);
> + netdev_dpdk_set_admin_state__(netdev, up);
> + ovs_mutex_unlock(&netdev->mutex);
> + }
> + ovs_mutex_unlock(&dpdk_mutex);
> + }
> + unixctl_command_reply(conn, "OK");
> +}
> +
> +
> - retval = rx->netdev->netdev_class->rx_recv(rx, buffer);
> - if (!retval) {
> - COVERAGE_INC(netdev_received);
Are you removing the netdev_receive counter on purpose here?
More information about the dev
mailing list