[dpdk-dev] [PATCH v4] drivers/net:new PMD using tun/tap host interface
Wiles, Keith
keith.wiles at intel.com
Tue Oct 11 23:07:35 CEST 2016
Regards,
Keith
> On Oct 11, 2016, at 6:49 AM, Yigit, Ferruh <ferruh.yigit at intel.com> wrote:
>
> On 10/4/2016 3:45 PM, Keith Wiles wrote:
>> The rte_eth_tap.c PMD creates a device using TUN/TAP interfaces
>> on the local host. The PMD allows for DPDK and the host to
>> communicate using a raw device interface on the host and in
>> the DPDK application. The device created is a Tap device with
>> a L2 packet header.
>>
Will try to ship out a v5 soon.
>> v4 - merge with latest driver changes
>> v3 - fix includes by removing ifdef for other type besides Linux.
>> Fix the copyright notice in the Makefile
>> v2 - merge all of the patches into one patch.
>> Fix a typo on naming the tap device.
>> Update the maintainers list
>>
>> Signed-off-by: Keith Wiles <keith.wiles at intel.com>
>> ---
>> MAINTAINERS | 5 +
>> config/common_linuxapp | 2 +
>> doc/guides/nics/tap.rst | 84 ++++
>> drivers/net/Makefile | 1 +
>> drivers/net/tap/Makefile | 57 +++
>> drivers/net/tap/rte_eth_tap.c | 866 ++++++++++++++++++++++++++++++++
>> drivers/net/tap/rte_pmd_tap_version.map | 4 +
>> mk/rte.app.mk | 1 +
>> 8 files changed, 1020 insertions(+)
>> create mode 100644 doc/guides/nics/tap.rst
>> create mode 100644 drivers/net/tap/Makefile
>> create mode 100644 drivers/net/tap/rte_eth_tap.c
>> create mode 100644 drivers/net/tap/rte_pmd_tap_version.map
>>
> <>
>> diff --git a/config/common_linuxapp b/config/common_linuxapp
>> index 2483dfa..59a2053 100644
>> --- a/config/common_linuxapp
>> +++ b/config/common_linuxapp
>> @@ -44,3 +44,5 @@ CONFIG_RTE_LIBRTE_PMD_VHOST=y
>> CONFIG_RTE_LIBRTE_PMD_AF_PACKET=y
>> CONFIG_RTE_LIBRTE_POWER=y
>> CONFIG_RTE_VIRTIO_USER=y
>> +CONFIG_RTE_LIBRTE_PMD_TAP=y
>
> According existing config items, a default value of a config option
> should go to config/common_base, and environment specific config file
> overwrites it if required.
> So this option needs to be added into config/common_base too as disabled
> by default.
Add the define to common_base as no, plus a comment for Linux only.
>
>> +CONFIG_RTE_PMD_TAP_MAX_QUEUES=32
Moved this to the .c file as a define.
>
> Is the number of max queues really needs to be a config option, I assume
> in normal use case user won't need to update this and will use single
> queue, if that is true what about pushing this into source code to not
> make config file more complex?
>
>> diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst
>
> <...>
>
>> +.. code-block:: console
>> +
>> + The interfaced name can be changed by adding the iface=foo0
>> + e.g. --vedv=eth_tap,iface=foo0 --vdev=eth_tap,iface=foo1, ...
>
> s/vedv/vdev
> eth_tap needs to be net_tap as part of unifying device names work
Fixed.
>
> <...>
>
>> diff --git a/drivers/net/Makefile b/drivers/net/Makefile
>> index bc93230..b4afa98 100644
>> --- a/drivers/net/Makefile
>> +++ b/drivers/net/Makefile
>> @@ -55,6 +55,7 @@ DIRS-$(CONFIG_RTE_LIBRTE_THUNDERX_NICVF_PMD) += thunderx
>> DIRS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio
>> DIRS-$(CONFIG_RTE_LIBRTE_VMXNET3_PMD) += vmxnet3
>> DIRS-$(CONFIG_RTE_LIBRTE_PMD_XENVIRT) += xenvirt
>> +DIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap
>
> Rest of the PMDs sorted alphabetically, please do same.
Done.
>
>>
>> ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y)
>> DIRS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += vhost
>
> <...>
>
>> diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
>
> <...>
>
>> +
>> +static const char *drivername = "Tap PMD";
>> +static int tap_unit = 0;
>
> No need to initialize to zero.
Fixed
>
> <...>
>
>> +
>> +struct pmd_internals {
>> + char name[RTE_ETH_NAME_MAX_LEN]; /* Internal Tap device name */
>> + uint16_t nb_queues; /* Number of queues supported */
>> + uint16_t pad0;
>
> Why this padding? Is it reserved?
Removed pad0. I just like to know about gaps in the structures is the reason.
>
>> + struct ether_addr eth_addr; /* Mac address of the device port */
>> +
>> + int if_index; /* IF_INDEX for the port */
>> + int fds[RTE_PMD_TAP_MAX_QUEUES]; /* List of all file descriptors */
>> +
>> + struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
>> + struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
>> +};
>> +
>> +/*
>> + * Tun/Tap allocation routine
>> + *
>> + * name is the number of the interface to use, unless NULL to take the host
>> + * supplied name.
>> + */
>> +static int
>> +tun_alloc(char * name)
>
> char *name
Fixed.
>
> <...>
>
>> +
>> + /* Always set the fiile descriptor to non-blocking */
>
> s/fiile/file
>
>> + if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
>> + RTE_LOG(ERR, PMD, "Unable to set to nonblocking\n");
>> + perror("F_SETFL, NONBLOCK");
>> + goto error;
>> + }
>> +
>> + /* If the name is different that new name as default */
>> + if (name && strcmp(name, ifr.ifr_name))
>> + strcpy(name, ifr.ifr_name);
> What about more secure copy?
Changed to be more secure.
>
>> +
>> + return fd;
>> +
>> +error:
>> + if (fd > 0)
>> + close(fd);
>> + return -1;
>> +}
>> +
>
> <...>
>
>> +
>> +static void
>> +tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
>> +{
>> + struct pmd_internals *internals = dev->data->dev_private;
>> +
>> + dev_info->driver_name = drivername;
>> + dev_info->if_index = internals->if_index;
>> + dev_info->max_mac_addrs = 1;
>> + dev_info->max_rx_pktlen = (uint32_t)ETHER_MAX_VLAN_FRAME_LEN;
>> + dev_info->max_rx_queues = (uint16_t)internals->nb_queues;
>> + dev_info->max_tx_queues = (uint16_t)internals->nb_queues;
> casting to uint16_t is not requires, it is already uint16_t.
Removed
>
>> + dev_info->min_rx_bufsize = 0;
>> + dev_info->pci_dev = NULL;
>> +}
>> +
>> +static void
>> +tap_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
> igb_stats?
>
>> +{
>> + unsigned i, imax;
>> + unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
>> + unsigned long rx_bytes_total = 0, tx_bytes_total = 0;
>> + const struct pmd_internals *internal = dev->data->dev_private;
>> +
>> + imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
>> + internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
>> +
>> + for (i = 0; i < imax; i++) {
>> + igb_stats->q_ipackets[i] = internal->rxq[i].stats.ipackets;
>> + igb_stats->q_ibytes[i] = internal->rxq[i].stats.ibytes;
>> + rx_total += igb_stats->q_ipackets[i];
>> + rx_bytes_total += igb_stats->q_ibytes[i];
>> + }
>> +
>> + imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
>> + internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
> Do we need to duplicate imax calculation?
Removed
>
>
>> +
>> + for (i = 0; i < imax; i++) {
>> + igb_stats->q_opackets[i] = internal->txq[i].stats.opackets;
>> + igb_stats->q_errors[i] = internal->txq[i].stats.errs;
>> + igb_stats->q_obytes[i] = internal->txq[i].stats.obytes;
>> + tx_total += igb_stats->q_opackets[i];
>> + tx_err_total += igb_stats->q_errors[i];
>> + tx_bytes_total += igb_stats->q_obytes[i];
>> + }
>> +
>> + igb_stats->ipackets = rx_total;
>> + igb_stats->ibytes = rx_bytes_total;
>> + igb_stats->opackets = tx_total;
>> + igb_stats->oerrors = tx_err_total;
>> + igb_stats->obytes = tx_bytes_total;
>> +}
>> +
>
> <...>
>
>> +
>> +static int
>> +rte_eth_dev_create(const char *name,
>> + struct rte_eth_dev **eth_dev,
>> + const struct eth_dev_ops *dev_ops,
>> + void **internals, size_t internal_size,
>> + uint16_t flag)
>> +{
>> + char buff[RTE_ETH_NAME_MAX_LEN];
>> + int numa_node = rte_socket_id();
>> + struct rte_eth_dev *dev = NULL;
>> + struct rte_eth_dev_data *data = NULL;
>> + void *priv = NULL;
>> +
>> + if ((name == NULL) || (eth_dev == NULL) || (dev_ops == NULL) ||
>> + (internals == NULL) || (internal_size == 0)) {
>> + RTE_PMD_DEBUG_TRACE("Paramters are invalid\n");
>> + return -1;
>> + }
>> +
>> + dev = rte_eth_dev_allocate(name);
>> + if (dev == NULL) {
>> + RTE_PMD_DEBUG_TRACE("%s: rte_eth_dev_allocate failed for %s\n",
>> + name, buff);
>> + goto error;
>> + }
>> +
>> + if (flag & RTE_USE_PRIVATE_DATA) {
>
> You may need to save this flag value somewhere in internals, to decide
> how to free data later.
Let me look into this one more and see if it is required at all.
>
>> + /*
>> + * now do all data allocation - for eth_dev structure, dummy
>> + * pci driver and internal (private) data
>> + */
>> + snprintf(buff, sizeof(buff), "D-%s-%d", name, numa_node);
>> + data = rte_zmalloc_socket(buff, sizeof(struct rte_eth_dev_data),
>> + 0, numa_node);
>> + if (data == NULL) {
>> + RTE_PMD_DEBUG_TRACE("%s: Unable to allocate memory\n",
>> + name);
>> + goto error;
>> + }
>> + /* move the current state of the structure to the new one */
>> + rte_memcpy(data, dev->data, sizeof(struct rte_eth_dev_data));
> Why do we need to copy, trying to preserve which data?
>
>> + dev->data = data; /* Override the current data pointer */
>> + } else
>> + data = dev->data;
>> +
>> + snprintf(buff, sizeof(buff), "I-%s-%d", name, numa_node);
>> + priv = rte_zmalloc_socket(buff, internal_size, 0, numa_node);
>> + if (priv == NULL) {
>> + RTE_PMD_DEBUG_TRACE("Unable to allocate internal memory %lu\n",
>> + internal_size);
>> + goto error;
>> + }
>> +
>> + /* Setup some default values */
>> + dev->dev_ops = dev_ops;
>> + data->dev_private = priv;
>
>> + data->port_id = dev->data->port_id;
>> + memmove(data->name, dev->data->name, strlen(dev->data->name));
> These two assignments are useless, needs to be done before "dev->data =
> data" assignment.
Reworked this code area to remove it.
>
>> +
>> + dev->driver = NULL;
>> + data->dev_flags = RTE_ETH_DEV_DETACHABLE;
>> + data->kdrv = RTE_KDRV_NONE;
>> + data->numa_node = numa_node;
>> +
>> + *eth_dev = dev;
>> + *internals = priv;
>> +
>> + return 0;
>> +error:
>> + rte_free(priv);
>> +
>> + if (flag & RTE_USE_PRIVATE_DATA)
>> + rte_free(data);
>> +
>> + rte_eth_dev_release_port(dev);
>> +
>> + return -1;
>> +}
>> +
>> +static int
>> +pmd_init_internals(const char *name, struct tap_info *tap,
>> + struct pmd_internals **internals,
>> + struct rte_eth_dev **eth_dev)
>> +{
>> + struct rte_eth_dev *dev = NULL;
>> + struct pmd_internals *internal = NULL;
>> + struct rte_eth_dev_data *data = NULL;
>> + int ret, i, fd = -1;
>> +
>> + RTE_LOG(INFO, PMD,
>> + "%s: Create TUN/TAP Ethernet device with %d queues on numa %u\n",
>> + name, RTE_PMD_TAP_MAX_QUEUES, rte_socket_id());
>> +
>> + pmd_link.link_speed = tap->speed;
>> +
>> + ret = rte_eth_dev_create(tap->name, &dev, &ops,
>> + (void **)&internal, sizeof(struct pmd_internals),
> Why rte_eth_dev_create() get "void **internals" which requires casting,
> but not "struct pmd_internals **internals” ?
Fixed.
>
>> + RTE_USE_PRIVATE_DATA);
>> + if (ret < 0)
>> + return -1;
>> +
>> + strncpy(internal->name, tap->name, sizeof(internal->name));
>> +
>> + internal->nb_queues = RTE_PMD_TAP_MAX_QUEUES;
>> +
>> + /* Create the first Tap device */
>> + if ((fd = tun_alloc(dev->data->name)) < 0) {
>> + RTE_LOG(ERR, PMD, "tun_alloc(%s) failed\n", dev->data->name);
>> + rte_free(internal);
> rte_free(dev->data); ?
> But needs to check RTE_USE_PRIVATE_DATA ..
See above
>
>> + rte_eth_dev_release_port(dev);
>> + return -1;
>> + }
>> +
>> + /* Presetup the fds to -1 as being not working */
>> + for(i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
>> + internal->fds[i] = -1;
>> + internal->rxq[i].fd = -1;
>> + internal->txq[i].fd = -1;
>> + }
>> +
>> + /* Take the TUN/TAP fd and place in the first location */
>> + internal->rxq[0].fd = fd;
>> + internal->txq[0].fd = fd;
>> + internal->fds[0] = fd;
>> +
>> + if (pmd_mac_address(fd, dev, &internal->eth_addr) < 0) {
>> + rte_free(internal);
> rte_free(dev->data); ?
Yes Added.
>
>> + rte_eth_dev_release_port(dev);
>> + return -1;
>> + }
>> +
>> + data = dev->data;
>> +
>> + data->dev_link = pmd_link;
>> + data->mac_addrs = &internal->eth_addr;
>> +
>> + data->nb_rx_queues = (uint16_t)internal->nb_queues;
>> + data->nb_tx_queues = (uint16_t)internal->nb_queues;
> no cast required.
Removed
>
>> + data->drv_name = drivername;
>> +
>> + *eth_dev = dev;
>> + *internals = internal;
>> +
>> + return 0;
>> +}
>> +
>
> <...>
>
>> +
>> +static int
>> +set_interface_speed(const char *key __rte_unused,
>> + const char *value,
>> + void *extra_args __rte_unused)
> need to drop __rte_unused for extra_args
>
>> +{
>> + struct tap_info *tap = (struct tap_info *)extra_args;
>> +
>> + pmd_link.link_speed = (value) ? atoi(value) : ETH_SPEED_NUM_10G;
>> + tap->speed = pmd_link.link_speed;
>> +
>> + return 0;
>> +}
>> +
>> +/*
>> + * Open a TAP interface device.
>> + */
>> +static int
>> +rte_pmd_tap_devinit(const char *name, const char *params)
>> +{
>> + int ret = 0;
>> + struct rte_kvargs *kvlist;
>> + struct tap_info tap_info;
>> +
>> + /* Setup default values */
>> + memset(&tap_info, 0, sizeof(tap_info));
>> +
>> + tap_info.speed = ETH_SPEED_NUM_10G;
>> + snprintf(tap_info.name, sizeof(tap_info.name), "dtap%d", tap_unit++);
> What about extracting iface name "dtap" into a macro to make it more
> visible.
Created macro for the default name.
>
>> +
>> + if ((params == NULL) || (params[0] == '\0')) {
>> + RTE_LOG(INFO, PMD, "Initializing pmd_tap for %s\n", name);
>> +
>> + ret = eth_dev_tap_create(name, &tap_info);
> This "name" is not used at all (except from RTE_LOG), instead tap->name
> is used for interface name, so why carying this variable around?
Fixed and changed the API.
>
>> + goto leave;
>> + }
>> +
>> + RTE_LOG(INFO, PMD, "Initialize %s with params (%s)\n", name, params);
>> +
>> + kvlist = rte_kvargs_parse(params, valid_arguments);
>> + if (!kvlist) {
>> + ret = eth_dev_tap_create(name, &tap_info);
>> + goto leave;
>> + }
>> +
>> + if (rte_kvargs_count(kvlist, ETH_TAP_SPEED_ARG) == 1) {
>> + ret = rte_kvargs_process(kvlist, ETH_TAP_SPEED_ARG,
>> + &set_interface_speed, &tap_info);
>> + if (ret < 0)
>> + goto leave;
>> + } else
>> + set_interface_speed(NULL, NULL, &tap_info);
> This call is redundant, tap_info already has default speed value set.
Removed
>
>> +
>> + if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) {
>> + ret = rte_kvargs_process(kvlist, ETH_TAP_IFACE_ARG,
>> + &set_interface_name, &tap_info);
>> + if (ret < 0)
>> + goto leave;
>> + } else
>> + set_interface_name(NULL, NULL, (void *)&tap_info);
> tap_info->name already set to default value (dtap%d), this call is not
> required.
Removed
>
>> +
>> + rte_kvargs_free(kvlist);
>> +
>> +leave:
>> + if (ret == -1)
>> + RTE_LOG(INFO, PMD, "Failed to create pmd_tap for %s\n", name);
>> +
>> + return ret;
>> +}
>> +
>> +/*
>> + * detach a TAP device.
>> + */
>> +static int
>> +rte_pmd_tap_devuninit(const char *name)
>> +{
>> + struct rte_eth_dev *eth_dev = NULL;
>> + struct pmd_internals *internals;
>> + int i;
>> +
>> + RTE_LOG(INFO, PMD, "Closing TUN/TAP Ethernet device on numa %u\n",
>> + rte_socket_id());
>> +
>> + if (name == NULL)
> This check is redundant, eal layer won't call this function with "name
> == NULL”
Removed
>
>> + return 0;
>> +
>> + /* find the ethdev entry */
>> + eth_dev = rte_eth_dev_allocated(name);
>> + if (eth_dev == NULL)
>> + return 0;
>> +
>> + internals = eth_dev->data->dev_private;
>> + for (i = 0; i < internals->nb_queues; i++)
>> + if (internals->fds[i] != -1)
>> + close(internals->fds[i]);
>> +
>> + rte_free(eth_dev->data->dev_private);
>> + rte_free(eth_dev->data);
> data can be shared?
> Don't we need a RTE_USE_PRIVATE_DATA flag check?
>
>> +
>> + rte_eth_dev_release_port(eth_dev);
>> +
>> + return 0;
>> +}
>> +
>> +static struct rte_vdev_driver pmd_tap_drv = {
>> + .init = rte_pmd_tap_devinit,
>> + .uninit = rte_pmd_tap_devuninit,
>> +};
>> +
>> +DRIVER_REGISTER_VDEV(eth_tap, pmd_tap_drv);
> name convention is now: “net_tap"
Fixed
>
>> +DRIVER_REGISTER_PARAM_STRING(eth_tap,
>> + "iface=<string>,speed=N");
>> diff --git a/drivers/net/tap/rte_pmd_tap_version.map b/drivers/net/tap/rte_pmd_tap_version.map
>> new file mode 100644
>> index 0000000..61463bf
>> --- /dev/null
>> +++ b/drivers/net/tap/rte_pmd_tap_version.map
>> @@ -0,0 +1,4 @@
>> +DPDK_16.11 {
>> +
>> + local: *;
>> +};
>> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
>> index 1a0095b..bd1d10f 100644
>> --- a/mk/rte.app.mk
>> +++ b/mk/rte.app.mk
>> @@ -129,6 +129,7 @@ ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y)
>> _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += -lrte_pmd_vhost
>> endif # $(CONFIG_RTE_LIBRTE_VHOST)
>> _LDLIBS-$(CONFIG_RTE_LIBRTE_VMXNET3_PMD) += -lrte_pmd_vmxnet3_uio
>> +_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += -lrte_pmd_tap
> please put in alphebetical order
Done
>
>>
>> ifeq ($(CONFIG_RTE_LIBRTE_CRYPTODEV),y)
>> _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AESNI_MB) += -lrte_pmd_aesni_mb
More information about the dev
mailing list