[dpdk-dev] [PATCH v4] drivers/net:new PMD using tun/tap host interface

Ferruh Yigit ferruh.yigit at intel.com
Tue Oct 11 13:49:51 CEST 2016


On 10/4/2016 3:45 PM, Keith Wiles wrote:
> The rte_eth_tap.c PMD creates a device using TUN/TAP interfaces
> on the local host. The PMD allows for DPDK and the host to
> communicate using a raw device interface on the host and in
> the DPDK application. The device created is a Tap device with
> a L2 packet header.
> 
> v4 - merge with latest driver changes
> v3 - fix includes by removing ifdef for other type besides Linux.
>      Fix the copyright notice in the Makefile
> v2 - merge all of the patches into one patch.
>      Fix a typo on naming the tap device.
>      Update the maintainers list
> 
> Signed-off-by: Keith Wiles <keith.wiles at intel.com>
> ---
>  MAINTAINERS                             |   5 +
>  config/common_linuxapp                  |   2 +
>  doc/guides/nics/tap.rst                 |  84 ++++
>  drivers/net/Makefile                    |   1 +
>  drivers/net/tap/Makefile                |  57 +++
>  drivers/net/tap/rte_eth_tap.c           | 866 ++++++++++++++++++++++++++++++++
>  drivers/net/tap/rte_pmd_tap_version.map |   4 +
>  mk/rte.app.mk                           |   1 +
>  8 files changed, 1020 insertions(+)
>  create mode 100644 doc/guides/nics/tap.rst
>  create mode 100644 drivers/net/tap/Makefile
>  create mode 100644 drivers/net/tap/rte_eth_tap.c
>  create mode 100644 drivers/net/tap/rte_pmd_tap_version.map
> 
<>
> diff --git a/config/common_linuxapp b/config/common_linuxapp
> index 2483dfa..59a2053 100644
> --- a/config/common_linuxapp
> +++ b/config/common_linuxapp
> @@ -44,3 +44,5 @@ CONFIG_RTE_LIBRTE_PMD_VHOST=y
>  CONFIG_RTE_LIBRTE_PMD_AF_PACKET=y
>  CONFIG_RTE_LIBRTE_POWER=y
>  CONFIG_RTE_VIRTIO_USER=y
> +CONFIG_RTE_LIBRTE_PMD_TAP=y

According existing config items, a default value of a config option
should go to config/common_base, and environment specific config file
overwrites it if required.
So this option needs to be added into config/common_base too as disabled
by default.

> +CONFIG_RTE_PMD_TAP_MAX_QUEUES=32

Is the number of max queues really needs to be a config option, I assume
in normal use case user won't need to update this and will use single
queue, if that is true what about pushing this into source code to not
make config file more complex?

> diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst

<...>

> +.. code-block:: console
> +
> +   The interfaced name can be changed by adding the iface=foo0
> +   e.g. --vedv=eth_tap,iface=foo0 --vdev=eth_tap,iface=foo1, ...

s/vedv/vdev
eth_tap needs to be net_tap as part of unifying device names work

<...>

> diff --git a/drivers/net/Makefile b/drivers/net/Makefile
> index bc93230..b4afa98 100644
> --- a/drivers/net/Makefile
> +++ b/drivers/net/Makefile
> @@ -55,6 +55,7 @@ DIRS-$(CONFIG_RTE_LIBRTE_THUNDERX_NICVF_PMD) += thunderx
>  DIRS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio
>  DIRS-$(CONFIG_RTE_LIBRTE_VMXNET3_PMD) += vmxnet3
>  DIRS-$(CONFIG_RTE_LIBRTE_PMD_XENVIRT) += xenvirt
> +DIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap

Rest of the PMDs sorted alphabetically, please do same.

>  
>  ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y)
>  DIRS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += vhost

<...>

> diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c

<...>

> +
> +static const char *drivername = "Tap PMD";
> +static int tap_unit = 0;

No need to initialize to zero.

<...>

> +
> +struct pmd_internals {
> +	char name[RTE_ETH_NAME_MAX_LEN];	/* Internal Tap device name */
> +	uint16_t nb_queues;			/* Number of queues supported */
> +	uint16_t pad0;

Why this padding? Is it reserved?

> +	struct ether_addr eth_addr;	/* Mac address of the device port */
> +
> +	int if_index;			/* IF_INDEX for the port */
> +	int fds[RTE_PMD_TAP_MAX_QUEUES]; /* List of all file descriptors */
> +
> +	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES];	/* List of RX queues */
> +	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES];	/* List of TX queues */
> +};
> +
> +/*
> + * Tun/Tap allocation routine
> + *
> + * name is the number of the interface to use, unless NULL to take the host
> + * supplied name.
> + */
> +static int
> +tun_alloc(char * name)

char *name

<...>

> +
> +	/* Always set the fiile descriptor to non-blocking */

s/fiile/file

> +	if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
> +		RTE_LOG(ERR, PMD, "Unable to set to nonblocking\n");
> +		perror("F_SETFL, NONBLOCK");
> +		goto error;
> +	}
> +
> +	/* If the name is different that new name as default */
> +	if (name && strcmp(name, ifr.ifr_name))
> +		strcpy(name, ifr.ifr_name);
What about more secure copy?

> +
> +	return fd;
> +
> +error:
> +	if (fd > 0)
> +		close(fd);
> +	return -1;
> +}
> +

<...>

> +
> +static void
> +tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
> +{
> +	struct pmd_internals *internals = dev->data->dev_private;
> +
> +	dev_info->driver_name = drivername;
> +	dev_info->if_index = internals->if_index;
> +	dev_info->max_mac_addrs = 1;
> +	dev_info->max_rx_pktlen = (uint32_t)ETHER_MAX_VLAN_FRAME_LEN;
> +	dev_info->max_rx_queues = (uint16_t)internals->nb_queues;
> +	dev_info->max_tx_queues = (uint16_t)internals->nb_queues;
casting to uint16_t is not requires, it is already uint16_t.

> +	dev_info->min_rx_bufsize = 0;
> +	dev_info->pci_dev = NULL;
> +}
> +
> +static void
> +tap_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
igb_stats?

> +{
> +	unsigned i, imax;
> +	unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
> +	unsigned long rx_bytes_total = 0, tx_bytes_total = 0;
> +	const struct pmd_internals *internal = dev->data->dev_private;
> +
> +	imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
> +		internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
> +
> +	for (i = 0; i < imax; i++) {
> +		igb_stats->q_ipackets[i] = internal->rxq[i].stats.ipackets;
> +		igb_stats->q_ibytes[i] = internal->rxq[i].stats.ibytes;
> +		rx_total += igb_stats->q_ipackets[i];
> +		rx_bytes_total += igb_stats->q_ibytes[i];
> +	}
> +
> +	imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
> +		internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
Do we need to duplicate imax calculation?


> +
> +	for (i = 0; i < imax; i++) {
> +		igb_stats->q_opackets[i] = internal->txq[i].stats.opackets;
> +		igb_stats->q_errors[i] = internal->txq[i].stats.errs;
> +		igb_stats->q_obytes[i] = internal->txq[i].stats.obytes;
> +		tx_total += igb_stats->q_opackets[i];
> +		tx_err_total += igb_stats->q_errors[i];
> +		tx_bytes_total += igb_stats->q_obytes[i];
> +	}
> +
> +	igb_stats->ipackets = rx_total;
> +	igb_stats->ibytes = rx_bytes_total;
> +	igb_stats->opackets = tx_total;
> +	igb_stats->oerrors = tx_err_total;
> +	igb_stats->obytes = tx_bytes_total;
> +}
> +

<...>

> +
> +static int
> +rte_eth_dev_create(const char *name,
> +		   struct rte_eth_dev **eth_dev,
> +		   const struct eth_dev_ops *dev_ops,
> +		   void **internals, size_t internal_size,
> +		   uint16_t flag)
> +{
> +	char buff[RTE_ETH_NAME_MAX_LEN];
> +	int numa_node = rte_socket_id();
> +	struct rte_eth_dev *dev = NULL;
> +	struct rte_eth_dev_data *data = NULL;
> +	void *priv = NULL;
> +
> +	if ((name == NULL) || (eth_dev == NULL) || (dev_ops == NULL) ||
> +	    (internals == NULL) || (internal_size == 0)) {
> +		RTE_PMD_DEBUG_TRACE("Paramters are invalid\n");
> +		return -1;
> +	}
> +
> +	dev = rte_eth_dev_allocate(name);
> +	if (dev == NULL) {
> +		RTE_PMD_DEBUG_TRACE("%s: rte_eth_dev_allocate failed for %s\n",
> +				    name, buff);
> +		goto error;
> +	}
> +
> +	if (flag & RTE_USE_PRIVATE_DATA) {

You may need to save this flag value somewhere in internals, to decide
how to free data later.

> +		/*
> +		 * now do all data allocation - for eth_dev structure, dummy
> +		 * pci driver and internal (private) data
> +		 */
> +		snprintf(buff, sizeof(buff), "D-%s-%d", name, numa_node);
> +		data = rte_zmalloc_socket(buff, sizeof(struct rte_eth_dev_data),
> +					  0, numa_node);
> +		if (data == NULL) {
> +			RTE_PMD_DEBUG_TRACE("%s: Unable to allocate memory\n",
> +					    name);
> +			goto error;
> +		}
> +		/* move the current state of the structure to the new one */
> +		rte_memcpy(data, dev->data, sizeof(struct rte_eth_dev_data));
Why do we need to copy, trying to preserve which data?

> +		dev->data = data;	/* Override the current data pointer */
> +	} else
> +		data = dev->data;
> +
> +	snprintf(buff, sizeof(buff), "I-%s-%d", name, numa_node);
> +	priv = rte_zmalloc_socket(buff, internal_size, 0, numa_node);
> +	if (priv == NULL) {
> +		RTE_PMD_DEBUG_TRACE("Unable to allocate internal memory %lu\n",
> +				    internal_size);
> +		goto error;
> +	}
> +
> +	/* Setup some default values */
> +	dev->dev_ops = dev_ops;
> +	data->dev_private = priv;

> +	data->port_id = dev->data->port_id;
> +	memmove(data->name, dev->data->name, strlen(dev->data->name));
These two assignments are useless, needs to be done before "dev->data =
data" assignment.

> +
> +	dev->driver = NULL;
> +	data->dev_flags = RTE_ETH_DEV_DETACHABLE;
> +	data->kdrv = RTE_KDRV_NONE;
> +	data->numa_node = numa_node;
> +
> +	*eth_dev = dev;
> +	*internals = priv;
> +
> +	return 0;
> +error:
> +	rte_free(priv);
> +
> +	if (flag & RTE_USE_PRIVATE_DATA)
> +		rte_free(data);
> +
> +	rte_eth_dev_release_port(dev);
> +
> +	return -1;
> +}
> +
> +static int
> +pmd_init_internals(const char *name, struct tap_info *tap,
> +		   struct pmd_internals **internals,
> +		   struct rte_eth_dev **eth_dev)
> +{
> +	struct rte_eth_dev *dev = NULL;
> +	struct pmd_internals *internal = NULL;
> +	struct rte_eth_dev_data *data = NULL;
> +	int ret, i, fd = -1;
> +
> +	RTE_LOG(INFO, PMD,
> +		"%s: Create TUN/TAP Ethernet device with %d queues on numa %u\n",
> +		name, RTE_PMD_TAP_MAX_QUEUES, rte_socket_id());
> +
> +	pmd_link.link_speed = tap->speed;
> +
> +	ret = rte_eth_dev_create(tap->name, &dev, &ops,
> +				 (void **)&internal, sizeof(struct pmd_internals),
Why rte_eth_dev_create() get "void **internals" which requires casting,
but not "struct pmd_internals **internals" ?

> +				 RTE_USE_PRIVATE_DATA);
> +	if (ret < 0)
> +		return -1;
> +
> +	strncpy(internal->name, tap->name, sizeof(internal->name));
> +
> +	internal->nb_queues = RTE_PMD_TAP_MAX_QUEUES;
> +
> +	/* Create the first Tap device */
> +	if ((fd = tun_alloc(dev->data->name)) < 0) {
> +		RTE_LOG(ERR, PMD, "tun_alloc(%s) failed\n", dev->data->name);
> +		rte_free(internal);
rte_free(dev->data); ?
But needs to check RTE_USE_PRIVATE_DATA ..

> +		rte_eth_dev_release_port(dev);
> +		return -1;
> +	}
> +
> +	/* Presetup the fds to -1 as being not working */
> +	for(i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
> +		internal->fds[i] = -1;
> +		internal->rxq[i].fd = -1;
> +		internal->txq[i].fd = -1;
> +	}
> +
> +	/* Take the TUN/TAP fd and place in the first location */
> +	internal->rxq[0].fd = fd;
> +	internal->txq[0].fd = fd;
> +	internal->fds[0] = fd;
> +
> +	if (pmd_mac_address(fd, dev, &internal->eth_addr) < 0) {
> +		rte_free(internal);
rte_free(dev->data); ?

> +		rte_eth_dev_release_port(dev);
> +		return -1;
> +	}
> +
> +	data = dev->data;
> +
> +	data->dev_link = pmd_link;
> +	data->mac_addrs = &internal->eth_addr;
> +
> +	data->nb_rx_queues = (uint16_t)internal->nb_queues;
> +	data->nb_tx_queues = (uint16_t)internal->nb_queues;
no cast required.

> +	data->drv_name = drivername;
> +
> +	*eth_dev = dev;
> +	*internals = internal;
> +
> +	return 0;
> +}
> +

<...>

> +
> +static int
> +set_interface_speed(const char *key __rte_unused,
> +		    const char *value,
> +		    void *extra_args __rte_unused)
need to drop  __rte_unused for extra_args

> +{
> +	struct tap_info *tap = (struct tap_info *)extra_args;
> +
> +	pmd_link.link_speed = (value) ? atoi(value) : ETH_SPEED_NUM_10G;
> +	tap->speed = pmd_link.link_speed;
> +
> +	return 0;
> +}
> +
> +/*
> + * Open a TAP interface device.
> + */
> +static int
> +rte_pmd_tap_devinit(const char *name, const char *params)
> +{
> +	int ret = 0;
> +	struct rte_kvargs *kvlist;
> +	struct tap_info tap_info;
> +
> +	/* Setup default values */
> +	memset(&tap_info, 0, sizeof(tap_info));
> +
> +	tap_info.speed = ETH_SPEED_NUM_10G;
> +	snprintf(tap_info.name, sizeof(tap_info.name), "dtap%d", tap_unit++);
What about extracting iface name "dtap" into a macro to make it more
visible.

> +
> +	if ((params == NULL) || (params[0] == '\0')) {
> +		RTE_LOG(INFO, PMD, "Initializing pmd_tap for %s\n", name);
> +
> +		ret = eth_dev_tap_create(name, &tap_info);
This "name" is not used at all (except from RTE_LOG), instead tap->name
is used for interface name, so why carying this variable around?

> +		goto leave;
> +	}
> +
> +	RTE_LOG(INFO, PMD, "Initialize %s with params (%s)\n", name, params);
> +
> +	kvlist = rte_kvargs_parse(params, valid_arguments);
> +	if (!kvlist) {
> +		ret = eth_dev_tap_create(name, &tap_info);
> +		goto leave;
> +	}
> +
> +	if (rte_kvargs_count(kvlist, ETH_TAP_SPEED_ARG) == 1) {
> +		ret = rte_kvargs_process(kvlist, ETH_TAP_SPEED_ARG,
> +					 &set_interface_speed, &tap_info);
> +		if (ret < 0)
> +			goto leave;
> +	} else
> +		set_interface_speed(NULL, NULL, &tap_info);
This call is redundant, tap_info already has default speed value set.

> +
> +	if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) {
> +		ret = rte_kvargs_process(kvlist, ETH_TAP_IFACE_ARG,
> +					 &set_interface_name, &tap_info);
> +		if (ret < 0)
> +			goto leave;
> +	} else
> +		set_interface_name(NULL, NULL, (void *)&tap_info);
tap_info->name already set to default value (dtap%d), this call is not
required.

> +
> +	rte_kvargs_free(kvlist);
> +
> +leave:
> +	if (ret == -1)
> +		RTE_LOG(INFO, PMD, "Failed to create pmd_tap for %s\n", name);
> +
> +	return ret;
> +}
> +
> +/*
> + * detach a TAP device.
> + */
> +static int
> +rte_pmd_tap_devuninit(const char *name)
> +{
> +	struct rte_eth_dev *eth_dev = NULL;
> +	struct pmd_internals *internals;
> +	int i;
> +
> +	RTE_LOG(INFO, PMD, "Closing TUN/TAP Ethernet device on numa %u\n",
> +		rte_socket_id());
> +
> +	if (name == NULL)
This check is redundant, eal layer won't call this function with "name
== NULL"

> +		return 0;
> +
> +	/* find the ethdev entry */
> +	eth_dev = rte_eth_dev_allocated(name);
> +	if (eth_dev == NULL)
> +		return 0;
> +
> +	internals = eth_dev->data->dev_private;
> +	for (i = 0; i < internals->nb_queues; i++)
> +		if (internals->fds[i] != -1)
> +			close(internals->fds[i]);
> +
> +	rte_free(eth_dev->data->dev_private);
> +	rte_free(eth_dev->data);
data can be shared?
Don't we need a RTE_USE_PRIVATE_DATA flag check?

> +
> +	rte_eth_dev_release_port(eth_dev);
> +
> +	return 0;
> +}
> +
> +static struct rte_vdev_driver pmd_tap_drv = {
> +	.init = rte_pmd_tap_devinit,
> +	.uninit = rte_pmd_tap_devuninit,
> +};
> +
> +DRIVER_REGISTER_VDEV(eth_tap, pmd_tap_drv);
name convention is now: "net_tap"

> +DRIVER_REGISTER_PARAM_STRING(eth_tap,
> +			     "iface=<string>,speed=N");
> diff --git a/drivers/net/tap/rte_pmd_tap_version.map b/drivers/net/tap/rte_pmd_tap_version.map
> new file mode 100644
> index 0000000..61463bf
> --- /dev/null
> +++ b/drivers/net/tap/rte_pmd_tap_version.map
> @@ -0,0 +1,4 @@
> +DPDK_16.11 {
> +
> +	local: *;
> +};
> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
> index 1a0095b..bd1d10f 100644
> --- a/mk/rte.app.mk
> +++ b/mk/rte.app.mk
> @@ -129,6 +129,7 @@ ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y)
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_VHOST)      += -lrte_pmd_vhost
>  endif # $(CONFIG_RTE_LIBRTE_VHOST)
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_VMXNET3_PMD)    += -lrte_pmd_vmxnet3_uio
> +_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_TAP)        += -lrte_pmd_tap
please put in alphebetical order

>  
>  ifeq ($(CONFIG_RTE_LIBRTE_CRYPTODEV),y)
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AESNI_MB)   += -lrte_pmd_aesni_mb
> 



More information about the dev mailing list