[dpdk-dev,v2,1/2] eal: add uevent api for hot plug

Message ID 1498648044-57541-1-git-send-email-jia.guo@intel.com (mailing list archive)
State Superseded, archived
Headers

Checks

Context Check Description
ci/Intel-compilation fail Compilation issues
ci/checkpatch warning coding style issues

Commit Message

Guo, Jia June 28, 2017, 11:07 a.m. UTC
  From: "Guo, Jia" <jia.guo@intel.com>

This patch aims to add a variable "uevent_fd" to the structure
"rte_intr_handle" to enable kernel object uevent monitoring,
and adds some uevent APIs to the rte eal interrupt code, namely
"rte_uevent_connect" and "rte_uevent_get", so that all drivers
can use these APIs to monitor and read out uevents, and then
handle these uevents accordingly, such as detaching or attaching
the device.

Signed-off-by: Guo, Jia <jia.guo@intel.com>
---
v1->v2: removed the global variable hotplug_fd; added uevent_fd
        in rte_intr_handle so that each pci device maintains its
	own fd, fixing the dual-device fd issue. Fixed some typos.
---
 lib/librte_eal/common/eal_common_pci_uio.c         |   6 +-
 lib/librte_eal/linuxapp/eal/eal_interrupts.c       | 143 ++++++++++++++++++++-
 lib/librte_eal/linuxapp/eal/eal_pci_uio.c          |  12 ++
 .../linuxapp/eal/include/exec-env/rte_interrupts.h |  34 +++++
 4 files changed, 192 insertions(+), 3 deletions(-)
  

Comments

Jingjing Wu June 29, 2017, 2:25 a.m. UTC | #1
> +

> +int

> +rte_uevent_connect(void)

> +{

> +	struct sockaddr_nl addr;

> +	int ret;

> +	int netlink_fd = -1;

> +	int size = 64 * 1024;

> +	int nonblock = 1;

> +	memset(&addr, 0, sizeof(addr));

> +	addr.nl_family = AF_NETLINK;

> +	addr.nl_pid = 0;

> +	addr.nl_groups = 0xffffffff;

> +

> +	netlink_fd = socket(PF_NETLINK, SOCK_DGRAM,

> NETLINK_KOBJECT_UEVENT);

> +	if (netlink_fd < 0)

> +		return -1;

> +

> +	RTE_LOG(ERR, EAL,

> +	"netlink_fd is %d\n", netlink_fd);

Is this a ERR log?

> +

> +	setsockopt(netlink_fd, SOL_SOCKET, SO_RCVBUFFORCE, &size,

> +sizeof(size));

> +

> +	ret = ioctl(netlink_fd, FIONBIO, &nonblock);

> +	if (ret != 0) {

> +		RTE_LOG(ERR, EAL,

> +		"ioctl(FIONBIO) failed\n");

> +		close(netlink_fd);

> +		return -1;

> +	}

> +

> +	if (bind(netlink_fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {

> +		close(netlink_fd);

> +		return -1;

> +	}

> +

> +	return netlink_fd;

> +}

> +

> +static int

> +parse_event(const char *buf, struct rte_uevent *event) {

> +	char action[RTE_UEVENT_MSG_LEN];

> +	char subsystem[RTE_UEVENT_MSG_LEN];

> +	char dev_path[RTE_UEVENT_MSG_LEN];

> +	int i = 0;

> +

> +	memset(action, 0, RTE_UEVENT_MSG_LEN);

> +	memset(subsystem, 0, RTE_UEVENT_MSG_LEN);

> +	memset(dev_path, 0, RTE_UEVENT_MSG_LEN);

> +

> +	while (*buf && i < RTE_UEVENT_MSG_LEN) {

> +		if (!strncmp(buf, "ACTION=", 7)) {

> +			buf += 7;

> +			snprintf(action, sizeof(action), "%s", buf);

> +		} else if (!strncmp(buf, "DEVPATH=", 8)) {

> +			buf += 8;

> +			snprintf(dev_path, sizeof(dev_path), "%s", buf);

> +		} else if (!strncmp(buf, "SUBSYSTEM=", 10)) {

> +			buf += 10;

> +			snprintf(subsystem, sizeof(subsystem), "%s", buf);

> +		}

> +		while (*buf++)

> +			i++;

> +		while (*buf == '\0') {

> +			buf++;

> +			i++;

> +		}

++ until to the end? The logic looks wrong. Please check carefully.

> +	}

> +

> +	if (!strncmp(subsystem, "uio", 3)) {

> +

> +		event->subsystem = RTE_UEVENT_SUBSYSTEM_UIO;

> +		if (!strncmp(action, "add", 3))

> +			event->action = RTE_UEVENT_ADD;

> +		if (!strncmp(action, "remove", 6))

> +			event->action = RTE_UEVENT_REMOVE;

> +		return 0;

> +	}

> +

> +	return -1;

> +}

> +

> +int

> +rte_uevent_get(int fd, struct rte_uevent *uevent) {

> +	int ret;

> +	char buf[RTE_UEVENT_MSG_LEN];

> +

> +	memset(uevent, 0, sizeof(struct rte_uevent));

> +	memset(buf, 0, RTE_UEVENT_MSG_LEN);

> +

> +	ret = recv(fd, buf, RTE_UEVENT_MSG_LEN - 1, MSG_DONTWAIT);

> +	if (ret > 0)

> +		return parse_event(buf, uevent);

> +

> +	if (ret < 0) {

Meaningless check.
> +		if (errno == EAGAIN || errno == EWOULDBLOCK) {

> +			return 0;

Return -1? The function is declared as 0 means success.

> +		} else {

> +			RTE_LOG(ERR, EAL,

> +			"Socket read error(%d): %s\n",

> +			errno, strerror(errno));

Why not return?

> +		}

> +	}

> +

> +	/* connection closed */

> +	if (ret == 0)

> +		return -1;

> +

> +	return 0;

> +}

> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c

> b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c

> index fa10329..2fead82 100644

> --- a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c

> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c

> @@ -231,6 +231,10 @@

>  		close(dev->intr_handle.uio_cfg_fd);

>  		dev->intr_handle.uio_cfg_fd = -1;

>  	}

> +	if (dev->intr_handle.uevent_fd >= 0) {

> +		close(dev->intr_handle.uevent_fd);

> +		dev->intr_handle.uevent_fd = -1;

> +	}

>  	if (dev->intr_handle.fd >= 0) {

>  		close(dev->intr_handle.fd);

>  		dev->intr_handle.fd = -1;

> @@ -245,6 +249,7 @@

>  	char dirname[PATH_MAX];

>  	char cfgname[PATH_MAX];

>  	char devname[PATH_MAX]; /* contains the /dev/uioX */

> +	char uevtname[PATH_MAX];

>  	int uio_num;

>  	struct rte_pci_addr *loc;

> 

> @@ -276,6 +281,13 @@

>  		goto error;

>  	}

> 

> +	dev->intr_handle.uevent_fd = rte_uevent_connect();

> +	if (dev->intr_handle.uevent_fd < 0) {

> +		RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",

> +			uevtname, strerror(errno));

> +		goto error;

It seems uevtname is not assigned at all. Do we need it? And the log may means
"cannot connect the event fd", right?. And even the event fd is failed to create,
should it block the process? 


Thanks
Jingjing
  
Guo, Jia June 29, 2017, 4:29 a.m. UTC | #2
The buf can contain a lot of consecutive '\0', which is why I need to check for that. Anyway, I will refine that and the other return issues in v3.

Thanks jingjing .

Best regards,
Jeff Guo


-----Original Message-----
From: Wu, Jingjing 

Sent: Thursday, June 29, 2017 10:25 AM
To: Guo, Jia <jia.guo@intel.com>; Zhang, Helin <helin.zhang@intel.com>
Cc: dev@dpdk.org
Subject: RE: [PATCH v2 1/2] eal: add uevent api for hot plug

> +

> +int

> +rte_uevent_connect(void)

> +{

> +	struct sockaddr_nl addr;

> +	int ret;

> +	int netlink_fd = -1;

> +	int size = 64 * 1024;

> +	int nonblock = 1;

> +	memset(&addr, 0, sizeof(addr));

> +	addr.nl_family = AF_NETLINK;

> +	addr.nl_pid = 0;

> +	addr.nl_groups = 0xffffffff;

> +

> +	netlink_fd = socket(PF_NETLINK, SOCK_DGRAM,

> NETLINK_KOBJECT_UEVENT);

> +	if (netlink_fd < 0)

> +		return -1;

> +

> +	RTE_LOG(ERR, EAL,

> +	"netlink_fd is %d\n", netlink_fd);

Is this a ERR log?

> +

> +	setsockopt(netlink_fd, SOL_SOCKET, SO_RCVBUFFORCE, &size, 

> +sizeof(size));

> +

> +	ret = ioctl(netlink_fd, FIONBIO, &nonblock);

> +	if (ret != 0) {

> +		RTE_LOG(ERR, EAL,

> +		"ioctl(FIONBIO) failed\n");

> +		close(netlink_fd);

> +		return -1;

> +	}

> +

> +	if (bind(netlink_fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {

> +		close(netlink_fd);

> +		return -1;

> +	}

> +

> +	return netlink_fd;

> +}

> +

> +static int

> +parse_event(const char *buf, struct rte_uevent *event) {

> +	char action[RTE_UEVENT_MSG_LEN];

> +	char subsystem[RTE_UEVENT_MSG_LEN];

> +	char dev_path[RTE_UEVENT_MSG_LEN];

> +	int i = 0;

> +

> +	memset(action, 0, RTE_UEVENT_MSG_LEN);

> +	memset(subsystem, 0, RTE_UEVENT_MSG_LEN);

> +	memset(dev_path, 0, RTE_UEVENT_MSG_LEN);

> +

> +	while (*buf && i < RTE_UEVENT_MSG_LEN) {

> +		if (!strncmp(buf, "ACTION=", 7)) {

> +			buf += 7;

> +			snprintf(action, sizeof(action), "%s", buf);

> +		} else if (!strncmp(buf, "DEVPATH=", 8)) {

> +			buf += 8;

> +			snprintf(dev_path, sizeof(dev_path), "%s", buf);

> +		} else if (!strncmp(buf, "SUBSYSTEM=", 10)) {

> +			buf += 10;

> +			snprintf(subsystem, sizeof(subsystem), "%s", buf);

> +		}

> +		while (*buf++)

> +			i++;

> +		while (*buf == '\0') {

> +			buf++;

> +			i++;

> +		}

++ until to the end? The logic looks wrong. Please check carefully.

> +	}

> +

> +	if (!strncmp(subsystem, "uio", 3)) {

> +

> +		event->subsystem = RTE_UEVENT_SUBSYSTEM_UIO;

> +		if (!strncmp(action, "add", 3))

> +			event->action = RTE_UEVENT_ADD;

> +		if (!strncmp(action, "remove", 6))

> +			event->action = RTE_UEVENT_REMOVE;

> +		return 0;

> +	}

> +

> +	return -1;

> +}

> +

> +int

> +rte_uevent_get(int fd, struct rte_uevent *uevent) {

> +	int ret;

> +	char buf[RTE_UEVENT_MSG_LEN];

> +

> +	memset(uevent, 0, sizeof(struct rte_uevent));

> +	memset(buf, 0, RTE_UEVENT_MSG_LEN);

> +

> +	ret = recv(fd, buf, RTE_UEVENT_MSG_LEN - 1, MSG_DONTWAIT);

> +	if (ret > 0)

> +		return parse_event(buf, uevent);

> +

> +	if (ret < 0) {

Meaningless check.
> +		if (errno == EAGAIN || errno == EWOULDBLOCK) {

> +			return 0;

Return -1? The function is declared as 0 means success.

> +		} else {

> +			RTE_LOG(ERR, EAL,

> +			"Socket read error(%d): %s\n",

> +			errno, strerror(errno));

Why not return?

> +		}

> +	}

> +

> +	/* connection closed */

> +	if (ret == 0)

> +		return -1;

> +

> +	return 0;

> +}

> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c

> b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c

> index fa10329..2fead82 100644

> --- a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c

> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c

> @@ -231,6 +231,10 @@

>  		close(dev->intr_handle.uio_cfg_fd);

>  		dev->intr_handle.uio_cfg_fd = -1;

>  	}

> +	if (dev->intr_handle.uevent_fd >= 0) {

> +		close(dev->intr_handle.uevent_fd);

> +		dev->intr_handle.uevent_fd = -1;

> +	}

>  	if (dev->intr_handle.fd >= 0) {

>  		close(dev->intr_handle.fd);

>  		dev->intr_handle.fd = -1;

> @@ -245,6 +249,7 @@

>  	char dirname[PATH_MAX];

>  	char cfgname[PATH_MAX];

>  	char devname[PATH_MAX]; /* contains the /dev/uioX */

> +	char uevtname[PATH_MAX];

>  	int uio_num;

>  	struct rte_pci_addr *loc;

> 

> @@ -276,6 +281,13 @@

>  		goto error;

>  	}

> 

> +	dev->intr_handle.uevent_fd = rte_uevent_connect();

> +	if (dev->intr_handle.uevent_fd < 0) {

> +		RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",

> +			uevtname, strerror(errno));

> +		goto error;

It seems uevtname is not assigned at all. Do we need it? And the log may means "cannot connect the event fd", right?. And even the event fd is failed to create, should it block the process? 


Thanks
Jingjing
  
Thomas Monjalon July 4, 2017, 11:45 p.m. UTC | #3
Hi,

This is an interesting step for hotplug in DPDK.

28/06/2017 13:07, Jeff Guo:
> +       netlink_fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT);

It is monitoring the whole system...

> +int
> +rte_uevent_get(int fd, struct rte_uevent *uevent)
> +{
> +       int ret;
> +       char buf[RTE_UEVENT_MSG_LEN];
> +
> +       memset(uevent, 0, sizeof(struct rte_uevent));
> +       memset(buf, 0, RTE_UEVENT_MSG_LEN);
> +
> +       ret = recv(fd, buf, RTE_UEVENT_MSG_LEN - 1, MSG_DONTWAIT);

... and it is read from this function called by one driver.
It cannot work without a global dispatch.

It must be a global mechanism, probably a service core.
The question is also to know whether it should be a mandatory
service in DPDK or an optional helper?
  
Guo, Jia July 5, 2017, 3:02 a.m. UTC | #4
hi, thomas


On 7/5/2017 7:45 AM, Thomas Monjalon wrote:
> Hi,
>
> This is an interesting step for hotplug in DPDK.
>
> 28/06/2017 13:07, Jeff Guo:
>> +       netlink_fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT);
> It is monitoring the whole system...
>
>> +int
>> +rte_uevent_get(int fd, struct rte_uevent *uevent)
>> +{
>> +       int ret;
>> +       char buf[RTE_UEVENT_MSG_LEN];
>> +
>> +       memset(uevent, 0, sizeof(struct rte_uevent));
>> +       memset(buf, 0, RTE_UEVENT_MSG_LEN);
>> +
>> +       ret = recv(fd, buf, RTE_UEVENT_MSG_LEN - 1, MSG_DONTWAIT);
> ... and it is read from this function called by one driver.
> It cannot work without a global dispatch.
the rte_uevent-connect is called in func of pci_uio_alloc_resource, so 
each socket is created by  by each uio device. so i think that would not 
affect each driver isolate to use it.
> It must be a global mechanism, probably a service core.
> The question is also to know whether it should be a mandatory
> service in DPDK or an optional helper?
a global mechanism would be good, but so far, include mlx driver, we all 
handle the hot plug event in driver by app's registered callback. maybe 
a better global would be try in the future. but now is would work for all
pci uio device.
and more, if in pci uio device to use hot plug , i think it might be 
mandatory.
  
Thomas Monjalon July 5, 2017, 7:32 a.m. UTC | #5
05/07/2017 05:02, Guo, Jia:
> hi, thomas
> 
> 
> On 7/5/2017 7:45 AM, Thomas Monjalon wrote:
> > Hi,
> >
> > This is an interesting step for hotplug in DPDK.
> >
> > 28/06/2017 13:07, Jeff Guo:
> >> +       netlink_fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT);
> > It is monitoring the whole system...
> >
> >> +int
> >> +rte_uevent_get(int fd, struct rte_uevent *uevent)
> >> +{
> >> +       int ret;
> >> +       char buf[RTE_UEVENT_MSG_LEN];
> >> +
> >> +       memset(uevent, 0, sizeof(struct rte_uevent));
> >> +       memset(buf, 0, RTE_UEVENT_MSG_LEN);
> >> +
> >> +       ret = recv(fd, buf, RTE_UEVENT_MSG_LEN - 1, MSG_DONTWAIT);
> > ... and it is read from this function called by one driver.
> > It cannot work without a global dispatch.
> the rte_uevent-connect is called in func of pci_uio_alloc_resource, so 
> each socket is created by  by each uio device. so i think that would not 
> affect each driver isolate to use it.

Ah OK, I missed it.

> > It must be a global mechanism, probably a service core.
> > The question is also to know whether it should be a mandatory
> > service in DPDK or an optional helper?
> a global mechanism would be good, but so far, include mlx driver, we all 
> handle the hot plug event in driver by app's registered callback. maybe 
> a better global would be try in the future. but now is would work for all
> pci uio device.

mlx drivers have a special connection to the kernel through the associated
mlx kernel drivers. That's why the PMD handle the events in a specific way.

You are adding event handling for UIO.
Now we need also VFIO.

I am wondering how it could be better integrated in the bus layer.

> and more, if in pci uio device to use hot plug , i think it might be 
> mandatory.
  
Guo, Jia July 5, 2017, 9:04 a.m. UTC | #6
On 7/5/2017 3:32 PM, Thomas Monjalon wrote:
> 05/07/2017 05:02, Guo, Jia:
>> hi, thomas
>>
>>
>> On 7/5/2017 7:45 AM, Thomas Monjalon wrote:
>>> Hi,
>>>
>>> This is an interesting step for hotplug in DPDK.
>>>
>>> 28/06/2017 13:07, Jeff Guo:
>>>> +       netlink_fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT);
>>> It is monitoring the whole system...
>>>
>>>> +int
>>>> +rte_uevent_get(int fd, struct rte_uevent *uevent)
>>>> +{
>>>> +       int ret;
>>>> +       char buf[RTE_UEVENT_MSG_LEN];
>>>> +
>>>> +       memset(uevent, 0, sizeof(struct rte_uevent));
>>>> +       memset(buf, 0, RTE_UEVENT_MSG_LEN);
>>>> +
>>>> +       ret = recv(fd, buf, RTE_UEVENT_MSG_LEN - 1, MSG_DONTWAIT);
>>> ... and it is read from this function called by one driver.
>>> It cannot work without a global dispatch.
>> the rte_uevent-connect is called in func of pci_uio_alloc_resource, so
>> each socket is created by  by each uio device. so i think that would not
>> affect each driver isolate to use it.
> Ah OK, I missed it.
>
>>> It must be a global mechanism, probably a service core.
>>> The question is also to know whether it should be a mandatory
>>> service in DPDK or an optional helper?
>> a global mechanism would be good, but so far, include mlx driver, we all
>> handle the hot plug event in driver by app's registered callback. maybe
>> a better global would be try in the future. but now is would work for all
>> pci uio device.
> mlx drivers have a special connection to the kernel through the associated
> mlx kernel drivers. That's why the PMD handle the events in a specific way.
>
> You are adding event handling for UIO.
> Now we need also VFIO.
>
> I am wondering how it could be better integrated in the bus layer.
absolutely, hot plug for VFIO must be request more for the live 
migration,  and we plan to add it at next level, when we go thought all 
uio hot plug feature integration done. so, could i expect an ack if 
there aren't other concern about uio uevent here. thanks.
>> and more, if in pci uio device to use hot plug , i think it might be
>> mandatory.
  
Guo, Jia Aug. 22, 2017, 2:56 p.m. UTC | #7
a. about uevent mechanism
 
As we know, uevent is come from the kobject of the kernel side, every kobject would have its own uevent, and a sysfs folder identify a kobject,
such as cpu,usb,pci,pci-express,virio, these bus component all have uevent. I agree that uevent would be the best if it could integrated in the bus layer.
I check the kernel src code , the uevent related is in lib/koject_uvent.c, and only for linux not for bsp, both support uio and vfio, 
so where shoud dpdk uevent be location?  I come to my mind 4 option below, and I propose 2) and 4).

1)Eal_bus:  (but uevent like netlink socket thing and event polling not related with bus behavior)
2)eal_dev:  (just considerate it like kernel's udev, and create new epoll, uevent handler)
3)add new file eal_udev.c
4)eal_interrupt. (add into the interrupt epoll, use interrupt handler)

Shreyansh & jblunck & gaetan

Since you recently work on pci bus layer and expert on  that, I want to ask you that if you plan about any other bus layer rework would be conflict my proposer,
or would let me modify to compatibility with the next architect? If you have,  please let me know. thanks. 

b. about pci uevent handler.
I suppose 2 option:
1)use a common interrupt handler for pci pmd to let app or fail-safe pmd to register. 
2)use a common uevent handler for pci pmd to let app or fail-safe pmd register.

Community, are there any good comment about that ?

Best regards,
Jeff Guo

-----Original Message-----
From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Guo, Jia
Sent: Wednesday, July 5, 2017 5:04 PM
To: Thomas Monjalon <thomas@monjalon.net>
Cc: dev@dpdk.org; Zhang, Helin <helin.zhang@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>
Subject: Re: [dpdk-dev] [PATCH v2 1/2] eal: add uevent api for hot plug



On 7/5/2017 3:32 PM, Thomas Monjalon wrote:
> 05/07/2017 05:02, Guo, Jia:
>> hi, thomas
>>
>>
>> On 7/5/2017 7:45 AM, Thomas Monjalon wrote:
>>> Hi,
>>>
>>> This is an interesting step for hotplug in DPDK.
>>>
>>> 28/06/2017 13:07, Jeff Guo:
>>>> +       netlink_fd = socket(PF_NETLINK, SOCK_DGRAM, 
>>>> + NETLINK_KOBJECT_UEVENT);
>>> It is monitoring the whole system...
>>>
>>>> +int
>>>> +rte_uevent_get(int fd, struct rte_uevent *uevent) {
>>>> +       int ret;
>>>> +       char buf[RTE_UEVENT_MSG_LEN];
>>>> +
>>>> +       memset(uevent, 0, sizeof(struct rte_uevent));
>>>> +       memset(buf, 0, RTE_UEVENT_MSG_LEN);
>>>> +
>>>> +       ret = recv(fd, buf, RTE_UEVENT_MSG_LEN - 1, MSG_DONTWAIT);
>>> ... and it is read from this function called by one driver.
>>> It cannot work without a global dispatch.
>> the rte_uevent-connect is called in func of pci_uio_alloc_resource, 
>> so each socket is created by  by each uio device. so i think that 
>> would not affect each driver isolate to use it.
> Ah OK, I missed it.
>
>>> It must be a global mechanism, probably a service core.
>>> The question is also to know whether it should be a mandatory 
>>> service in DPDK or an optional helper?
>> a global mechanism would be good, but so far, include mlx driver, we 
>> all handle the hot plug event in driver by app's registered callback. 
>> maybe a better global would be try in the future. but now is would 
>> work for all pci uio device.
> mlx drivers have a special connection to the kernel through the 
> associated mlx kernel drivers. That's why the PMD handle the events in a specific way.
>
> You are adding event handling for UIO.
> Now we need also VFIO.
>
> I am wondering how it could be better integrated in the bus layer.
absolutely, hot plug for VFIO must be request more for the live migration,  and we plan to add it at next level, when we go thought all uio hot plug feature integration done. so, could i expect an ack if there aren't other concern about uio uevent here. thanks.
>> and more, if in pci uio device to use hot plug , i think it might be 
>> mandatory.
  
Gaëtan Rivet Aug. 28, 2017, 3:50 p.m. UTC | #8
Hi Jeff,

On Tue, Aug 22, 2017 at 02:56:04PM +0000, Guo, Jia wrote:
> a. about uevent mechanism
>  
> As we know, uevent is come from the kobject of the kernel side, every kobject would have its own uevent, and a sysfs folder identify a kobject,
> such as cpu,usb,pci,pci-express,virio, these bus component all have uevent. I agree that uevent would be the best if it could integrated in the bus layer.
> I check the kernel src code , the uevent related is in lib/koject_uvent.c, and only for linux not for bsp, both support uio and vfio, 
> so where shoud dpdk uevent be location?  I come to my mind 4 option below, and I propose 2) and 4).
> 
> 1)Eal_bus:  (but uevent like netlink socket thing and event polling not related with bus behavior)
> 2)eal_dev:  (just considerate it like kernel's udev, and create new epoll, uevent handler)
> 3)add new file eal_udev.c
> 4)eal_interrupt. (add into the interrupt epoll, use interrupt handler)
> 
> Shreyansh & jblunck & gaetan
> 
> Since you recently work on pci bus layer and expert on  that, I want to ask you that if you plan about any other bus layer rework would be conflict my proposer,
> or would let me modify to compatibility with the next architect? If you have,  please let me know. thanks. 
> 

Yes, some work is planned at least from my side.

I am moving the PCI bus out of the EAL:
http://dpdk.org/ml/archives/dev/2017-August/073512.html

The current proposal is not yet complete. Some filesystem functions
might need to be exposed.

However, it has not functional impact: functions may move, but they do
the same thing. But even so, the uevent_fd that you add within
eal_common_uio will probably have to be moved (eal_common_pci_uio.c is
going out of the EAL), nothing dramatic.

That's all I can think of from my PoV that might be in conflict with
your work.

> b. about pci uevent handler.
> I suppose 2 option:
> 1)use a common interrupt handler for pci pmd to let app or fail-safe pmd to register. 
> 2)use a common uevent handler for pci pmd to let app or fail-safe pmd register.
> 
> Community, are there any good comment about that ?
> 
> Best regards,
> Jeff Guo
> 
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Guo, Jia
> Sent: Wednesday, July 5, 2017 5:04 PM
> To: Thomas Monjalon <thomas@monjalon.net>
> Cc: dev@dpdk.org; Zhang, Helin <helin.zhang@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>
> Subject: Re: [dpdk-dev] [PATCH v2 1/2] eal: add uevent api for hot plug
> 
> 
> 
> On 7/5/2017 3:32 PM, Thomas Monjalon wrote:
> > 05/07/2017 05:02, Guo, Jia:
> >> hi, thomas
> >>
> >>
> >> On 7/5/2017 7:45 AM, Thomas Monjalon wrote:
> >>> Hi,
> >>>
> >>> This is an interesting step for hotplug in DPDK.
> >>>
> >>> 28/06/2017 13:07, Jeff Guo:
> >>>> +       netlink_fd = socket(PF_NETLINK, SOCK_DGRAM, 
> >>>> + NETLINK_KOBJECT_UEVENT);
> >>> It is monitoring the whole system...
> >>>
> >>>> +int
> >>>> +rte_uevent_get(int fd, struct rte_uevent *uevent) {
> >>>> +       int ret;
> >>>> +       char buf[RTE_UEVENT_MSG_LEN];
> >>>> +
> >>>> +       memset(uevent, 0, sizeof(struct rte_uevent));
> >>>> +       memset(buf, 0, RTE_UEVENT_MSG_LEN);
> >>>> +
> >>>> +       ret = recv(fd, buf, RTE_UEVENT_MSG_LEN - 1, MSG_DONTWAIT);
> >>> ... and it is read from this function called by one driver.
> >>> It cannot work without a global dispatch.
> >> the rte_uevent-connect is called in func of pci_uio_alloc_resource, 
> >> so each socket is created by  by each uio device. so i think that 
> >> would not affect each driver isolate to use it.
> > Ah OK, I missed it.
> >
> >>> It must be a global mechanism, probably a service core.
> >>> The question is also to know whether it should be a mandatory 
> >>> service in DPDK or an optional helper?
> >> a global mechanism would be good, but so far, include mlx driver, we 
> >> all handle the hot plug event in driver by app's registered callback. 
> >> maybe a better global would be try in the future. but now is would 
> >> work for all pci uio device.
> > mlx drivers have a special connection to the kernel through the 
> > associated mlx kernel drivers. That's why the PMD handle the events in a specific way.
> >
> > You are adding event handling for UIO.
> > Now we need also VFIO.
> >
> > I am wondering how it could be better integrated in the bus layer.
> absolutely, hot plug for VFIO must be request more for the live migration,  and we plan to add it at next level, when we go thought all uio hot plug feature integration done. so, could i expect an ack if there aren't other concern about uio uevent here. thanks.
> >> and more, if in pci uio device to use hot plug , i think it might be 
> >> mandatory.
>
  

Patch

diff --git a/lib/librte_eal/common/eal_common_pci_uio.c b/lib/librte_eal/common/eal_common_pci_uio.c
index 367a681..5b62f70 100644
--- a/lib/librte_eal/common/eal_common_pci_uio.c
+++ b/lib/librte_eal/common/eal_common_pci_uio.c
@@ -117,6 +117,7 @@ 
 
 	dev->intr_handle.fd = -1;
 	dev->intr_handle.uio_cfg_fd = -1;
+	dev->intr_handle.uevent_fd = -1;
 	dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
 
 	/* secondary processes - use already recorded details */
@@ -227,7 +228,10 @@ 
 		close(dev->intr_handle.uio_cfg_fd);
 		dev->intr_handle.uio_cfg_fd = -1;
 	}
-
+	if (dev->intr_handle.uevent_fd >= 0) {
+		close(dev->intr_handle.uevent_fd);
+		dev->intr_handle.uevent_fd = -1;
+	}
 	dev->intr_handle.fd = -1;
 	dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
 }
diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 2e3bd12..d596522 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -65,6 +65,10 @@ 
 #include <rte_errno.h>
 #include <rte_spinlock.h>
 
+#include <sys/socket.h>
+#include <linux/netlink.h>
+#include <sys/epoll.h>
+
 #include "eal_private.h"
 #include "eal_vfio.h"
 #include "eal_thread.h"
@@ -74,6 +78,9 @@ 
 
 static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
 
+#define RTE_UEVENT_MSG_LEN 4096
+#define RTE_UEVENT_SUBSYSTEM_UIO 1
+
 /**
  * union for pipe fds.
  */
@@ -669,10 +676,13 @@  struct rte_intr_source {
 			RTE_SET_USED(r);
 			return -1;
 		}
+
 		rte_spinlock_lock(&intr_lock);
 		TAILQ_FOREACH(src, &intr_sources, next)
-			if (src->intr_handle.fd ==
-					events[n].data.fd)
+			if ((src->intr_handle.fd ==
+					events[n].data.fd) ||
+				(src->intr_handle.uevent_fd ==
+					events[n].data.fd))
 				break;
 		if (src == NULL){
 			rte_spinlock_unlock(&intr_lock);
@@ -858,7 +868,24 @@  static __attribute__((noreturn)) void *
 			}
 			else
 				numfds++;
+
+			/**
+			 * add device uevent file descriptor
+			 * into wait list for uevent monitoring.
+			 */
+			ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
+			ev.data.fd = src->intr_handle.uevent_fd;
+			if (epoll_ctl(pfd, EPOLL_CTL_ADD,
+					src->intr_handle.uevent_fd, &ev) < 0){
+				rte_panic("Error adding uevent_fd %d epoll_ctl"
+					", %s\n",
+					src->intr_handle.uevent_fd,
+					strerror(errno));
+			} else
+				numfds++;
 		}
+
+
 		rte_spinlock_unlock(&intr_lock);
 		/* serve the interrupt */
 		eal_intr_handle_interrupts(pfd, numfds);
@@ -1255,3 +1282,115 @@  static __attribute__((noreturn)) void *
 
 	return 0;
 }
+
+int
+rte_uevent_connect(void)
+{
+	struct sockaddr_nl addr;
+	int ret;
+	int netlink_fd = -1;
+	int size = 64 * 1024;
+	int nonblock = 1;
+	memset(&addr, 0, sizeof(addr));
+	addr.nl_family = AF_NETLINK;
+	addr.nl_pid = 0;
+	addr.nl_groups = 0xffffffff;
+
+	netlink_fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT);
+	if (netlink_fd < 0)
+		return -1;
+
+	RTE_LOG(ERR, EAL,
+	"netlink_fd is %d\n", netlink_fd);
+
+	setsockopt(netlink_fd, SOL_SOCKET, SO_RCVBUFFORCE, &size, sizeof(size));
+
+	ret = ioctl(netlink_fd, FIONBIO, &nonblock);
+	if (ret != 0) {
+		RTE_LOG(ERR, EAL,
+		"ioctl(FIONBIO) failed\n");
+		close(netlink_fd);
+		return -1;
+	}
+
+	if (bind(netlink_fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
+		close(netlink_fd);
+		return -1;
+	}
+
+	return netlink_fd;
+}
+
+static int
+parse_event(const char *buf, struct rte_uevent *event)
+{
+	char action[RTE_UEVENT_MSG_LEN];
+	char subsystem[RTE_UEVENT_MSG_LEN];
+	char dev_path[RTE_UEVENT_MSG_LEN];
+	int i = 0;
+
+	memset(action, 0, RTE_UEVENT_MSG_LEN);
+	memset(subsystem, 0, RTE_UEVENT_MSG_LEN);
+	memset(dev_path, 0, RTE_UEVENT_MSG_LEN);
+
+	while (*buf && i < RTE_UEVENT_MSG_LEN) {
+		if (!strncmp(buf, "ACTION=", 7)) {
+			buf += 7;
+			snprintf(action, sizeof(action), "%s", buf);
+		} else if (!strncmp(buf, "DEVPATH=", 8)) {
+			buf += 8;
+			snprintf(dev_path, sizeof(dev_path), "%s", buf);
+		} else if (!strncmp(buf, "SUBSYSTEM=", 10)) {
+			buf += 10;
+			snprintf(subsystem, sizeof(subsystem), "%s", buf);
+		}
+		while (*buf++)
+			i++;
+		while (*buf == '\0') {
+			buf++;
+			i++;
+		}
+	}
+
+	if (!strncmp(subsystem, "uio", 3)) {
+
+		event->subsystem = RTE_UEVENT_SUBSYSTEM_UIO;
+		if (!strncmp(action, "add", 3))
+			event->action = RTE_UEVENT_ADD;
+		if (!strncmp(action, "remove", 6))
+			event->action = RTE_UEVENT_REMOVE;
+		return 0;
+	}
+
+	return -1;
+}
+
+int
+rte_uevent_get(int fd, struct rte_uevent *uevent)
+{
+	int ret;
+	char buf[RTE_UEVENT_MSG_LEN];
+
+	memset(uevent, 0, sizeof(struct rte_uevent));
+	memset(buf, 0, RTE_UEVENT_MSG_LEN);
+
+	ret = recv(fd, buf, RTE_UEVENT_MSG_LEN - 1, MSG_DONTWAIT);
+	if (ret > 0)
+		return parse_event(buf, uevent);
+
+	if (ret < 0) {
+		if (errno == EAGAIN || errno == EWOULDBLOCK) {
+			return 0;
+		} else {
+			RTE_LOG(ERR, EAL,
+			"Socket read error(%d): %s\n",
+			errno, strerror(errno));
+		}
+	}
+
+	/* connection closed */
+	if (ret == 0)
+		return -1;
+
+	return 0;
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
index fa10329..2fead82 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
@@ -231,6 +231,10 @@ 
 		close(dev->intr_handle.uio_cfg_fd);
 		dev->intr_handle.uio_cfg_fd = -1;
 	}
+	if (dev->intr_handle.uevent_fd >= 0) {
+		close(dev->intr_handle.uevent_fd);
+		dev->intr_handle.uevent_fd = -1;
+	}
 	if (dev->intr_handle.fd >= 0) {
 		close(dev->intr_handle.fd);
 		dev->intr_handle.fd = -1;
@@ -245,6 +249,7 @@ 
 	char dirname[PATH_MAX];
 	char cfgname[PATH_MAX];
 	char devname[PATH_MAX]; /* contains the /dev/uioX */
+	char uevtname[PATH_MAX];
 	int uio_num;
 	struct rte_pci_addr *loc;
 
@@ -276,6 +281,13 @@ 
 		goto error;
 	}
 
+	dev->intr_handle.uevent_fd = rte_uevent_connect();
+	if (dev->intr_handle.uevent_fd < 0) {
+		RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
+			uevtname, strerror(errno));
+		goto error;
+	}
+
 	if (dev->kdrv == RTE_KDRV_IGB_UIO)
 		dev->intr_handle.type = RTE_INTR_HANDLE_UIO;
 	else {
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
index 6daffeb..bd1780d 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
@@ -90,6 +90,7 @@  struct rte_intr_handle {
 					for uio_pci_generic */
 	};
 	int fd;	 /**< interrupt event file descriptor */
+	int uevent_fd;	 /**< uevent file descriptor */
 	enum rte_intr_handle_type type;  /**< handle type */
 	uint32_t max_intr;             /**< max interrupt requested */
 	uint32_t nb_efd;               /**< number of available efd(event fd) */
@@ -99,6 +100,16 @@  struct rte_intr_handle {
 	int *intr_vec;                 /**< intr vector number array */
 };
 
+enum rte_uevent_action {
+	RTE_UEVENT_ADD = 0,		/**< uevent type of device add */
+	RTE_UEVENT_REMOVE = 1,	/**< uevent type of device remove*/
+};
+
+struct rte_uevent {
+	enum rte_uevent_action action;	/**< uevent action type */
+	int subsystem;				/**< subsystem id */
+};
+
 #define RTE_EPOLL_PER_THREAD        -1  /**< to hint using per thread epfd */
 
 /**
@@ -236,4 +247,27 @@  struct rte_intr_handle {
 int
 rte_intr_cap_multiple(struct rte_intr_handle *intr_handle);
 
+/**
+ * It read out the uevent from the specific file descriptor.
+ *
+ * @param fd
+ *   The fd which the uevent associated to
+ * @param uevent
+ *   Pointer to the uevent which read from the monitoring fd.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_uevent_get(int fd, struct rte_uevent *uevent);
+
+/**
+ * Connect to the device uevent file descriptor.
+ * @return
+ *   - On success, the connected uevent fd.
+ *   - On failure, a negative value.
+ */
+int
+rte_uevent_connect(void);
+
 #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */