[v2,3/3] eal/linux: handle uio read failure in interrupt handler

Message ID 20180919125757.17938-3-bluca@debian.org (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series [v2,1/3] net/virtio: register/unregister intr handler on start/stop |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Luca Boccassi Sept. 19, 2018, 12:57 p.m. UTC
  If a device is unplugged while an interrupt is pending, the
read call to the uio device to remove it from the poll wait list
can fail, resulting in it being continually polled forever. This
change checks for the read failing and, if so, unregisters the device
as an interrupt source and causes the wait list to be rebuilt.

This race has been reported and observed in production.

Fixes: 0a45657a6794 ("pci: rework interrupt handling")
Cc: stable@dpdk.org

Signed-off-by: Brian Russell <brussell@brocade.com>
Signed-off-by: Luca Boccassi <bluca@debian.org>
---
 lib/librte_eal/linuxapp/eal/eal_interrupts.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)
  

Comments

Thomas Monjalon Oct. 11, 2018, 10:32 a.m. UTC | #1
Looking for someone to review this patch, please.


19/09/2018 14:57, Luca Boccassi:
> If a device is unplugged while an interrupt is pending, the
> read call to the uio device to remove it from the poll wait list
> can fail resulting in it being continually polled forever. This
> change checks for the read failing and if so, unregisters the device
> as an interrupt source and causes the wait list to be rebuilt.
> 
> This race has been reported and observed in production.
> 
> Fixes: 0a45657a6794 ("pci: rework interrupt handling")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Brian Russell <brussell@brocade.com>
> Signed-off-by: Luca Boccassi <bluca@debian.org>
> ---
>  lib/librte_eal/linuxapp/eal/eal_interrupts.c | 19 ++++++++++++++++++-
>  1 file changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
> index 4076c6d6ca..34584db883 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
> @@ -627,7 +627,7 @@ eal_intr_process_interrupts(struct epoll_event *events, int nfds)
>  	bool call = false;
>  	int n, bytes_read;
>  	struct rte_intr_source *src;
> -	struct rte_intr_callback *cb;
> +	struct rte_intr_callback *cb, *next;
>  	union rte_intr_read_buffer buf;
>  	struct rte_intr_callback active_cb;
>  
> @@ -701,6 +701,23 @@ eal_intr_process_interrupts(struct epoll_event *events, int nfds)
>  					"descriptor %d: %s\n",
>  					events[n].data.fd,
>  					strerror(errno));
> +				/*
> +				 * The device is unplugged or buggy, remove
> +				 * it as an interrupt source and return to
> +				 * force the wait list to be rebuilt.
> +				 */
> +				rte_spinlock_lock(&intr_lock);
> +				TAILQ_REMOVE(&intr_sources, src, next);
> +				rte_spinlock_unlock(&intr_lock);
> +
> +				for (cb = TAILQ_FIRST(&src->callbacks); cb;
> +							cb = next) {
> +					next = TAILQ_NEXT(cb, next);
> +					TAILQ_REMOVE(&src->callbacks, cb, next);
> +					free(cb);
> +				}
> +				free(src);
> +				return -1;
>  			} else if (bytes_read == 0)
>  				RTE_LOG(ERR, EAL, "Read nothing from file "
>  					"descriptor %d\n", events[n].data.fd);
>
  

Patch

diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 4076c6d6ca..34584db883 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -627,7 +627,7 @@  eal_intr_process_interrupts(struct epoll_event *events, int nfds)
 	bool call = false;
 	int n, bytes_read;
 	struct rte_intr_source *src;
-	struct rte_intr_callback *cb;
+	struct rte_intr_callback *cb, *next;
 	union rte_intr_read_buffer buf;
 	struct rte_intr_callback active_cb;
 
@@ -701,6 +701,23 @@  eal_intr_process_interrupts(struct epoll_event *events, int nfds)
 					"descriptor %d: %s\n",
 					events[n].data.fd,
 					strerror(errno));
+				/*
+				 * The device is unplugged or buggy, remove
+				 * it as an interrupt source and return to
+				 * force the wait list to be rebuilt.
+				 */
+				rte_spinlock_lock(&intr_lock);
+				TAILQ_REMOVE(&intr_sources, src, next);
+				rte_spinlock_unlock(&intr_lock);
+
+				for (cb = TAILQ_FIRST(&src->callbacks); cb;
+							cb = next) {
+					next = TAILQ_NEXT(cb, next);
+					TAILQ_REMOVE(&src->callbacks, cb, next);
+					free(cb);
+				}
+				free(src);
+				return -1;
 			} else if (bytes_read == 0)
 				RTE_LOG(ERR, EAL, "Read nothing from file "
 					"descriptor %d\n", events[n].data.fd);