[dpdk-dev] vhost: fix vhost_user_set_mem_table error

Message ID 1505896323-125943-1-git-send-email-yi.y.yang@intel.com (mailing list archive)
State Changes Requested, archived
Delegated to: Yuanhan Liu
Headers

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Yi Yang Sept. 20, 2017, 8:32 a.m. UTC
  Usually vhost_user message VHOST_USER_SET_MEM_TABLE
is only sent out during initialization and only sent
once, but it isn't so for memory hotplug and hotunplug
, for that case, vhost_user message VHOST_USER_SET_MEM_TABLE
will be resent with the old memory regions (not hotunplugged)
and the new memory regions (hotplugged), so current
vhost_user_set_mem_table implementation is wrong for
memory hotplug and hotunplug case, it will free current
memory regions and unmap hugepages no matter if they
are be using or not and if they are memory regions which
have been initialized and mapped in the previous vhost_user
message VHOST_USER_SET_MEM_TABLE or not.

This commit fixed this issue very well, it will keep them
intact for those old memory region it will unmap those
memroy regions if they have been hotunplugged, it will
initialize the new hotplugged memory regions and map
hugepages if they are hotplugged.

vhost_user message VHOST_USER_SET_MEM_TABLE will include
all the current effective memory regions, the hotunplugged
memory regions won't be included in VHOST_USER_SET_MEM_TABLE,
the hotplugged memroy regions will be included in
VHOST_USER_SET_MEM_TABLE.

This has been verified in OVS DPDK by memory hotplug and
hotunplug in qemu.

Signed-off-by: Yi Yang <yi.y.yang@intel.com>
---
 lib/librte_vhost/vhost_user.c | 72 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 65 insertions(+), 7 deletions(-)
  

Comments

Jianfeng Tan Sept. 22, 2017, 5:28 a.m. UTC | #1
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Yi Yang
> Sent: Wednesday, September 20, 2017 4:32 PM
> To: yliu@fridaylinux.org
> Cc: dev@dpdk.org; Yang, Yi Y
> Subject: [dpdk-dev] [PATCH] vhost: fix vhost_user_set_mem_table error
> 
> Usually vhost_user message VHOST_USER_SET_MEM_TABLE
> is only sent out during initialization and only sent
> once, but it isn't so for memory hotplug and hotunplug
> , for that case, vhost_user message VHOST_USER_SET_MEM_TABLE
> will be resent with the old memory regions (not hotunplugged)
> and the new memory regions (hotplugged), so current
> vhost_user_set_mem_table implementation is wrong for
> memory hotplug and hotunplug case, it will free current
> memory regions and unmap hugepages no matter if they
> are be using or not and if they are memory regions which
> have been initialized and mapped in the previous vhost_user
> message VHOST_USER_SET_MEM_TABLE or not.
> 
> This commit fixed this issue very well, it will keep them
> intact for those old memory region it will unmap those
> memroy regions if they have been hotunplugged, it will
> initialize the new hotplugged memory regions and map
> hugepages if they are hotplugged.
> 
> vhost_user message VHOST_USER_SET_MEM_TABLE will include
> all the current effective memory regions, the hotunplugged
> memory regions won't be included in VHOST_USER_SET_MEM_TABLE,
> the hotplugged memroy regions will be included in
> VHOST_USER_SET_MEM_TABLE.
> 
> This has been verified in OVS DPDK by memory hotplug and
> hotunplug in qemu.
> 
> Signed-off-by: Yi Yang <yi.y.yang@intel.com>
> ---
>  lib/librte_vhost/vhost_user.c | 72
> ++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 65 insertions(+), 7 deletions(-)
> 
> diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
> index ad2e8d3..1c475d1 100644
> --- a/lib/librte_vhost/vhost_user.c
> +++ b/lib/librte_vhost/vhost_user.c
> @@ -509,17 +509,43 @@ vhost_user_set_mem_table(struct virtio_net *dev,
> struct VhostUserMsg *pmsg)
>  	uint64_t mmap_size;
>  	uint64_t mmap_offset;
>  	uint64_t alignment;
> -	uint32_t i;
> +	uint32_t i, j;
>  	int fd;
> 
> -	if (dev->mem) {
> -		free_mem_region(dev);
> -		rte_free(dev->mem);
> -		dev->mem = NULL;
> +	/* Handle hot unplug case */
> +	if (dev->mem && dev->mem->nregions) {
> +		i = 0;
> +		j = 0;
> +		while ((i < memory.nregions) && (j < dev->mem->nregions))
> {
> +			reg = &dev->mem->regions[j];
> +			/* munmap this region if it is hot unplugged */
> +			if (reg->guest_user_addr
> +				!= memory.regions[i].userspace_addr) {
> +				if (reg->host_user_addr) {
> +					munmap(reg->mmap_addr, reg-
> >mmap_size);
> +					close(reg->fd);
> +				}
> +				reg->guest_user_addr = 0;
> +				j++;
> +			} else {
> +				i++;
> +				j++;
> +			}
> +		}

The algorithm here could be problematic for hot plug. For example,

T0: we have two regions with address 1, 3.
T1: we add another region, 2, which makes the regions as 1, 2, 3.

1st iteration, 1 matches 1, i++, j++;
2nd iteration, 2 does not match 3, unmap 3, j++.

> +
> +		/* munmap these regions because they have been hot
> unplugged */
> +		for (; j < dev->mem->nregions; j++) {
> +			reg = &dev->mem->regions[j];
> +			if (reg->host_user_addr) {
> +				munmap(reg->mmap_addr, reg-
> >mmap_size);
> +				close(reg->fd);
> +			}
> +			reg->guest_user_addr = 0;
> +		}
>  	}
> 
> -	dev->nr_guest_pages = 0;
>  	if (!dev->guest_pages) {
> +		dev->nr_guest_pages = 0;

We cannot just move this line here, which results in a wrong nr_guest_pages if we do hot plug/unplug instead of initialization.

>  		dev->max_guest_pages = 8;
>  		dev->guest_pages = malloc(dev->max_guest_pages *
>  						sizeof(struct guest_page));
> @@ -532,7 +558,7 @@ vhost_user_set_mem_table(struct virtio_net *dev,
> struct VhostUserMsg *pmsg)
>  		}
>  	}
> 
> -	dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct
> rte_vhost_memory) +
> +	dev->mem = rte_realloc(dev->mem, sizeof(struct
> rte_vhost_memory) +
>  		sizeof(struct rte_vhost_mem_region) * memory.nregions, 0);
>  	if (dev->mem == NULL) {
>  		RTE_LOG(ERR, VHOST_CONFIG,
> @@ -546,6 +572,38 @@ vhost_user_set_mem_table(struct virtio_net *dev,
> struct VhostUserMsg *pmsg)
>  		fd  = pmsg->fds[i];
>  		reg = &dev->mem->regions[i];
> 
> +		/* This region should be skipped if it is initialized before */
> +		if (reg->guest_user_addr ==
> memory.regions[i].userspace_addr) {

Also problematic for hot unplug. For example,
T0: we have three regions, 1, 2, 3.
T1: remove region 2, with only 1, 3 left.

1st iteration, 1 matches 1, skip old region 1.
2nd iteration, 3 does not match 2, do the mmap.


> +			alignment = get_blk_size(fd);
> +			if (alignment == (uint64_t)-1) {
> +				RTE_LOG(ERR, VHOST_CONFIG,
> +					"couldn't get hugepage size "
> +					"through fstat\n");
> +				goto err_mmap;
> +			}
> +			RTE_LOG(INFO, VHOST_CONFIG,
> +				"guest memory region(old)"
> +				" %u, size: 0x%" PRIx64 "\n"
> +				"\t guest physical addr: 0x%" PRIx64 "\n"
> +				"\t guest virtual  addr: 0x%" PRIx64 "\n"
> +				"\t host  virtual  addr: 0x%" PRIx64 "\n"
> +				"\t mmap addr : 0x%" PRIx64 "\n"
> +				"\t mmap size : 0x%" PRIx64 "\n"
> +				"\t mmap align: 0x%" PRIx64 "\n"
> +				"\t mmap off  : 0x%" PRIx64 "\n",
> +				i, reg->size,
> +				reg->guest_phys_addr,
> +				reg->guest_user_addr,
> +				reg->host_user_addr,
> +				(uint64_t)(uintptr_t)reg->mmap_addr,
> +				reg->mmap_size,
> +				alignment,
> +				(uint64_t)(uintptr_t)reg->host_user_addr -
> +					(uint64_t)(uintptr_t)reg-
> >mmap_addr
> +				);
> +				continue;
> +		}
> +
>  		reg->guest_phys_addr =
> memory.regions[i].guest_phys_addr;
>  		reg->guest_user_addr = memory.regions[i].userspace_addr;
>  		reg->size            = memory.regions[i].memory_size;
> --
> 2.1.0
  
Yi Yang Sept. 25, 2017, 2:29 a.m. UTC | #2
On Fri, Sep 22, 2017 at 01:28:48PM +0800, Tan, Jianfeng wrote:
> 
> 
> > +				}
> > +				reg->guest_user_addr = 0;
> > +				j++;
> > +			} else {
> > +				i++;
> > +				j++;
> > +			}
> > +		}
> 
> The algorithm here could be problematic for hot plug. For example,
> 
> T0: we have two regions with address 1, 3.
> T1: we add another region, 2, which makes the regions as 1, 2, 3.
> 
> 1st iteration, 1 matches 1, i++, j++;
> 2nd iteration, 2 does not match 3, unmap 3, j++.

Jianfeng, thank you so much for your comments, but per my understanding
such hot add is impossible, this will result in discontinuous physical
address. Do you know how we can trigger such use case?

> 
> > +
> > +		/* munmap these regions because they have been hot
> > unplugged */
> > +		for (; j < dev->mem->nregions; j++) {
> > +			reg = &dev->mem->regions[j];
> > +			if (reg->host_user_addr) {
> > +				munmap(reg->mmap_addr, reg-
> > >mmap_size);
> > +				close(reg->fd);
> > +			}
> > +			reg->guest_user_addr = 0;
> > +		}
> >  	}
> > 
> > -	dev->nr_guest_pages = 0;
> >  	if (!dev->guest_pages) {
> > +		dev->nr_guest_pages = 0;
> 
> We cannot just move this line here, which results in a wrong nr_guest_pages if we do hot plug/unplug instead of initialization.

For initialization, dev->guest_pages is 0, for hot plug/unplug,
dev->guest_pages isn't 0, so it is ok.

> 
> >  		dev->max_guest_pages = 8;
> >  		dev->guest_pages = malloc(dev->max_guest_pages *
> >  						sizeof(struct guest_page));
> > @@ -532,7 +558,7 @@ vhost_user_set_mem_table(struct virtio_net *dev,
> > struct VhostUserMsg *pmsg)
> >  		}
> >  	}
> > 
> > -	dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct
> > rte_vhost_memory) +
> > +	dev->mem = rte_realloc(dev->mem, sizeof(struct
> > rte_vhost_memory) +
> >  		sizeof(struct rte_vhost_mem_region) * memory.nregions, 0);
> >  	if (dev->mem == NULL) {
> >  		RTE_LOG(ERR, VHOST_CONFIG,
> > @@ -546,6 +572,38 @@ vhost_user_set_mem_table(struct virtio_net *dev,
> > struct VhostUserMsg *pmsg)
> >  		fd  = pmsg->fds[i];
> >  		reg = &dev->mem->regions[i];
> > 
> > +		/* This region should be skipped if it is initialized before */
> > +		if (reg->guest_user_addr ==
> > memory.regions[i].userspace_addr) {
> 
> Also problematic for hot unplug. For example,
> T0: we have three regions, 1, 2, 3.
> T1: remove region 2, with only 1, 3 left.
> 
> 1st iteration, 1 matches 1, skip old region 1.
> 2nd iteration, 3 does not match 2, do the mmap.

It is ok, I have verified it, let us have a discussion in meeting.
  
Tiwei Bie Sept. 25, 2017, 4:46 a.m. UTC | #3
Hi Yi,

On Wed, Sep 20, 2017 at 04:32:03PM +0800, Yi Yang wrote:
> Usually vhost_user message VHOST_USER_SET_MEM_TABLE
> is only sent out during initialization and only sent
> once, but it isn't so for memory hotplug and hotunplug
> , for that case, vhost_user message VHOST_USER_SET_MEM_TABLE
> will be resent with the old memory regions (not hotunplugged)
> and the new memory regions (hotplugged), so current
> vhost_user_set_mem_table implementation is wrong for
> memory hotplug and hotunplug case, it will free current
> memory regions and unmap hugepages no matter if they
> are be using or not and if they are memory regions which
> have been initialized and mapped in the previous vhost_user
> message VHOST_USER_SET_MEM_TABLE or not.
> 
> This commit fixed this issue very well, it will keep them
> intact for those old memory region it will unmap those
> memroy regions if they have been hotunplugged, it will
> initialize the new hotplugged memory regions and map
> hugepages if they are hotplugged.
> 
> vhost_user message VHOST_USER_SET_MEM_TABLE will include
> all the current effective memory regions, the hotunplugged
> memory regions won't be included in VHOST_USER_SET_MEM_TABLE,
> the hotplugged memroy regions will be included in
> VHOST_USER_SET_MEM_TABLE.
> 
> This has been verified in OVS DPDK by memory hotplug and
> hotunplug in qemu.
> 
> Signed-off-by: Yi Yang <yi.y.yang@intel.com>
> ---
>  lib/librte_vhost/vhost_user.c | 72 ++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 65 insertions(+), 7 deletions(-)
> 
> diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
> index ad2e8d3..1c475d1 100644
> --- a/lib/librte_vhost/vhost_user.c
> +++ b/lib/librte_vhost/vhost_user.c
[...]
> @@ -532,7 +558,7 @@ vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
>  		}
>  	}
>  
> -	dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) +
> +	dev->mem = rte_realloc(dev->mem, sizeof(struct rte_vhost_memory) +

The rte_realloc() will free the memory pointed by the dev->mem.
But it's possible that some other threads are using it (e.g. by
calling rte_vhost_gpa_to_vva()). It's an issue that needs to be
fixed in this patch.

Best regards,
Tiwei Bie
  

Patch

diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index ad2e8d3..1c475d1 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -509,17 +509,43 @@  vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
 	uint64_t mmap_size;
 	uint64_t mmap_offset;
 	uint64_t alignment;
-	uint32_t i;
+	uint32_t i, j;
 	int fd;
 
-	if (dev->mem) {
-		free_mem_region(dev);
-		rte_free(dev->mem);
-		dev->mem = NULL;
+	/* Handle hot unplug case */
+	if (dev->mem && dev->mem->nregions) {
+		i = 0;
+		j = 0;
+		while ((i < memory.nregions) && (j < dev->mem->nregions)) {
+			reg = &dev->mem->regions[j];
+			/* munmap this region if it is hot unplugged */
+			if (reg->guest_user_addr
+				!= memory.regions[i].userspace_addr) {
+				if (reg->host_user_addr) {
+					munmap(reg->mmap_addr, reg->mmap_size);
+					close(reg->fd);
+				}
+				reg->guest_user_addr = 0;
+				j++;
+			} else {
+				i++;
+				j++;
+			}
+		}
+
+		/* munmap these regions because they have been hot unplugged */
+		for (; j < dev->mem->nregions; j++) {
+			reg = &dev->mem->regions[j];
+			if (reg->host_user_addr) {
+				munmap(reg->mmap_addr, reg->mmap_size);
+				close(reg->fd);
+			}
+			reg->guest_user_addr = 0;
+		}
 	}
 
-	dev->nr_guest_pages = 0;
 	if (!dev->guest_pages) {
+		dev->nr_guest_pages = 0;
 		dev->max_guest_pages = 8;
 		dev->guest_pages = malloc(dev->max_guest_pages *
 						sizeof(struct guest_page));
@@ -532,7 +558,7 @@  vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
 		}
 	}
 
-	dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) +
+	dev->mem = rte_realloc(dev->mem, sizeof(struct rte_vhost_memory) +
 		sizeof(struct rte_vhost_mem_region) * memory.nregions, 0);
 	if (dev->mem == NULL) {
 		RTE_LOG(ERR, VHOST_CONFIG,
@@ -546,6 +572,38 @@  vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
 		fd  = pmsg->fds[i];
 		reg = &dev->mem->regions[i];
 
+		/* This region should be skipped if it is initialized before */
+		if (reg->guest_user_addr == memory.regions[i].userspace_addr) {
+			alignment = get_blk_size(fd);
+			if (alignment == (uint64_t)-1) {
+				RTE_LOG(ERR, VHOST_CONFIG,
+					"couldn't get hugepage size "
+					"through fstat\n");
+				goto err_mmap;
+			}
+			RTE_LOG(INFO, VHOST_CONFIG,
+				"guest memory region(old)"
+				" %u, size: 0x%" PRIx64 "\n"
+				"\t guest physical addr: 0x%" PRIx64 "\n"
+				"\t guest virtual  addr: 0x%" PRIx64 "\n"
+				"\t host  virtual  addr: 0x%" PRIx64 "\n"
+				"\t mmap addr : 0x%" PRIx64 "\n"
+				"\t mmap size : 0x%" PRIx64 "\n"
+				"\t mmap align: 0x%" PRIx64 "\n"
+				"\t mmap off  : 0x%" PRIx64 "\n",
+				i, reg->size,
+				reg->guest_phys_addr,
+				reg->guest_user_addr,
+				reg->host_user_addr,
+				(uint64_t)(uintptr_t)reg->mmap_addr,
+				reg->mmap_size,
+				alignment,
+				(uint64_t)(uintptr_t)reg->host_user_addr -
+					(uint64_t)(uintptr_t)reg->mmap_addr
+				);
+				continue;
+		}
+
 		reg->guest_phys_addr = memory.regions[i].guest_phys_addr;
 		reg->guest_user_addr = memory.regions[i].userspace_addr;
 		reg->size            = memory.regions[i].memory_size;