[dpdk-dev,v2,1/5] mem: add --single-file to create single mem-backed file

Message ID 1454671228-33284-2-git-send-email-jianfeng.tan@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon

Commit Message

Jianfeng Tan Feb. 5, 2016, 11:20 a.m. UTC
  Originally, there are two drawbacks to using hugepages: a. it needs root
privilege to touch /proc/self/pagemap, which is a prerequisite for
allocating physically contiguous memsegs; b. possibly too many
hugepage files are created, especially when used with 2M hugepages.

Virtual devices do not care about the physical contiguity of
allocated hugepages at all. The --single-file option provides a way
to allocate all hugepages into a single memory-backed file.

Known issues:
a. the single-file option relies on the kernel to allocate NUMA-affinitive
memory.
b. possible ABI break: originally, --no-huge uses anonymous memory
instead of a file-backed mapping to create memory.

Signed-off-by: Huawei Xie <huawei.xie@intel.com>
Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
 lib/librte_eal/common/eal_common_options.c | 17 ++++++++++
 lib/librte_eal/common/eal_internal_cfg.h   |  1 +
 lib/librte_eal/common/eal_options.h        |  2 ++
 lib/librte_eal/linuxapp/eal/eal.c          |  4 +--
 lib/librte_eal/linuxapp/eal/eal_memory.c   | 50 +++++++++++++++++++++++++-----
 5 files changed, 64 insertions(+), 10 deletions(-)
  

Comments

Yuanhan Liu March 7, 2016, 1:13 p.m. UTC | #1
CC'ed EAL hugepage maintainer, which is something you should do when
sending a patch.

On Fri, Feb 05, 2016 at 07:20:24PM +0800, Jianfeng Tan wrote:
> Originally, there're two cons in using hugepage: a. needs root
> privilege to touch /proc/self/pagemap, which is a premise to
> alllocate physically contiguous memseg; b. possibly too many
> hugepage file are created, especially used with 2M hugepage.
> 
> For virtual devices, they don't care about physical-contiguity
> of allocated hugepages at all. Option --single-file is to
> provide a way to allocate all hugepages into single mem-backed
> file.
> 
> Known issue:
> a. single-file option relys on kernel to allocate numa-affinitive
> memory.
> b. possible ABI break, originally, --no-huge uses anonymous memory
> instead of file-backed way to create memory.
> 
> Signed-off-by: Huawei Xie <huawei.xie@intel.com>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
...
> @@ -956,6 +961,16 @@ eal_check_common_options(struct internal_config *internal_cfg)
>  			"be specified together with --"OPT_NO_HUGE"\n");
>  		return -1;
>  	}
> +	if (internal_cfg->single_file && internal_cfg->force_sockets == 1) {
> +		RTE_LOG(ERR, EAL, "Option --"OPT_SINGLE_FILE" cannot "
> +			"be specified together with --"OPT_SOCKET_MEM"\n");
> +		return -1;
> +	}
> +	if (internal_cfg->single_file && internal_cfg->hugepage_unlink) {
> +		RTE_LOG(ERR, EAL, "Option --"OPT_HUGE_UNLINK" cannot "
> +			"be specified together with --"OPT_SINGLE_FILE"\n");
> +		return -1;
> +	}

These two limitations don't make sense to me.

> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index 6008533..68ef49a 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -1102,20 +1102,54 @@ rte_eal_hugepage_init(void)
>  	/* get pointer to global configuration */
>  	mcfg = rte_eal_get_configuration()->mem_config;
>  
> -	/* hugetlbfs can be disabled */
> -	if (internal_config.no_hugetlbfs) {
> -		addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
> -				MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
> +	/* when hugetlbfs is disabled or single-file option is specified */
> +	if (internal_config.no_hugetlbfs || internal_config.single_file) {
> +		int fd;
> +		uint64_t pagesize;
> +		unsigned socket_id = rte_socket_id();
> +		char filepath[MAX_HUGEPAGE_PATH];
> +
> +		if (internal_config.no_hugetlbfs) {
> +			eal_get_hugefile_path(filepath, sizeof(filepath),
> +					      "/dev/shm", 0);
> +			pagesize = RTE_PGSIZE_4K;
> +		} else {
> +			struct hugepage_info *hpi;
> +
> +			hpi = &internal_config.hugepage_info[0];
> +			eal_get_hugefile_path(filepath, sizeof(filepath),
> +					      hpi->hugedir, 0);
> +			pagesize = hpi->hugepage_sz;
> +		}
> +		fd = open(filepath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
> +		if (fd < 0) {
> +			RTE_LOG(ERR, EAL, "%s: open %s failed: %s\n",
> +				__func__, filepath, strerror(errno));
> +			return -1;
> +		}
> +
> +		if (ftruncate(fd, internal_config.memory) < 0) {
> +			RTE_LOG(ERR, EAL, "ftuncate %s failed: %s\n",
> +				filepath, strerror(errno));
> +			return -1;
> +		}
> +
> +		addr = mmap(NULL, internal_config.memory,
> +			    PROT_READ | PROT_WRITE,
> +			    MAP_SHARED | MAP_POPULATE, fd, 0);
>  		if (addr == MAP_FAILED) {
> -			RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
> -					strerror(errno));
> +			RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n",
> +				__func__, strerror(errno));
>  			return -1;
>  		}
>  		mcfg->memseg[0].phys_addr = (phys_addr_t)(uintptr_t)addr;
>  		mcfg->memseg[0].addr = addr;
> -		mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
> +		mcfg->memseg[0].hugepage_sz = pagesize;
>  		mcfg->memseg[0].len = internal_config.memory;
> -		mcfg->memseg[0].socket_id = 0;
> +		mcfg->memseg[0].socket_id = socket_id;

I see quite a few issues:

- Assume I have a system with two hugepage sizes: 1G (x4) and 2M (x512),
  mounted at /dev/hugepages and /mnt, respectively.

  We then end up with 5G of internal_config.memory, and your code will
  try to mmap 5G on the first mount point (/dev/hugepages) due to the
  hardcoded logic in your code:

      hpi = &internal_config.hugepage_info[0];
      eal_get_hugefile_path(filepath, sizeof(filepath),
      		      hpi->hugedir, 0);

  But it only has 4G in total, therefore it will fail.

- As you stated, socket_id is hardcoded, which could be wrong.

- As stated above, the option limitations don't seem right to me.

  I mean, --single-file should semantically be able to work with the
  --socket-mem option.


And I have been thinking about how to deal with those issues properly, and a
__very immature__ solution came to my mind (which could simply not be
working), but anyway, here it is FYI: we go through the same process used
to handle normal huge page initialization for the --single-file option as
well, but take different actions, or no action at all, at some stages when
that option is given, which is a bit similar to the way
RTE_EAL_SINGLE_FILE_SEGMENTS is handled.

And we create one hugepage file for each node and each page size. For a
system like mine above (2 nodes), it may populate files like the following:

- 1G x 2 on node0
- 1G x 2 on node1
- 2M x 256 on node0
- 2M x 256 on node1

That could normally fit your case. Though 4 nodes looks like the maximum
node number, the --socket-mem option may relieve the limit a bit.

And if we "could" avoid caring about the socket_id being set correctly, we
could simply allocate one file for each hugepage size. That would work
well for your container enabling.

BTW, since we already have the SINGLE_FILE_SEGMENTS (config) option, adding
another option --single-file looks really confusing to me.

To me, maybe you could build on the SINGLE_FILE_SEGMENTS option and add
another option, say --no-sort (I confess this name sucks, but you get
my point). With that, we could make sure to create as few huge page
files as possible, to fit your case.

	--yliu
  
Jianfeng Tan March 8, 2016, 1:55 a.m. UTC | #2
Hi Yuanhan,

On 3/7/2016 9:13 PM, Yuanhan Liu wrote:
> CC'ed EAL hugepage maintainer, which is something you should do when
> send a patch.

Thanks for doing this.

>
> On Fri, Feb 05, 2016 at 07:20:24PM +0800, Jianfeng Tan wrote:
>> Originally, there're two cons in using hugepage: a. needs root
>> privilege to touch /proc/self/pagemap, which is a premise to
>> alllocate physically contiguous memseg; b. possibly too many
>> hugepage file are created, especially used with 2M hugepage.
>>
>> For virtual devices, they don't care about physical-contiguity
>> of allocated hugepages at all. Option --single-file is to
>> provide a way to allocate all hugepages into single mem-backed
>> file.
>>
>> Known issue:
>> a. single-file option relys on kernel to allocate numa-affinitive
>> memory.
>> b. possible ABI break, originally, --no-huge uses anonymous memory
>> instead of file-backed way to create memory.
>>
>> Signed-off-by: Huawei Xie <huawei.xie@intel.com>
>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ...
>> @@ -956,6 +961,16 @@ eal_check_common_options(struct internal_config *internal_cfg)
>>   			"be specified together with --"OPT_NO_HUGE"\n");
>>   		return -1;
>>   	}
>> +	if (internal_cfg->single_file && internal_cfg->force_sockets == 1) {
>> +		RTE_LOG(ERR, EAL, "Option --"OPT_SINGLE_FILE" cannot "
>> +			"be specified together with --"OPT_SOCKET_MEM"\n");
>> +		return -1;
>> +	}
>> +	if (internal_cfg->single_file && internal_cfg->hugepage_unlink) {
>> +		RTE_LOG(ERR, EAL, "Option --"OPT_HUGE_UNLINK" cannot "
>> +			"be specified together with --"OPT_SINGLE_FILE"\n");
>> +		return -1;
>> +	}
> The two limitation doesn't make sense to me.

For the force_sockets option, my original thought on the --single-file
option is that we don't sort those pages (which requires root/cap_sys_admin;
see the sketch below) and don't even look up NUMA information, because the
file may contain memory from both sockets.
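
For context, here is a minimal, hedged sketch (not the EAL implementation)
of why that privilege is needed: the physical frame number of a virtual page
is read from /proc/self/pagemap, and recent kernels hide the PFN field from
unprivileged readers, so sorting pages by physical address cannot work
without root/cap_sys_admin.

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

/* Illustration only: resolve a virtual address to a physical one via
 * /proc/self/pagemap. Each virtual page has a 64-bit entry; bits 0-54
 * hold the page frame number, bit 63 the "present" flag. PAGE_SZ is a
 * stand-in for sysconf(_SC_PAGESIZE). */
#define PAGE_SZ 4096UL

static uint64_t
virt2phys(const void *virt)
{
	uint64_t entry, vpn = (uintptr_t)virt / PAGE_SZ;
	uint64_t phys = 0;
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0)
		return 0;
	if (pread(fd, &entry, sizeof(entry), vpn * sizeof(entry)) ==
	    (ssize_t)sizeof(entry) && (entry & (1ULL << 63)))
		phys = (entry & ((1ULL << 55) - 1)) * PAGE_SZ +
		       (uintptr_t)virt % PAGE_SZ;
	close(fd);
	return phys;
}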

For the hugepage_unlink option, those hugepage files get closed at the
end of memory initialization; if we unlink those hugepage files as well,
we cannot share them with other processes (say, the backend).

>
>> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
>> index 6008533..68ef49a 100644
>> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
>> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
>> @@ -1102,20 +1102,54 @@ rte_eal_hugepage_init(void)
>>   	/* get pointer to global configuration */
>>   	mcfg = rte_eal_get_configuration()->mem_config;
>>   
>> -	/* hugetlbfs can be disabled */
>> -	if (internal_config.no_hugetlbfs) {
>> -		addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
>> -				MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
>> +	/* when hugetlbfs is disabled or single-file option is specified */
>> +	if (internal_config.no_hugetlbfs || internal_config.single_file) {
>> +		int fd;
>> +		uint64_t pagesize;
>> +		unsigned socket_id = rte_socket_id();
>> +		char filepath[MAX_HUGEPAGE_PATH];
>> +
>> +		if (internal_config.no_hugetlbfs) {
>> +			eal_get_hugefile_path(filepath, sizeof(filepath),
>> +					      "/dev/shm", 0);
>> +			pagesize = RTE_PGSIZE_4K;
>> +		} else {
>> +			struct hugepage_info *hpi;
>> +
>> +			hpi = &internal_config.hugepage_info[0];
>> +			eal_get_hugefile_path(filepath, sizeof(filepath),
>> +					      hpi->hugedir, 0);
>> +			pagesize = hpi->hugepage_sz;
>> +		}
>> +		fd = open(filepath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
>> +		if (fd < 0) {
>> +			RTE_LOG(ERR, EAL, "%s: open %s failed: %s\n",
>> +				__func__, filepath, strerror(errno));
>> +			return -1;
>> +		}
>> +
>> +		if (ftruncate(fd, internal_config.memory) < 0) {
>> +			RTE_LOG(ERR, EAL, "ftuncate %s failed: %s\n",
>> +				filepath, strerror(errno));
>> +			return -1;
>> +		}
>> +
>> +		addr = mmap(NULL, internal_config.memory,
>> +			    PROT_READ | PROT_WRITE,
>> +			    MAP_SHARED | MAP_POPULATE, fd, 0);
>>   		if (addr == MAP_FAILED) {
>> -			RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
>> -					strerror(errno));
>> +			RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n",
>> +				__func__, strerror(errno));
>>   			return -1;
>>   		}
>>   		mcfg->memseg[0].phys_addr = (phys_addr_t)(uintptr_t)addr;
>>   		mcfg->memseg[0].addr = addr;
>> -		mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
>> +		mcfg->memseg[0].hugepage_sz = pagesize;
>>   		mcfg->memseg[0].len = internal_config.memory;
>> -		mcfg->memseg[0].socket_id = 0;
>> +		mcfg->memseg[0].socket_id = socket_id;
> I saw quite few issues:
>
> - Assume I have a system with two hugepage sizes: 1G (x4) and 2M (x512),
>    mounted at /dev/hugepages and /mnt, respectively.
>
>    Here we then got an 5G internal_config.memory, and your code will
>    try to mmap 5G on the first mount point (/dev/hugepages) due to the
>    hardcode logic in your code:
>
>        hpi = &internal_config.hugepage_info[0];
>        eal_get_hugefile_path(filepath, sizeof(filepath),
>        		      hpi->hugedir, 0);
>
>    But it has 4G in total, therefore, it will fails.

As mentioned above, this case is not covered by the original design of --single-file.

>
> - As you stated, socket_id is hardcoded, which could be wrong.

We rely on the OS to allocate hugepages, and cannot promise that the
physical hugepages in the big hugepage file are from the same socket.

>
> - As stated in above, the option limitation doesn't seem right to me.
>
>    I mean, --single-file should be able to work with --socket-mem option
>    in semantic.

If we'd like to work well with the --socket-mem option, we need to use
syscalls like set_mempolicy() and mbind(). That would bring a bigger change
compared to the current one; see the sketch below. I don't know if it's
acceptable?
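
For illustration, a minimal sketch (not part of this patch) of what such a
call could look like; it assumes the single backing file has already been
mmap'ed, and needs <numaif.h> plus the -lnuma link flag mentioned later in
this thread:

#include <stddef.h>
#include <sys/mman.h>
#include <numaif.h>	/* mbind(), MPOL_BIND; links against -lnuma */

/* Illustration only: ask the kernel to place (or migrate) the pages of an
 * already mmap'ed region on one NUMA node. Note the patch maps with
 * MAP_POPULATE, so the pages are faulted in at mmap() time; a real
 * implementation would either bind before touching the memory or pass
 * MPOL_MF_MOVE, as done here, to migrate already-allocated pages. */
static int
bind_region_to_node(void *addr, size_t len, unsigned int node)
{
	unsigned long nodemask = 1UL << node;

	return mbind(addr, len, MPOL_BIND, &nodemask,
		     sizeof(nodemask) * 8, MPOL_MF_MOVE);
}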

>
>
> And I have been thinking how to deal with those issues properly, and a
> __very immature__ solution come to my mind (which could be simply not
> working), but anyway, here is FYI: we go through the same process to
> handle normal huge page initilization to --single-file option as well.
> But we take different actions or no actions at all at some stages when
> that option is given, which is a bit similiar with the way of handling
> RTE_EAL_SINGLE_FILE_SEGMENTS.
>
> And we create one hugepage file for each node, each page size. For a
> system like mine above (2 nodes), it may populate files like following:
>
> - 1G x 2 on node0
> - 1G x 2 on node1
> - 2M x 256 on node0
> - 2M x 256 on node1
>
> That could normally fit your case. Though 4 nodes looks like the maximum
> node number, --socket-mem option may relieve the limit a bit.
>
> And if we "could" not care the socket_id being set correctly, we could
> just simply allocate one file for each hugepage size. That would work
> well for your container enabling.

This way seems like a good option at first sight. Let's compare this new way
with the original design.

The original design just covers the simplest scenario:
a. just one hugetlbfs (the new way can support multiple hugetlbfs mounts)
b. does not require root privilege (the new way can achieve this by using
the above-mentioned mbind() or set_mempolicy() syscall)
c. no sorting (both ways are OK)
d. performance: from the perspective of virtio for containers, we care
mostly about the performance of address translation in the vhost. In the
vhost, we currently use an O(n) linear comparison to translate an address
(this can be optimized to O(log n) using a segment tree, or even better
using a cache; sorry, that's another problem), so we should maintain as
few files as possible (see the sketch after this list). (the new way can
achieve this when used with --socket-mem, --huge-dir)
e. NUMA awareness is not required (and it's complex). (the new way can
solve this without any promise)
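
To illustrate point (d), here is a hedged sketch of that O(n) translation;
the region layout and names are simplified stand-ins, not the actual vhost
structures:

#include <stdint.h>

/* Illustration only: linear scan over guest memory regions, one entry per
 * backing file/region shared by the frontend. Fewer files means fewer
 * iterations per address translation. */
struct guest_mem_region {
	uint64_t guest_phys_addr;	/* start of region in guest physical space */
	uint64_t size;			/* region length in bytes */
	uint64_t host_user_addr;	/* where the region is mapped in the vhost process */
};

static uint64_t
gpa_to_vva(const struct guest_mem_region *reg, unsigned int nregions,
	   uint64_t gpa)
{
	unsigned int i;

	for (i = 0; i < nregions; i++) {
		if (gpa >= reg[i].guest_phys_addr &&
		    gpa < reg[i].guest_phys_addr + reg[i].size)
			return gpa - reg[i].guest_phys_addr +
			       reg[i].host_user_addr;
	}
	return 0;	/* not found */
}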

All in all, this new way seems great to me.

Another thing: if we "go through the same process to handle normal huge
page initialization", my concern is that RTE_EAL_SINGLE_FILE_SEGMENTS
goes that way to maximize code reuse, but the new way has little code in
common with the original paths, and mixing these options together leads
to bad readability. What do you think?

>
> BTW, since we already have SINGLE_FILE_SEGMENTS (config) option, adding
> another option --single-file looks really confusing to me.
>
> To me, maybe you could base the SINGLE_FILE_SEGMENTS option, and add
> another option, say --no-sort (I confess this name sucks, but you get
> my point). With that, we could make sure to create as least huge page
> files as possible, to fit your case.

This is great advice. So what do you think of --converged or
--no-scattered-mem, or is there any better idea?

Thanks for valuable input.

Jianfeng

>
> 	--yliu
  
Yuanhan Liu March 8, 2016, 2:44 a.m. UTC | #3
On Tue, Mar 08, 2016 at 09:55:10AM +0800, Tan, Jianfeng wrote:
> Hi Yuanhan,
> 
> On 3/7/2016 9:13 PM, Yuanhan Liu wrote:
> >CC'ed EAL hugepage maintainer, which is something you should do when
> >send a patch.
> 
> Thanks for doing this.
> 
> >
> >On Fri, Feb 05, 2016 at 07:20:24PM +0800, Jianfeng Tan wrote:
> >>Originally, there're two cons in using hugepage: a. needs root
> >>privilege to touch /proc/self/pagemap, which is a premise to
> >>alllocate physically contiguous memseg; b. possibly too many
> >>hugepage file are created, especially used with 2M hugepage.
> >>
> >>For virtual devices, they don't care about physical-contiguity
> >>of allocated hugepages at all. Option --single-file is to
> >>provide a way to allocate all hugepages into single mem-backed
> >>file.
> >>
> >>Known issue:
> >>a. single-file option relys on kernel to allocate numa-affinitive
> >>memory.
> >>b. possible ABI break, originally, --no-huge uses anonymous memory
> >>instead of file-backed way to create memory.
> >>
> >>Signed-off-by: Huawei Xie <huawei.xie@intel.com>
> >>Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> >...
> >>@@ -956,6 +961,16 @@ eal_check_common_options(struct internal_config *internal_cfg)
> >>  			"be specified together with --"OPT_NO_HUGE"\n");
> >>  		return -1;
> >>  	}
> >>+	if (internal_cfg->single_file && internal_cfg->force_sockets == 1) {
> >>+		RTE_LOG(ERR, EAL, "Option --"OPT_SINGLE_FILE" cannot "
> >>+			"be specified together with --"OPT_SOCKET_MEM"\n");
> >>+		return -1;
> >>+	}
> >>+	if (internal_cfg->single_file && internal_cfg->hugepage_unlink) {
> >>+		RTE_LOG(ERR, EAL, "Option --"OPT_HUGE_UNLINK" cannot "
> >>+			"be specified together with --"OPT_SINGLE_FILE"\n");
> >>+		return -1;
> >>+	}
> >The two limitation doesn't make sense to me.
> 
> For the force_sockets option, my original thought on --single-file option
> is, we don't sort those pages (require root/cap_sys_admin) and even don't
> look up numa information because it may contain both sockets' memory.
> 
> For the hugepage_unlink option, those hugepage files get closed in the end
> of memory initialization, if we even unlink those hugepage files, so we
> cannot share those with other processes (say backend).

Yeah, I know where the two limitations come from: your implementation. I
was just wondering whether they both are __truly__ limitations. I mean,
can we get rid of them somehow?

For the --socket-mem option, if we can't handle it well, or if we could
ignore the socket_id of the allocated huge pages, then yes, the limitation
is a true one.

But for the second option, no, we should be able to make it work well.
One extra action is that you should not invoke "close(fd)" for those
huge page files. Then you can get all the information, as I stated
in a reply to your 2nd patch. (A sketch of sharing such an fd with a
backend follows below.)
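
For reference, a hedged sketch (not part of this patch set) of the standard
way a still-open hugepage fd can be handed to another process over a
connected Unix domain socket; the backend can then mmap() the same file:

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Illustration only: send one open file descriptor as SCM_RIGHTS ancillary
 * data. The receiver gets its own fd referring to the same hugepage file
 * and can mmap() it to share the memory. */
static int
send_hugepage_fd(int unix_sock, int mem_fd)
{
	struct msghdr msg;
	struct iovec iov;
	struct cmsghdr *cmsg;
	char control[CMSG_SPACE(sizeof(int))];
	char payload = 0;	/* at least one byte of real data is required */

	memset(&msg, 0, sizeof(msg));
	memset(control, 0, sizeof(control));
	iov.iov_base = &payload;
	iov.iov_len = 1;
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = control;
	msg.msg_controllen = sizeof(control);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &mem_fd, sizeof(int));

	return sendmsg(unix_sock, &msg, 0) < 0 ? -1 : 0;
}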

> >
> >>diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
> >>index 6008533..68ef49a 100644
> >>--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> >>+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> >>@@ -1102,20 +1102,54 @@ rte_eal_hugepage_init(void)
> >>  	/* get pointer to global configuration */
> >>  	mcfg = rte_eal_get_configuration()->mem_config;
> >>-	/* hugetlbfs can be disabled */
> >>-	if (internal_config.no_hugetlbfs) {
> >>-		addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
> >>-				MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
> >>+	/* when hugetlbfs is disabled or single-file option is specified */
> >>+	if (internal_config.no_hugetlbfs || internal_config.single_file) {
> >>+		int fd;
> >>+		uint64_t pagesize;
> >>+		unsigned socket_id = rte_socket_id();
> >>+		char filepath[MAX_HUGEPAGE_PATH];
> >>+
> >>+		if (internal_config.no_hugetlbfs) {
> >>+			eal_get_hugefile_path(filepath, sizeof(filepath),
> >>+					      "/dev/shm", 0);
> >>+			pagesize = RTE_PGSIZE_4K;
> >>+		} else {
> >>+			struct hugepage_info *hpi;
> >>+
> >>+			hpi = &internal_config.hugepage_info[0];
> >>+			eal_get_hugefile_path(filepath, sizeof(filepath),
> >>+					      hpi->hugedir, 0);
> >>+			pagesize = hpi->hugepage_sz;
> >>+		}
> >>+		fd = open(filepath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
> >>+		if (fd < 0) {
> >>+			RTE_LOG(ERR, EAL, "%s: open %s failed: %s\n",
> >>+				__func__, filepath, strerror(errno));
> >>+			return -1;
> >>+		}
> >>+
> >>+		if (ftruncate(fd, internal_config.memory) < 0) {
> >>+			RTE_LOG(ERR, EAL, "ftuncate %s failed: %s\n",
> >>+				filepath, strerror(errno));
> >>+			return -1;
> >>+		}
> >>+
> >>+		addr = mmap(NULL, internal_config.memory,
> >>+			    PROT_READ | PROT_WRITE,
> >>+			    MAP_SHARED | MAP_POPULATE, fd, 0);
> >>  		if (addr == MAP_FAILED) {
> >>-			RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
> >>-					strerror(errno));
> >>+			RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n",
> >>+				__func__, strerror(errno));
> >>  			return -1;
> >>  		}
> >>  		mcfg->memseg[0].phys_addr = (phys_addr_t)(uintptr_t)addr;
> >>  		mcfg->memseg[0].addr = addr;
> >>-		mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
> >>+		mcfg->memseg[0].hugepage_sz = pagesize;
> >>  		mcfg->memseg[0].len = internal_config.memory;
> >>-		mcfg->memseg[0].socket_id = 0;
> >>+		mcfg->memseg[0].socket_id = socket_id;
> >I saw quite few issues:
> >
> >- Assume I have a system with two hugepage sizes: 1G (x4) and 2M (x512),
> >   mounted at /dev/hugepages and /mnt, respectively.
> >
> >   Here we then got an 5G internal_config.memory, and your code will
> >   try to mmap 5G on the first mount point (/dev/hugepages) due to the
> >   hardcode logic in your code:
> >
> >       hpi = &internal_config.hugepage_info[0];
> >       eal_get_hugefile_path(filepath, sizeof(filepath),
> >       		      hpi->hugedir, 0);
> >
> >   But it has 4G in total, therefore, it will fails.
> 
> As mentioned above, this case is not for original design of --single-file.

But it's such a common case, isn't it?

> >
> >- As you stated, socket_id is hardcoded, which could be wrong.
> 
> We rely on OS to allocate hugepages, and cannot promise physical hugepages
> in the big hugepage file are from the same socket.
> 
> >
> >- As stated in above, the option limitation doesn't seem right to me.
> >
> >   I mean, --single-file should be able to work with --socket-mem option
> >   in semantic.
> 
> If we'd like to work well with --socket-mem option, we need to use syscalls
> like set_mempolicy(), mbind(). So it'll bring bigger change related to
> current one. I don't know if it's acceptable?

Yes, if that's the right way to go. But, as you also stated, I doubt we
really need to handle NUMA affinity here, since it's complex.

> >
> >
> >And I have been thinking how to deal with those issues properly, and a
> >__very immature__ solution come to my mind (which could be simply not
> >working), but anyway, here is FYI: we go through the same process to
> >handle normal huge page initilization to --single-file option as well.
> >But we take different actions or no actions at all at some stages when
> >that option is given, which is a bit similiar with the way of handling
> >RTE_EAL_SINGLE_FILE_SEGMENTS.
> >
> >And we create one hugepage file for each node, each page size. For a
> >system like mine above (2 nodes), it may populate files like following:
> >
> >- 1G x 2 on node0
> >- 1G x 2 on node1
> >- 2M x 256 on node0
> >- 2M x 256 on node1
> >
> >That could normally fit your case. Though 4 nodes looks like the maximum
> >node number, --socket-mem option may relieve the limit a bit.
> >
> >And if we "could" not care the socket_id being set correctly, we could
> >just simply allocate one file for each hugepage size. That would work
> >well for your container enabling.
> 
> This way seems a good option at first sight. Let's compare this new way with
> original design.
> 
> The original design just covers the simplest scenario:
> a. just one hugetlbfs (new way can provide support for multiple number of
> hugetlbfs)
> b. does not require a root privilege (new way can achieve this by using
> above-mentioned mind() or set_mempolicy() syscall)
> c. no sorting (both way are OK)
> d. performance, from the perspective of virtio for container, we take more
> consideration about the performance of address translation in the vhost. In
> the vhost, now we adopt a O(n) linear comparison to translate address (this
> can be optimized to O(logn) using segment tree, or even better using a
> cache, sorry, it's just another problem), so we should maintain as few files
> as possible. (new way can achieve this by used with --socket-mem,
> --huge-dir)
> e. numa aware is not required (and it's complex). (new way can solve this
> without promise)
> 
> In all, this new way seems great for me.
> 
> Another thing is if "go through the same process to handle normal huge page
> initilization", my consideration is: RTE_EAL_SINGLE_FILE_SEGMENTS goes such
> way to maximize code reuse. But the new way has few common code with
> original ways. And mixing these options together leads to bad readability.
> How do you think?

Indeed. I've already found that the code is a bit hard to read, due to the
many "#ifdef ... #else ... #endif" blocks for RTE_EAL_SINGLE_FILE_SEGMENTS
as well as some special archs.

Therefore, I would suggest doing it as below: add another option based
on the SINGLE_FILE_SEGMENTS implementation.

I mean, SINGLE_FILE_SEGMENTS already tries to generate as few files as
possible. If we add another option, say --no-sort (or --no-phys-continuity),
we could add just a few lines of code to let it generate one file for
each huge page size (if we don't consider NUMA affinity).

> >
> >BTW, since we already have SINGLE_FILE_SEGMENTS (config) option, adding
> >another option --single-file looks really confusing to me.
> >
> >To me, maybe you could base the SINGLE_FILE_SEGMENTS option, and add
> >another option, say --no-sort (I confess this name sucks, but you get
> >my point). With that, we could make sure to create as least huge page
> >files as possible, to fit your case.
> 
> This is a great advice. So how do you think of --converged, or
> --no-scattered-mem, or any better idea?

TBH, none of them looks great to me, either. But I have no better
options. Well, --no-phys-continuity looks like the best option to
me so far :)

	--yliu

> 
> Thanks for valuable input.
> 
> Jianfeng
> 
> >
> >	--yliu
  
Panu Matilainen March 8, 2016, 8:49 a.m. UTC | #4
On 03/07/2016 03:13 PM, Yuanhan Liu wrote:
> CC'ed EAL hugepage maintainer, which is something you should do when
> send a patch.
>
> On Fri, Feb 05, 2016 at 07:20:24PM +0800, Jianfeng Tan wrote:
>> Originally, there're two cons in using hugepage: a. needs root
>> privilege to touch /proc/self/pagemap, which is a premise to
>> alllocate physically contiguous memseg; b. possibly too many
>> hugepage file are created, especially used with 2M hugepage.
>>
>> For virtual devices, they don't care about physical-contiguity
>> of allocated hugepages at all. Option --single-file is to
>> provide a way to allocate all hugepages into single mem-backed
>> file.
>>
>> Known issue:
>> a. single-file option relys on kernel to allocate numa-affinitive
>> memory.
>> b. possible ABI break, originally, --no-huge uses anonymous memory
>> instead of file-backed way to create memory.
>>
>> Signed-off-by: Huawei Xie <huawei.xie@intel.com>
>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ...
>> @@ -956,6 +961,16 @@ eal_check_common_options(struct internal_config *internal_cfg)
>>   			"be specified together with --"OPT_NO_HUGE"\n");
>>   		return -1;
>>   	}
>> +	if (internal_cfg->single_file && internal_cfg->force_sockets == 1) {
>> +		RTE_LOG(ERR, EAL, "Option --"OPT_SINGLE_FILE" cannot "
>> +			"be specified together with --"OPT_SOCKET_MEM"\n");
>> +		return -1;
>> +	}
>> +	if (internal_cfg->single_file && internal_cfg->hugepage_unlink) {
>> +		RTE_LOG(ERR, EAL, "Option --"OPT_HUGE_UNLINK" cannot "
>> +			"be specified together with --"OPT_SINGLE_FILE"\n");
>> +		return -1;
>> +	}
>
> The two limitation doesn't make sense to me.
>
>> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
>> index 6008533..68ef49a 100644
>> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
>> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
>> @@ -1102,20 +1102,54 @@ rte_eal_hugepage_init(void)
>>   	/* get pointer to global configuration */
>>   	mcfg = rte_eal_get_configuration()->mem_config;
>>
>> -	/* hugetlbfs can be disabled */
>> -	if (internal_config.no_hugetlbfs) {
>> -		addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
>> -				MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
>> +	/* when hugetlbfs is disabled or single-file option is specified */
>> +	if (internal_config.no_hugetlbfs || internal_config.single_file) {
>> +		int fd;
>> +		uint64_t pagesize;
>> +		unsigned socket_id = rte_socket_id();
>> +		char filepath[MAX_HUGEPAGE_PATH];
>> +
>> +		if (internal_config.no_hugetlbfs) {
>> +			eal_get_hugefile_path(filepath, sizeof(filepath),
>> +					      "/dev/shm", 0);
>> +			pagesize = RTE_PGSIZE_4K;
>> +		} else {
>> +			struct hugepage_info *hpi;
>> +
>> +			hpi = &internal_config.hugepage_info[0];
>> +			eal_get_hugefile_path(filepath, sizeof(filepath),
>> +					      hpi->hugedir, 0);
>> +			pagesize = hpi->hugepage_sz;
>> +		}
>> +		fd = open(filepath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
>> +		if (fd < 0) {
>> +			RTE_LOG(ERR, EAL, "%s: open %s failed: %s\n",
>> +				__func__, filepath, strerror(errno));
>> +			return -1;
>> +		}
>> +
>> +		if (ftruncate(fd, internal_config.memory) < 0) {
>> +			RTE_LOG(ERR, EAL, "ftuncate %s failed: %s\n",
>> +				filepath, strerror(errno));
>> +			return -1;
>> +		}
>> +
>> +		addr = mmap(NULL, internal_config.memory,
>> +			    PROT_READ | PROT_WRITE,
>> +			    MAP_SHARED | MAP_POPULATE, fd, 0);
>>   		if (addr == MAP_FAILED) {
>> -			RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
>> -					strerror(errno));
>> +			RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n",
>> +				__func__, strerror(errno));
>>   			return -1;
>>   		}
>>   		mcfg->memseg[0].phys_addr = (phys_addr_t)(uintptr_t)addr;
>>   		mcfg->memseg[0].addr = addr;
>> -		mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
>> +		mcfg->memseg[0].hugepage_sz = pagesize;
>>   		mcfg->memseg[0].len = internal_config.memory;
>> -		mcfg->memseg[0].socket_id = 0;
>> +		mcfg->memseg[0].socket_id = socket_id;
>
> I saw quite few issues:
>
> - Assume I have a system with two hugepage sizes: 1G (x4) and 2M (x512),
>    mounted at /dev/hugepages and /mnt, respectively.
>
>    Here we then got an 5G internal_config.memory, and your code will
>    try to mmap 5G on the first mount point (/dev/hugepages) due to the
>    hardcode logic in your code:
>
>        hpi = &internal_config.hugepage_info[0];
>        eal_get_hugefile_path(filepath, sizeof(filepath),
>        		      hpi->hugedir, 0);
>
>    But it has 4G in total, therefore, it will fails.
>
> - As you stated, socket_id is hardcoded, which could be wrong.
>
> - As stated in above, the option limitation doesn't seem right to me.
>
>    I mean, --single-file should be able to work with --socket-mem option
>    in semantic.
>
>
> And I have been thinking how to deal with those issues properly, and a
> __very immature__ solution come to my mind (which could be simply not
> working), but anyway, here is FYI: we go through the same process to
> handle normal huge page initilization to --single-file option as well.
> But we take different actions or no actions at all at some stages when
> that option is given, which is a bit similiar with the way of handling
> RTE_EAL_SINGLE_FILE_SEGMENTS.
>
> And we create one hugepage file for each node, each page size. For a
> system like mine above (2 nodes), it may populate files like following:
>
> - 1G x 2 on node0
> - 1G x 2 on node1
> - 2M x 256 on node0
> - 2M x 256 on node1
>
> That could normally fit your case. Though 4 nodes looks like the maximum
> node number, --socket-mem option may relieve the limit a bit.
>
> And if we "could" not care the socket_id being set correctly, we could
> just simply allocate one file for each hugepage size. That would work
> well for your container enabling.
>
> BTW, since we already have SINGLE_FILE_SEGMENTS (config) option, adding
> another option --single-file looks really confusing to me.
>
> To me, maybe you could base the SINGLE_FILE_SEGMENTS option, and add
> another option, say --no-sort (I confess this name sucks, but you get
> my point). With that, we could make sure to create as least huge page
> files as possible, to fit your case.

Note that SINGLE_FILE_SEGMENTS is a nasty hack that only the IVSHMEM
config uses; getting rid of it (by replacing it with a runtime switch)
would be great. OTOH, IVSHMEM itself seems to have fallen out of
fashion, since the memnic driver has been unmaintained and broken since
dpdk 2.0... CC'ing the IVSHMEM maintainer in case he has thoughts on this.

	- Panu -
  
Yuanhan Liu March 8, 2016, 9:04 a.m. UTC | #5
On Tue, Mar 08, 2016 at 10:49:30AM +0200, Panu Matilainen wrote:
> On 03/07/2016 03:13 PM, Yuanhan Liu wrote:
> >To me, maybe you could base the SINGLE_FILE_SEGMENTS option, and add
> >another option, say --no-sort (I confess this name sucks, but you get
> >my point). With that, we could make sure to create as least huge page
> >files as possible, to fit your case.
> 
> Note that SINGLE_FILE_SEGMENTS is a nasty hack that only the IVSHMEM config
> uses, getting rid of it (by replacing with a runtime switch) would be great.

Can't agree more.

BTW, FYI, Jianfeng and I had a private talk, and we came to agree that
it might be better to handle it outside the normal huge page init stage,
just like this patch does, but with added support for multiple huge page
sizes (see the sketch below). Let's not add more messy code there.
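
A rough, hedged sketch of that direction (not a working patch; it reuses
names from this patch such as eal_get_hugefile_path(), MAX_HUGEPAGE_PATH and
internal_config.hugepage_info, assumes num_hugepage_sizes counts the
populated entries, and omits error/cleanup paths and physical addresses):

/* Illustration only: one shared file per detected hugepage size, mapped in
 * one go, each filling its own memseg. NUMA placement is deliberately left
 * unresolved here. */
static int
map_one_file_per_pagesize(struct rte_mem_config *mcfg, uint64_t mem_per_size)
{
	unsigned int i;
	char filepath[MAX_HUGEPAGE_PATH];

	for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
		void *addr;
		int fd;

		eal_get_hugefile_path(filepath, sizeof(filepath),
				      hpi->hugedir, i);
		fd = open(filepath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
		if (fd < 0 || ftruncate(fd, mem_per_size) < 0)
			return -1;

		addr = mmap(NULL, mem_per_size, PROT_READ | PROT_WRITE,
			    MAP_SHARED | MAP_POPULATE, fd, 0);
		if (addr == MAP_FAILED)
			return -1;

		mcfg->memseg[i].addr = addr;
		mcfg->memseg[i].len = mem_per_size;
		mcfg->memseg[i].hugepage_sz = hpi->hugepage_sz;
		/* socket_id / phys_addr intentionally not resolved here */
	}
	return 0;
}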

	--yliu

> OTOH IVSHMEM itself seems to have fallen out of the fashion since the memnic
> driver is unmaintained and broken since dpdk 2.0... CC'ing the IVSHMEM
> maintainer in case he has thoughts on this.
  
Thomas Monjalon March 8, 2016, 10:30 a.m. UTC | #6
2016-03-08 17:04, Yuanhan Liu:
> On Tue, Mar 08, 2016 at 10:49:30AM +0200, Panu Matilainen wrote:
> > On 03/07/2016 03:13 PM, Yuanhan Liu wrote:
> > >To me, maybe you could base the SINGLE_FILE_SEGMENTS option, and add
> > >another option, say --no-sort (I confess this name sucks, but you get
> > >my point). With that, we could make sure to create as least huge page
> > >files as possible, to fit your case.
> > 
> > Note that SINGLE_FILE_SEGMENTS is a nasty hack that only the IVSHMEM config
> > uses, getting rid of it (by replacing with a runtime switch) would be great.
> 
> Can't agree more.

+1

> BTW, FYI, Jianfeng and I had a private talk, and we came to agree that
> it might be better to handle it outside the normal huge page init stage,
> just like this patch does, but adding the support of multiple huge page
> sizes. Let's not add more messy code there.
> 
> 	--yliu
> 
> > OTOH IVSHMEM itself seems to have fallen out of the fashion since the memnic
> > driver is unmaintained and broken since dpdk 2.0... CC'ing the IVSHMEM
> > maintainer in case he has thoughts on this.

The ivshmem config was not used for memnic, which was using ivshmem only for
the data path.
CONFIG_RTE_LIBRTE_IVSHMEM and CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS are more
about full memory sharing.
I have the feeling it could be dropped.
If there are some users, I'd like to see a justification and a rework to
remove these build options.
  
Anatoly Burakov March 8, 2016, 10:57 a.m. UTC | #7
Hi Thomas,

> 2016-03-08 17:04, Yuanhan Liu:
> > On Tue, Mar 08, 2016 at 10:49:30AM +0200, Panu Matilainen wrote:
> > > On 03/07/2016 03:13 PM, Yuanhan Liu wrote:
> > > >To me, maybe you could base the SINGLE_FILE_SEGMENTS option, and
> > > >add another option, say --no-sort (I confess this name sucks, but
> > > >you get my point). With that, we could make sure to create as least
> > > >huge page files as possible, to fit your case.
> > >
> > > Note that SINGLE_FILE_SEGMENTS is a nasty hack that only the
> IVSHMEM
> > > config uses, getting rid of it (by replacing with a runtime switch) would be
> great.
> >
> > Can't agree more.
> 
> +1
> 
> > BTW, FYI, Jianfeng and I had a private talk, and we came to agree that
> > it might be better to handle it outside the normal huge page init
> > stage, just like this patch does, but adding the support of multiple
> > huge page sizes. Let's not add more messy code there.
> >
> > 	--yliu
> >
> > > OTOH IVSHMEM itself seems to have fallen out of the fashion since
> > > the memnic driver is unmaintained and broken since dpdk 2.0...
> > > CC'ing the IVSHMEM maintainer in case he has thoughts on this.
> 
> The ivshmem config was not used for memnic which was using ivshmem only
> for data path.
> CONFIG_RTE_LIBRTE_IVSHMEM and
> CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS are more about full memory
> sharing.
> I have the feeling it could be dropped.
> It there are some users, I'd like to see a justification and a rework to remove
> these build options.

Just to add my opinion to it - if there are no users for both of these, I'd
like those to be removed as well. Less maintenance is always better than
more maintenance, especially for things that no one uses :)

Thanks,
Anatoly
  
Jianfeng Tan March 9, 2016, 2:44 p.m. UTC | #8
Hi,

On 3/8/2016 10:44 AM, Yuanhan Liu wrote:
> On Tue, Mar 08, 2016 at 09:55:10AM +0800, Tan, Jianfeng wrote:
>> Hi Yuanhan,
>>
>> On 3/7/2016 9:13 PM, Yuanhan Liu wrote:
>>> CC'ed EAL hugepage maintainer, which is something you should do when
>>> send a patch.
>> Thanks for doing this.
>>
>>> On Fri, Feb 05, 2016 at 07:20:24PM +0800, Jianfeng Tan wrote:
>>>> Originally, there're two cons in using hugepage: a. needs root
>>>> privilege to touch /proc/self/pagemap, which is a premise to
>>>> alllocate physically contiguous memseg; b. possibly too many
>>>> hugepage file are created, especially used with 2M hugepage.
>>>>
>>>> For virtual devices, they don't care about physical-contiguity
>>>> of allocated hugepages at all. Option --single-file is to
>>>> provide a way to allocate all hugepages into single mem-backed
>>>> file.
>>>>
>>>> Known issue:
>>>> a. single-file option relys on kernel to allocate numa-affinitive
>>>> memory.
>>>> b. possible ABI break, originally, --no-huge uses anonymous memory
>>>> instead of file-backed way to create memory.
>>>>
>>>> Signed-off-by: Huawei Xie <huawei.xie@intel.com>
>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
>>> ...
>>>> @@ -956,6 +961,16 @@ eal_check_common_options(struct internal_config *internal_cfg)
>>>>   			"be specified together with --"OPT_NO_HUGE"\n");
>>>>   		return -1;
>>>>   	}
>>>> +	if (internal_cfg->single_file && internal_cfg->force_sockets == 1) {
>>>> +		RTE_LOG(ERR, EAL, "Option --"OPT_SINGLE_FILE" cannot "
>>>> +			"be specified together with --"OPT_SOCKET_MEM"\n");
>>>> +		return -1;
>>>> +	}
>>>> +	if (internal_cfg->single_file && internal_cfg->hugepage_unlink) {
>>>> +		RTE_LOG(ERR, EAL, "Option --"OPT_HUGE_UNLINK" cannot "
>>>> +			"be specified together with --"OPT_SINGLE_FILE"\n");
>>>> +		return -1;
>>>> +	}
>>> The two limitation doesn't make sense to me.
>> For the force_sockets option, my original thought on --single-file option
>> is, we don't sort those pages (require root/cap_sys_admin) and even don't
>> look up numa information because it may contain both sockets' memory.
>>
>> For the hugepage_unlink option, those hugepage files get closed in the end
>> of memory initialization, if we even unlink those hugepage files, so we
>> cannot share those with other processes (say backend).
> Yeah, I know how the two limitations come, from your implementation. I
> was just wondering if they both are __truly__ the limitations. I mean,
> can we get rid of them somehow?
>
> For --socket-mem option, if we can't handle it well, or if we could
> ignore the socket_id for allocated huge page, yes, the limitation is
> a true one.

To make it work with the --socket-mem option, we need to call
mbind()/set_mempolicy(), which means making "LDFLAGS += -lnuma" a
mandatory line in the mk file. I don't know if it's acceptable to bring in
a dependency on libnuma.so?


>
> But for the second option, no, we should be able to co-work it with
> well. One extra action is you should not invoke "close(fd)" for those
> huge page files. And then you can get all the informations as I stated
> in a reply to your 2nd patch.

As discussed yesterday, I think there's an open-files limit for each
process; if we keep those FDs open, it could cause failures in existing
programs (see the sketch below). Do others consider this a problem?
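
For reference, a hedged sketch (not part of this patch) of how the
per-process descriptor limit could be checked, and raised up to the hard
limit, before deciding to keep one fd per hugepage file open:

#include <sys/resource.h>

/* Illustration only: make sure at least 'needed' descriptors can stay open.
 * Raising beyond rlim_max would need root/CAP_SYS_RESOURCE. */
static int
ensure_open_file_limit(rlim_t needed)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_NOFILE, &rl) < 0)
		return -1;
	if (rl.rlim_cur >= needed)
		return 0;
	if (needed > rl.rlim_max)
		return -1;
	rl.rlim_cur = needed;
	return setrlimit(RLIMIT_NOFILE, &rl);
}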
...
>>> BTW, since we already have SINGLE_FILE_SEGMENTS (config) option, adding
>>> another option --single-file looks really confusing to me.
>>>
>>> To me, maybe you could base the SINGLE_FILE_SEGMENTS option, and add
>>> another option, say --no-sort (I confess this name sucks, but you get
>>> my point). With that, we could make sure to create as least huge page
>>> files as possible, to fit your case.
>> This is a great advice. So how do you think of --converged, or
>> --no-scattered-mem, or any better idea?
> TBH, none of them looks great to me, either. But I have no better
> options. Well, --no-phys-continuity looks like the best option to
> me so far :)

I'd like to make it a little more concise; how about --no-phys-contig?
In addition, Yuanhan thinks the name still does not literally mean creating
just one file for each hugetlbfs (or socket). But from my side, there's an
indirect meaning: if there's no need to promise physical contiguity, then
there's no need to create hugepages one by one. Can anyone give their
opinion here? Thanks.

Thanks,
Jianfeng
  
Traynor, Kevin March 14, 2016, 1:53 p.m. UTC | #9
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Thomas Monjalon
> Sent: Tuesday, March 8, 2016 10:31 AM
> To: dev@dpdk.org
> Cc: nakajima.yoshihiro@lab.ntt.co.jp; mst@redhat.com; p.fedin@samsung.com;
> ann.zhuangyanying@huawei.com
> Subject: Re: [dpdk-dev] [PATCH v2 1/5] mem: add --single-file to create
> single mem-backed file
> 
> 2016-03-08 17:04, Yuanhan Liu:
> > On Tue, Mar 08, 2016 at 10:49:30AM +0200, Panu Matilainen wrote:
> > > On 03/07/2016 03:13 PM, Yuanhan Liu wrote:
> > > >To me, maybe you could base the SINGLE_FILE_SEGMENTS option, and add
> > > >another option, say --no-sort (I confess this name sucks, but you get
> > > >my point). With that, we could make sure to create as least huge page
> > > >files as possible, to fit your case.
> > >
> > > Note that SINGLE_FILE_SEGMENTS is a nasty hack that only the IVSHMEM
> config
> > > uses, getting rid of it (by replacing with a runtime switch) would be
> great.
> >
> > Can't agree more.
> 
> +1
> 
> > BTW, FYI, Jianfeng and I had a private talk, and we came to agree that
> > it might be better to handle it outside the normal huge page init stage,
> > just like this patch does, but adding the support of multiple huge page
> > sizes. Let's not add more messy code there.
> >
> > 	--yliu
> >
> > > OTOH IVSHMEM itself seems to have fallen out of the fashion since the
> memnic
> > > driver is unmaintained and broken since dpdk 2.0... CC'ing the IVSHMEM
> > > maintainer in case he has thoughts on this.
> 
> The ivshmem config was not used for memnic which was using ivshmem only for
> data path.
> CONFIG_RTE_LIBRTE_IVSHMEM and CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS are more
> about full memory sharing.
> I have the feeling it could be dropped.
> It there are some users, I'd like to see a justification and a rework to
> remove these build options.

Just to clarify - is this suggesting the removal of the IVSHMEM library itself,
or just some of the config options?

The reason I ask is that although we don't currently use it in OVS with DPDK,
I've seen at least one person using it in conjunction with the ring interface.
There may be others, so I want to cross-post if there's a deprecation discussion. 

Kevin.
  
Thomas Monjalon March 14, 2016, 2:45 p.m. UTC | #10
2016-03-14 13:53, Traynor, Kevin:
> From: Thomas Monjalon
> > 2016-03-08 17:04, Yuanhan Liu:
> > > On Tue, Mar 08, 2016 at 10:49:30AM +0200, Panu Matilainen wrote:
> > > > On 03/07/2016 03:13 PM, Yuanhan Liu wrote:
> > > > Note that SINGLE_FILE_SEGMENTS is a nasty hack that only the IVSHMEM
> > config
> > > > uses, getting rid of it (by replacing with a runtime switch) would be
> > great.
> > >
> > > Can't agree more.
> > 
> > +1
> > 
> > > > OTOH IVSHMEM itself seems to have fallen out of the fashion since the
> > memnic
> > > > driver is unmaintained and broken since dpdk 2.0... CC'ing the IVSHMEM
> > > > maintainer in case he has thoughts on this.
> > 
> > The ivshmem config was not used for memnic which was using ivshmem only for
> > data path.
> > CONFIG_RTE_LIBRTE_IVSHMEM and CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS are more
> > about full memory sharing.
> > I have the feeling it could be dropped.
> > It there are some users, I'd like to see a justification and a rework to
> > remove these build options.
> 
> Just to clarify - is this suggesting the removal of the IVSHMEM library itself,
> or just some of the config options?

I have no strong opinion about the library.
About the config options, yes they should be removed. Note that they are not
documented, so we don't really know the motivation to have them.

> The reason I ask is that although we don't currently use it in OVS with DPDK,
> I've seen at least one person using it in conjunction with the ring interface.
> There may be others, so I want to cross-post if there's a deprecation discussion. 

Thank you for sharing.
  
Traynor, Kevin March 14, 2016, 6:21 p.m. UTC | #11
> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
> Sent: Monday, March 14, 2016 2:45 PM
> To: Traynor, Kevin <kevin.traynor@intel.com>
> Cc: dev@dpdk.org; nakajima.yoshihiro@lab.ntt.co.jp; mst@redhat.com;
> p.fedin@samsung.com; ann.zhuangyanying@huawei.com
> Subject: Re: [dpdk-dev] [PATCH v2 1/5] mem: add --single-file to create
> single mem-backed file
> 
> 2016-03-14 13:53, Traynor, Kevin:
> > From: Thomas Monjalon
> > > 2016-03-08 17:04, Yuanhan Liu:
> > > > On Tue, Mar 08, 2016 at 10:49:30AM +0200, Panu Matilainen wrote:
> > > > > On 03/07/2016 03:13 PM, Yuanhan Liu wrote:
> > > > > Note that SINGLE_FILE_SEGMENTS is a nasty hack that only the IVSHMEM
> > > config
> > > > > uses, getting rid of it (by replacing with a runtime switch) would be
> > > great.
> > > >
> > > > Can't agree more.
> > >
> > > +1
> > >
> > > > > OTOH IVSHMEM itself seems to have fallen out of the fashion since the
> > > memnic
> > > > > driver is unmaintained and broken since dpdk 2.0... CC'ing the
> IVSHMEM
> > > > > maintainer in case he has thoughts on this.
> > >
> > > The ivshmem config was not used for memnic which was using ivshmem only
> for
> > > data path.
> > > CONFIG_RTE_LIBRTE_IVSHMEM and CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS are
> more
> > > about full memory sharing.
> > > I have the feeling it could be dropped.
> > > It there are some users, I'd like to see a justification and a rework to
> > > remove these build options.
> >
> > Just to clarify - is this suggesting the removal of the IVSHMEM library
> itself,
> > or just some of the config options?
> 
> I have no strong opinion about the library.
> About the config options, yes they should be removed. Note that they are not
> documented, so we don't really know the motivation to have them.

OK, thanks for clarifying. As there are no imminent plans to remove the library,
I won't cross-post.

> 
> > The reason I ask is that although we don't currently use it in OVS with
> DPDK,
> > I've seen at least one person using it in conjunction with the ring
> interface.
> > There may be others, so I want to cross-post if there's a deprecation
> discussion.
> 
> Thank you for sharing.
  

Patch

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 29942ea..65bccbd 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -95,6 +95,7 @@  eal_long_options[] = {
 	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
 	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
 	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
+	{OPT_SINGLE_FILE,       0, NULL, OPT_SINGLE_FILE_NUM      },
 	{0,                     0, NULL, 0                        }
 };
 
@@ -897,6 +898,10 @@  eal_parse_common_option(int opt, const char *optarg,
 		}
 		break;
 
+	case OPT_SINGLE_FILE_NUM:
+		conf->single_file = 1;
+		break;
+
 	/* don't know what to do, leave this to caller */
 	default:
 		return 1;
@@ -956,6 +961,16 @@  eal_check_common_options(struct internal_config *internal_cfg)
 			"be specified together with --"OPT_NO_HUGE"\n");
 		return -1;
 	}
+	if (internal_cfg->single_file && internal_cfg->force_sockets == 1) {
+		RTE_LOG(ERR, EAL, "Option --"OPT_SINGLE_FILE" cannot "
+			"be specified together with --"OPT_SOCKET_MEM"\n");
+		return -1;
+	}
+	if (internal_cfg->single_file && internal_cfg->hugepage_unlink) {
+		RTE_LOG(ERR, EAL, "Option --"OPT_HUGE_UNLINK" cannot "
+			"be specified together with --"OPT_SINGLE_FILE"\n");
+		return -1;
+	}
 
 	if (internal_cfg->no_hugetlbfs && internal_cfg->hugepage_unlink) {
 		RTE_LOG(ERR, EAL, "Option --"OPT_HUGE_UNLINK" cannot "
@@ -994,6 +1009,8 @@  eal_common_usage(void)
 	       "  -n CHANNELS         Number of memory channels\n"
 	       "  -m MB               Memory to allocate (see also --"OPT_SOCKET_MEM")\n"
 	       "  -r RANKS            Force number of memory ranks (don't detect)\n"
+	       "  --"OPT_SINGLE_FILE" Create just single file for shared memory, and \n"
+	       "                      do not promise physical contiguity of memseg\n"
 	       "  -b, --"OPT_PCI_BLACKLIST" Add a PCI device in black list.\n"
 	       "                      Prevent EAL from using this PCI device. The argument\n"
 	       "                      format is <domain:bus:devid.func>.\n"
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 5f1367e..9117ed9 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -61,6 +61,7 @@  struct hugepage_info {
  */
 struct internal_config {
 	volatile size_t memory;           /**< amount of asked memory */
+	volatile unsigned single_file;    /**< mmap all hugepages in single file */
 	volatile unsigned force_nchannel; /**< force number of channels */
 	volatile unsigned force_nrank;    /**< force number of ranks */
 	volatile unsigned no_hugetlbfs;   /**< true to disable hugetlbfs */
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index a881c62..e5da14a 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -83,6 +83,8 @@  enum {
 	OPT_VMWARE_TSC_MAP_NUM,
 #define OPT_XEN_DOM0          "xen-dom0"
 	OPT_XEN_DOM0_NUM,
+#define OPT_SINGLE_FILE       "single-file"
+	OPT_SINGLE_FILE_NUM,
 	OPT_LONG_MAX_NUM
 };
 
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 635ec36..2bc84f7 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -790,6 +790,8 @@  rte_eal_init(int argc, char **argv)
 		rte_panic("Cannot init IVSHMEM\n");
 #endif
 
+	eal_thread_init_master(rte_config.master_lcore);
+
 	if (rte_eal_memory_init() < 0)
 		rte_panic("Cannot init memory\n");
 
@@ -823,8 +825,6 @@  rte_eal_init(int argc, char **argv)
 	if (eal_plugins_init() < 0)
 		rte_panic("Cannot init plugins\n");
 
-	eal_thread_init_master(rte_config.master_lcore);
-
 	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
 
 	RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n",
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 6008533..68ef49a 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -1102,20 +1102,54 @@  rte_eal_hugepage_init(void)
 	/* get pointer to global configuration */
 	mcfg = rte_eal_get_configuration()->mem_config;
 
-	/* hugetlbfs can be disabled */
-	if (internal_config.no_hugetlbfs) {
-		addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
-				MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+	/* when hugetlbfs is disabled or single-file option is specified */
+	if (internal_config.no_hugetlbfs || internal_config.single_file) {
+		int fd;
+		uint64_t pagesize;
+		unsigned socket_id = rte_socket_id();
+		char filepath[MAX_HUGEPAGE_PATH];
+
+		if (internal_config.no_hugetlbfs) {
+			eal_get_hugefile_path(filepath, sizeof(filepath),
+					      "/dev/shm", 0);
+			pagesize = RTE_PGSIZE_4K;
+		} else {
+			struct hugepage_info *hpi;
+
+			hpi = &internal_config.hugepage_info[0];
+			eal_get_hugefile_path(filepath, sizeof(filepath),
+					      hpi->hugedir, 0);
+			pagesize = hpi->hugepage_sz;
+		}
+		fd = open(filepath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
+		if (fd < 0) {
+			RTE_LOG(ERR, EAL, "%s: open %s failed: %s\n",
+				__func__, filepath, strerror(errno));
+			return -1;
+		}
+
+		if (ftruncate(fd, internal_config.memory) < 0) {
+			RTE_LOG(ERR, EAL, "ftuncate %s failed: %s\n",
+				filepath, strerror(errno));
+			return -1;
+		}
+
+		addr = mmap(NULL, internal_config.memory,
+			    PROT_READ | PROT_WRITE,
+			    MAP_SHARED | MAP_POPULATE, fd, 0);
 		if (addr == MAP_FAILED) {
-			RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
-					strerror(errno));
+			RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n",
+				__func__, strerror(errno));
 			return -1;
 		}
 		mcfg->memseg[0].phys_addr = (phys_addr_t)(uintptr_t)addr;
 		mcfg->memseg[0].addr = addr;
-		mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
+		mcfg->memseg[0].hugepage_sz = pagesize;
 		mcfg->memseg[0].len = internal_config.memory;
-		mcfg->memseg[0].socket_id = 0;
+		mcfg->memseg[0].socket_id = socket_id;
+
+		close(fd);
+
 		return 0;
 	}