[dpdk-dev] [PATCH v3 1/1] vfio: modify spapr iommu support to use static window sizing
Burakov, Anatoly
anatoly.burakov at intel.com
Thu Sep 17 13:13:11 CEST 2020
On 10-Aug-20 10:07 PM, David Christensen wrote:
> The SPAPR IOMMU requires that a DMA window size be defined before memory
> can be mapped for DMA. Current code dynamically modifies the DMA window
> size in response to every new memory allocation which is potentially
> dangerous because all existing mappings need to be unmapped/remapped in
> order to resize the DMA window, leaving hardware holding IOVA addresses
> that are temporarily unmapped. The new SPAPR code statically assigns
> the DMA window size on first use, using the largest physical memory
> address when IOVA=PA and the highest existing memseg virtual
> address when IOVA=VA.
>
> Signed-off-by: David Christensen <drc at linux.vnet.ibm.com>
> ---
<snip>
> +struct spapr_size_walk_param {
> + uint64_t max_va;
> + uint64_t page_sz;
> + int external;
> +};
> +
> +/*
> + * In order to set the DMA window size required for the SPAPR IOMMU
> + * we need to walk the existing virtual memory allocations as well as
> + * find the hugepage size used.
> + */
> static int
> -vfio_spapr_unmap_walk(const struct rte_memseg_list *msl,
> - const struct rte_memseg *ms, void *arg)
> +vfio_spapr_size_walk(const struct rte_memseg_list *msl, void *arg)
> {
> - int *vfio_container_fd = arg;
> + struct spapr_size_walk_param *param = arg;
> + uint64_t max = (uint64_t) msl->base_va + (uint64_t) msl->len;
>
> - /* skip external memory that isn't a heap */
> - if (msl->external && !msl->heap)
> - return 0;
> + if (msl->external) {
> + param->external++;
> + if (!msl->heap)
> + return 0;
> + }
It would be nice to have some comments in the code explaining what we're
skipping and why.
Also, it seems that you're using param->external as a bool? This is a
non-public API, so using stdbool is not an issue here; perhaps replace it
with bool param->has_external?
>
> - /* skip any segments with invalid IOVA addresses */
> - if (ms->iova == RTE_BAD_IOVA)
> - return 0;
> + if (max > param->max_va) {
> + param->page_sz = msl->page_sz;
> + param->max_va = max;
> + }
>
> - return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova,
> - ms->len, 0);
> + return 0;
> }
>
> -struct spapr_walk_param {
> - uint64_t window_size;
> - uint64_t hugepage_sz;
> -};
> -
> +/*
> + * The SPAPRv2 IOMMU supports 2 DMA windows with starting
> + * address at 0 or 1<<59. By default, a DMA window is set
> + * at address 0, 2GB long, with a 4KB page. For DPDK we
> + * must remove the default window and setup a new DMA window
> + * based on the hugepage size and memory requirements of
> + * the application before we can map memory for DMA.
> + */
> static int
> -vfio_spapr_window_size_walk(const struct rte_memseg_list *msl,
> - const struct rte_memseg *ms, void *arg)
> +spapr_dma_win_size(void)
> {
> - struct spapr_walk_param *param = arg;
> - uint64_t max = ms->iova + ms->len;
> + struct spapr_size_walk_param param;
>
> - /* skip external memory that isn't a heap */
> - if (msl->external && !msl->heap)
> + /* only create DMA window once */
> + if (spapr_dma_win_len > 0)
> return 0;
>
> - /* skip any segments with invalid IOVA addresses */
> - if (ms->iova == RTE_BAD_IOVA)
> - return 0;
> + /* walk the memseg list to find the page size/max VA address */
> + memset(&param, 0, sizeof(param));
> + if (rte_memseg_list_walk(vfio_spapr_size_walk, &param) < 0) {
> + RTE_LOG(ERR, EAL, "Failed to walk memseg list for DMA "
> + "window size\n");
> + return -1;
> + }
> +
> + /* We can't be sure if DMA window covers external memory */
> + if (param.external > 0)
> + RTE_LOG(WARNING, EAL, "Detected external memory which may "
> + "not be managed by the IOMMU\n");
> +
> + /* find the maximum IOVA address for setting the DMA window size */
> + if (rte_eal_iova_mode() == RTE_IOVA_PA) {
> + static const char proc_iomem[] = "/proc/iomem";
> + static const char str_sysram[] = "System RAM";
> + uint64_t start, end, max = 0;
> + char *line = NULL;
> + char *dash, *space;
> + size_t line_len;
> +
> + /*
> + * Example "System RAM" in /proc/iomem:
> + * 00000000-1fffffffff : System RAM
> + * 200000000000-201fffffffff : System RAM
> + */
> + FILE *fd = fopen(proc_iomem, "r");
> + if (fd == NULL) {
> + RTE_LOG(ERR, EAL, "Cannot open %s\n", proc_iomem);
> + return -1;
> + }
> + /* Scan /proc/iomem for the highest PA in the system */
> + while (getline(&line, &line_len, fd) != -1) {
> + if (strstr(line, str_sysram) == NULL)
> + continue;
> +
> + space = strstr(line, " ");
> + dash = strstr(line, "-");
> +
> + /* Validate the format of the memory string */
> + if (space == NULL || dash == NULL || space < dash) {
> + RTE_LOG(ERR, EAL, "Can't parse line \"%s\" in "
> + "file %s\n", line, proc_iomem);
> + continue;
> + }
> +
> + start = strtoull(line, NULL, 16);
> + end = strtoull(dash + 1, NULL, 16);
> + RTE_LOG(DEBUG, EAL, "Found system RAM from 0x%"
> + PRIx64 " to 0x%" PRIx64 "\n", start, end);
> + if (end > max)
> + max = end;
> + }
> + free(line);
> + fclose(fd);
I would've put all of this file reading business into a separate
function, as otherwise it's a bit hard to follow the mix of file ops and
using the results. Something like
value = get_value_from_iomem();
if (value > ...)
...
is much easier on the eyes :)
>
> - if (max > param->window_size) {
> - param->hugepage_sz = ms->hugepage_sz;
> - param->window_size = max;
> + if (max == 0) {
> + RTE_LOG(ERR, EAL, "Failed to find valid \"System RAM\" "
> + "entry in file %s\n", proc_iomem);
> + return -1;
> + }
> +
> + spapr_dma_win_len = rte_align64pow2(max + 1);
> + RTE_LOG(DEBUG, EAL, "Setting DMA window size to 0x%"
> + PRIx64 "\n", spapr_dma_win_len);
> + } else if (rte_eal_iova_mode() == RTE_IOVA_VA) {
> + RTE_LOG(DEBUG, EAL, "Highest VA address in memseg list is 0x%"
> + PRIx64 "\n", param.max_va);
> + spapr_dma_win_len = rte_align64pow2(param.max_va);
> + RTE_LOG(DEBUG, EAL, "Setting DMA window size to 0x%"
> + PRIx64 "\n", spapr_dma_win_len);
> + } else {
> + RTE_LOG(ERR, EAL, "Unsupported IOVA mode\n");
> + return -1;
> }
>
> + spapr_dma_win_page_sz = param.page_sz;
> + rte_mem_set_dma_mask(__builtin_ctzll(spapr_dma_win_len));
> return 0;
> }
>
> static int
> -vfio_spapr_create_new_dma_window(int vfio_container_fd,
> - struct vfio_iommu_spapr_tce_create *create) {
> +vfio_spapr_create_dma_window(int vfio_container_fd)
> +{
> + struct vfio_iommu_spapr_tce_create create = {
> + .argsz = sizeof(create), };
> struct vfio_iommu_spapr_tce_remove remove = {
> - .argsz = sizeof(remove),
> - };
> + .argsz = sizeof(remove), };
> struct vfio_iommu_spapr_tce_info info = {
> - .argsz = sizeof(info),
> - };
> + .argsz = sizeof(info), };
> int ret;
>
> - /* query spapr iommu info */
> + ret = spapr_dma_win_size();
> + if (ret < 0)
> + return ret;
> +
> ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
> if (ret) {
> - RTE_LOG(ERR, EAL, " cannot get iommu info, "
> - "error %i (%s)\n", errno, strerror(errno));
Here and in other similar places, there is no need to split strings across
multiple lines.
Overall, since these changes are confined to PPC64, I can't really test
them, but with the above changes:
Reviewed-by: Anatoly Burakov <anatoly.burakov at intel.com>
--
Thanks,
Anatoly
More information about the dev
mailing list