[PATCH v11 1/4] lib: add generic support for reading PMU events

Konstantin Ananyev konstantin.v.ananyev at yandex.ru
Tue Feb 21 03:17:07 CET 2023


> Add support for programming PMU counters and reading their values
> in runtime bypassing kernel completely.
> 
> This is especially useful in cases where CPU cores are isolated
> i.e run dedicated tasks. In such cases one cannot use standard
> perf utility without sacrificing latency and performance.
> 
> Signed-off-by: Tomasz Duszynski <tduszynski at marvell.com>
> Acked-by: Morten Brørup <mb at smartsharesystems.com>

Few more comments/questions below.


> diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c
> new file mode 100644
> index 0000000000..950f999cb7
> --- /dev/null
> +++ b/lib/pmu/rte_pmu.c
> @@ -0,0 +1,460 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2023 Marvell International Ltd.
> + */
> +
> +#include <ctype.h>
> +#include <dirent.h>
> +#include <errno.h>
> +#include <regex.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +#include <sys/queue.h>
> +#include <sys/syscall.h>
> +#include <unistd.h>
> +
> +#include <rte_atomic.h>
> +#include <rte_per_lcore.h>
> +#include <rte_pmu.h>
> +#include <rte_spinlock.h>
> +#include <rte_tailq.h>
> +
> +#include "pmu_private.h"
> +
> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
> +
> +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> ((64 - 1 - (h)))))
> +#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) & (m))
> +
> +RTE_DEFINE_PER_LCORE(struct rte_pmu_event_group, _event_group);
> +struct rte_pmu rte_pmu;
> +
> +/*
> + * Following __rte_weak functions provide default no-op. Architectures should override them if
> + * necessary.
> + */
> +
> +int
> +__rte_weak pmu_arch_init(void)
> +{
> +	return 0;
> +}
> +
> +void
> +__rte_weak pmu_arch_fini(void)
> +{
> +}
> +
> +void
> +__rte_weak pmu_arch_fixup_config(uint64_t __rte_unused config[3])
> +{
> +}
> +
> +static int
> +get_term_format(const char *name, int *num, uint64_t *mask)
> +{
> +	char path[PATH_MAX];
> +	char *config = NULL;
> +	int high, low, ret;
> +	FILE *fp;
> +
> +	*num = *mask = 0;
> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu.name, name);
> +	fp = fopen(path, "r");
> +	if (fp == NULL)
> +		return -errno;
> +
> +	errno = 0;
> +	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
> +	if (ret < 2) {
> +		ret = -ENODATA;
> +		goto out;
> +	}
> +	if (errno) {
> +		ret = -errno;
> +		goto out;
> +	}
> +
> +	if (ret == 2)
> +		high = low;
> +
> +	*mask = GENMASK_ULL(high, low);
> +	/* Last digit should be [012]. If last digit is missing 0 is implied. */
> +	*num = config[strlen(config) - 1];
> +	*num = isdigit(*num) ? *num - '0' : 0;
> +
> +	ret = 0;
> +out:
> +	free(config);
> +	fclose(fp);
> +
> +	return ret;
> +}
> +
> +static int
> +parse_event(char *buf, uint64_t config[3])
> +{
> +	char *token, *term;
> +	int num, ret, val;
> +	uint64_t mask;
> +
> +	config[0] = config[1] = config[2] = 0;
> +
> +	token = strtok(buf, ",");
> +	while (token) {
> +		errno = 0;
> +		/* <term>=<value> */
> +		ret = sscanf(token, "%m[^=]=%i", &term, &val);
> +		if (ret < 1)
> +			return -ENODATA;
> +		if (errno)
> +			return -errno;
> +		if (ret == 1)
> +			val = 1;
> +
> +		ret = get_term_format(term, &num, &mask);
> +		free(term);
> +		if (ret)
> +			return ret;
> +
> +		config[num] |= FIELD_PREP(mask, val);
> +		token = strtok(NULL, ",");
> +	}
> +
> +	return 0;
> +}
> +
> +static int
> +get_event_config(const char *name, uint64_t config[3])
> +{
> +	char path[PATH_MAX], buf[BUFSIZ];
> +	FILE *fp;
> +	int ret;
> +
> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
> +	fp = fopen(path, "r");
> +	if (fp == NULL)
> +		return -errno;
> +
> +	ret = fread(buf, 1, sizeof(buf), fp);
> +	if (ret == 0) {
> +		fclose(fp);
> +
> +		return -EINVAL;
> +	}
> +	fclose(fp);
> +	buf[ret] = '\0';
> +
> +	return parse_event(buf, config);
> +}
> +
> +static int
> +do_perf_event_open(uint64_t config[3], int group_fd)
> +{
> +	struct perf_event_attr attr = {
> +		.size = sizeof(struct perf_event_attr),
> +		.type = PERF_TYPE_RAW,
> +		.exclude_kernel = 1,
> +		.exclude_hv = 1,
> +		.disabled = 1,
> +	};
> +
> +	pmu_arch_fixup_config(config);
> +
> +	attr.config = config[0];
> +	attr.config1 = config[1];
> +	attr.config2 = config[2];
> +
> +	return syscall(SYS_perf_event_open, &attr, 0, -1, group_fd, 0);
> +}
> +
> +static int
> +open_events(struct rte_pmu_event_group *group)
> +{
> +	struct rte_pmu_event *event;
> +	uint64_t config[3];
> +	int num = 0, ret;
> +
> +	/* group leader gets created first, with fd = -1 */
> +	group->fds[0] = -1;
> +
> +	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
> +		ret = get_event_config(event->name, config);
> +		if (ret)
> +			continue;
> +
> +		ret = do_perf_event_open(config, group->fds[0]);
> +		if (ret == -1) {
> +			ret = -errno;
> +			goto out;
> +		}
> +
> +		group->fds[event->index] = ret;
> +		num++;
> +	}
> +
> +	return 0;
> +out:
> +	for (--num; num >= 0; num--) {
> +		close(group->fds[num]);
> +		group->fds[num] = -1;
> +	}
> +
> +
> +	return ret;
> +}
> +
> +static int
> +mmap_events(struct rte_pmu_event_group *group)
> +{
> +	long page_size = sysconf(_SC_PAGE_SIZE);
> +	unsigned int i;
> +	void *addr;
> +	int ret;
> +
> +	for (i = 0; i < rte_pmu.num_group_events; i++) {
> +		addr = mmap(0, page_size, PROT_READ, MAP_SHARED, group->fds[i], 0);
> +		if (addr == MAP_FAILED) {
> +			ret = -errno;
> +			goto out;
> +		}
> +
> +		group->mmap_pages[i] = addr;
> +		if (!group->mmap_pages[i]->cap_user_rdpmc) {
> +			ret = -EPERM;
> +			goto out;
> +		}
> +	}
> +
> +	return 0;
> +out:
> +	for (; i; i--) {
> +		munmap(group->mmap_pages[i - 1], page_size);
> +		group->mmap_pages[i - 1] = NULL;
> +	}
> +
> +	return ret;
> +}
> +
> +static void
> +cleanup_events(struct rte_pmu_event_group *group)
> +{
> +	unsigned int i;
> +
> +	if (group->fds[0] != -1)
> +		ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
> +
> +	for (i = 0; i < rte_pmu.num_group_events; i++) {
> +		if (group->mmap_pages[i]) {
> +			munmap(group->mmap_pages[i], sysconf(_SC_PAGE_SIZE));
> +			group->mmap_pages[i] = NULL;
> +		}
> +
> +		if (group->fds[i] != -1) {
> +			close(group->fds[i]);
> +			group->fds[i] = -1;
> +		}
> +	}
> +
> +	group->enabled = false;
> +}
> +
> +int
> +__rte_pmu_enable_group(void)
> +{
> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
> +	int ret;
> +
> +	if (rte_pmu.num_group_events == 0)
> +		return -ENODEV;
> +
> +	ret = open_events(group);
> +	if (ret)
> +		goto out;
> +
> +	ret = mmap_events(group);
> +	if (ret)
> +		goto out;
> +
> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
> +		ret = -errno;
> +		goto out;
> +	}
> +
> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
> +		ret = -errno;
> +		goto out;
> +	}
> +
> +	rte_spinlock_lock(&rte_pmu.lock);
> +	TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);

Hmm.. so we insert pointer to TLS variable into the global list?
Wonder what would happen if that thread get terminated?
Can memory from its TLS block get re-used (by other thread or for other 
purposes)?


> +	rte_spinlock_unlock(&rte_pmu.lock);
> +	group->enabled = true;
> +
> +	return 0;
> +
> +out:
> +	cleanup_events(group);
> +
> +	return ret;
> +}
> +
> +static int
> +scan_pmus(void)
> +{
> +	char path[PATH_MAX];
> +	struct dirent *dent;
> +	const char *name;
> +	DIR *dirp;
> +
> +	dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
> +	if (dirp == NULL)
> +		return -errno;
> +
> +	while ((dent = readdir(dirp))) {
> +		name = dent->d_name;
> +		if (name[0] == '.')
> +			continue;
> +
> +		/* sysfs entry should either contain cpus or be a cpu */
> +		if (!strcmp(name, "cpu"))
> +			break;
> +
> +		snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
> +		if (access(path, F_OK) == 0)
> +			break;
> +	}
> +
> +	if (dent) {
> +		rte_pmu.name = strdup(name);
> +		if (rte_pmu.name == NULL) {
> +			closedir(dirp);
> +
> +			return -ENOMEM;
> +		}
> +	}
> +
> +	closedir(dirp);
> +
> +	return rte_pmu.name ? 0 : -ENODEV;
> +}
> +
> +static struct rte_pmu_event *
> +new_event(const char *name)
> +{
> +	struct rte_pmu_event *event;
> +
> +	event = calloc(1, sizeof(*event));
> +	if (event == NULL)
> +		goto out;
> +
> +	event->name = strdup(name);
> +	if (event->name == NULL) {
> +		free(event);
> +		event = NULL;
> +	}
> +
> +out:
> +	return event;
> +}
> +
> +static void
> +free_event(struct rte_pmu_event *event)
> +{
> +	free(event->name);
> +	free(event);
> +}
> +
> +int
> +rte_pmu_add_event(const char *name)
> +{
> +	struct rte_pmu_event *event;
> +	char path[PATH_MAX];
> +
> +	if (rte_pmu.name == NULL)
> +		return -ENODEV;
> +
> +	if (rte_pmu.num_group_events + 1 >= MAX_NUM_GROUP_EVENTS)
> +		return -ENOSPC;
> +
> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
> +	if (access(path, R_OK))
> +		return -ENODEV;
> +
> +	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
> +		if (!strcmp(event->name, name))
> +			return event->index;
> +		continue;
> +	}
> +
> +	event = new_event(name);
> +	if (event == NULL)
> +		return -ENOMEM;
> +
> +	event->index = rte_pmu.num_group_events++;
> +	TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next);
> +
> +	return event->index;
> +}
> +
> +int
> +rte_pmu_init(void)
> +{
> +	int ret;
> +
> +	/* Allow calling init from multiple contexts within a single thread. This simplifies
> +	 * resource management a bit e.g in case fast-path tracepoint has already been enabled
> +	 * via command line but application doesn't care enough and performs init/fini again.
> +	 */
> +	if (rte_pmu.initialized != 0) {
> +		rte_pmu.initialized++;
> +		return 0;
> +	}
> +
> +	ret = scan_pmus();
> +	if (ret)
> +		goto out;
> +
> +	ret = pmu_arch_init();
> +	if (ret)
> +		goto out;
> +
> +	TAILQ_INIT(&rte_pmu.event_list);
> +	TAILQ_INIT(&rte_pmu.event_group_list);
> +	rte_spinlock_init(&rte_pmu.lock);
> +	rte_pmu.initialized = 1;
> +
> +	return 0;
> +out:
> +	free(rte_pmu.name);
> +	rte_pmu.name = NULL;
> +
> +	return ret;
> +}
> +
> +void
> +rte_pmu_fini(void)
> +{
> +	struct rte_pmu_event_group *group, *tmp_group;
> +	struct rte_pmu_event *event, *tmp_event;
> +
> +	/* cleanup once init count drops to zero */
> +	if (rte_pmu.initialized == 0 || --rte_pmu.initialized != 0)
> +		return;
> +
> +	RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp_event) {
> +		TAILQ_REMOVE(&rte_pmu.event_list, event, next);
> +		free_event(event);
> +	}
> +
> +	RTE_TAILQ_FOREACH_SAFE(group, &rte_pmu.event_group_list, next, tmp_group) {
> +		TAILQ_REMOVE(&rte_pmu.event_group_list, group, next);
> +		cleanup_events(group);
> +	}
> +
> +	pmu_arch_fini();
> +	free(rte_pmu.name);
> +	rte_pmu.name = NULL;
> +	rte_pmu.num_group_events = 0;
> +}
> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h
> new file mode 100644
> index 0000000000..6b664c3336
> --- /dev/null
> +++ b/lib/pmu/rte_pmu.h
> @@ -0,0 +1,212 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2023 Marvell
> + */
> +
> +#ifndef _RTE_PMU_H_
> +#define _RTE_PMU_H_
> +
> +/**
> + * @file
> + *
> + * PMU event tracing operations
> + *
> + * This file defines generic API and types necessary to setup PMU and
> + * read selected counters in runtime.
> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <linux/perf_event.h>
> +
> +#include <rte_atomic.h>
> +#include <rte_branch_prediction.h>
> +#include <rte_common.h>
> +#include <rte_compat.h>
> +#include <rte_spinlock.h>
> +
> +/** Maximum number of events in a group */
> +#define MAX_NUM_GROUP_EVENTS 8

forgot RTE_ prefix.
In fact, do you really need number of events in group to be hard-coded?
Couldn't mmap_pages[] and fds[] be allocated dynamically by enable_group()?

> +
> +/**
> + * A structure describing a group of events.
> + */
> +struct rte_pmu_event_group {
> +	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
> +	bool enabled; /**< true if group was enabled on particular lcore */
> +	TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */
> +} __rte_cache_aligned;
> +

Even if we'd decide to keep rte_pmu_read() as static inline (still not 
sure it is a good idea),
why these two struct below (rte_pmu_event and rte_pmu) have to be public?
I think both can be safely moved away from public headers.


> +/**
> + * A structure describing an event.
> + */
> +struct rte_pmu_event {
> +	char *name; /**< name of an event */
> +	unsigned int index; /**< event index into fds/mmap_pages */
> +	TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */
> +};

> +
> +/**
> + * A PMU state container.
> + */
> +struct rte_pmu {
> +	char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
> +	rte_spinlock_t lock; /**< serialize access to event group list */
> +	TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
> +	unsigned int num_group_events; /**< number of events in a group */
> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
> +	unsigned int initialized; /**< initialization counter */
> +};
> +
> +/** lcore event group */
> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group, _event_group);
> +
> +/** PMU state container */
> +extern struct rte_pmu rte_pmu;
> +
> +/** Each architecture supporting PMU needs to provide its own version */
> +#ifndef rte_pmu_pmc_read
> +#define rte_pmu_pmc_read(index) ({ 0; })
> +#endif
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Read PMU counter.
> + *
> + * @warning This should be not called directly.
> + *
> + * @param pc
> + *   Pointer to the mmapped user page.
> + * @return
> + *   Counter value read from hardware.
> + */
> +static __rte_always_inline uint64_t
> +__rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
> +{
> +	uint64_t width, offset;
> +	uint32_t seq, index;
> +	int64_t pmc;
> +
> +	for (;;) {
> +		seq = pc->lock;
> +		rte_compiler_barrier();
> +		index = pc->index;
> +		offset = pc->offset;
> +		width = pc->pmc_width;
> +
> +		/* index set to 0 means that particular counter cannot be used */
> +		if (likely(pc->cap_user_rdpmc && index)) {

In mmap_events() you return EPERM if cap_user_rdpmc is not enabled.
Do you need another check here? Or this capability can be disabled by 
kernel at run-time?


> +			pmc = rte_pmu_pmc_read(index - 1);
> +			pmc <<= 64 - width;
> +			pmc >>= 64 - width;
> +			offset += pmc;
> +		}
> +
> +		rte_compiler_barrier();
> +
> +		if (likely(pc->lock == seq))
> +			return offset;
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Enable group of events on the calling lcore.
> + *
> + * @warning This should be not called directly.

__rte_internal ?

> + *
> + * @return
> + *   0 in case of success, negative value otherwise.
> + */
> +__rte_experimental
> +int
> +__rte_pmu_enable_group(void);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Initialize PMU library.
> + *
> + * @warning This should be not called directly.

Hmm.. then who should call it?
If it not supposed to be called directly, why to declare it here?

> + *
> + * @return
> + *   0 in case of success, negative value otherwise.
> + */

Probably worth to mention that this function is not MT safe.
Same for _fini_ and add_event.
Also worth to mention that all control-path functions 
(init/fini/add_event) and data-path (pmu_read) can't be called concurrently.

> +__rte_experimental
> +int
> +rte_pmu_init(void);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Finalize PMU library. This should be called after PMU counters are no longer being read.
> + */
> +__rte_experimental
> +void
> +rte_pmu_fini(void);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Add event to the group of enabled events.
> + *
> + * @param name
> + *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
> + * @return
> + *   Event index in case of success, negative value otherwise.
> + */
> +__rte_experimental
> +int
> +rte_pmu_add_event(const char *name);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Read hardware counter configured to count occurrences of an event.
> + *
> + * @param index
> + *   Index of an event to be read.
> + * @return
> + *   Event value read from register. In case of errors or lack of support
> + *   0 is returned. In other words, stream of zeros in a trace file
> + *   indicates problem with reading particular PMU event register.
> + */
> +__rte_experimental
> +static __rte_always_inline uint64_t
> +rte_pmu_read(unsigned int index)
> +{
> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
> +	int ret;
> +
> +	if (unlikely(!rte_pmu.initialized))
> +		return 0;
> +
> +	if (unlikely(!group->enabled)) {
> +		ret = __rte_pmu_enable_group();
> +		if (ret)
> +			return 0;
> +	}
> +
> +	if (unlikely(index >= rte_pmu.num_group_events))
> +		return 0;
> +
> +	return __rte_pmu_read_userpage(group->mmap_pages[index]);
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_PMU_H_ */
> diff --git a/lib/pmu/version.map b/lib/pmu/version.map
> new file mode 100644
> index 0000000000..39a4f279c1
> --- /dev/null
> +++ b/lib/pmu/version.map
> @@ -0,0 +1,15 @@
> +DPDK_23 {
> +	local: *;
> +};
> +
> +EXPERIMENTAL {
> +	global:
> +
> +	__rte_pmu_enable_group;
> +	per_lcore__event_group;
> +	rte_pmu;
> +	rte_pmu_add_event;
> +	rte_pmu_fini;
> +	rte_pmu_init;
> +	rte_pmu_read;
> +};



More information about the dev mailing list