[dpdk-dev] [PATCH v2 1/4] eal: add channel for multi-process communication

Jianfeng Tan jianfeng.tan at intel.com
Thu Jan 11 05:07:31 CET 2018


Previouly, there are three channels for multi-process
(i.e., primary/secondary) communication.
  1. Config-file based channel, in which, the primary process writes
     info into a pre-defined config file, and the secondary process
     reads the info out.
  2. vfio submodule has its own channel based on unix socket for the
     secondary process to get container fd and group fd from the
     primary process.
  3. pdump submodule also has its own channel based on unix socket for
     packet dump.

It'd be good to have a generic communication channel for multi-process
communication to accomodate the requirements including:
  a. Secondary wants to send info to primary, for example, secondary
     would like to send request (about some specific vdev to primary).
  b. Sending info at any time, instead of just initialization time.
  c. Share FDs with the other side, for vdev like vhost, related FDs
     (memory region, kick) should be shared.
  d. A send message request needs the other side to response immediately.

This patch proposes to create a communication channel, based on datagram
unix socket, for above requirements. Each process will block on a unix
socket waiting for messages from the peers.

Three new APIs are added:

  1. rte_eal_mp_action_register() is used to register an action,
     indexed by a string, when a component at receiver side would like
     to response the messages from the peer processe.
  2. rte_eal_mp_action_unregister() is used to unregister the action
     if the calling component does not want to response the messages.
  3. rte_eal_mp_sendmsg() is used to send a message, and returns
     immediately. If there are 1:n primary:secondary processes, the
     primary process will send n messages.

Suggested-by: Konstantin Ananyev <konstantin.ananyev at intel.com>
Signed-off-by: Jianfeng Tan <jianfeng.tan at intel.com>
---
 lib/librte_eal/common/eal_common_proc.c | 388 ++++++++++++++++++++++++++++++++
 lib/librte_eal/common/eal_filesystem.h  |  17 ++
 lib/librte_eal/common/eal_private.h     |  10 +
 lib/librte_eal/common/include/rte_eal.h |  69 ++++++
 lib/librte_eal/linuxapp/eal/eal.c       |   8 +
 lib/librte_eal/rte_eal_version.map      |   9 +
 6 files changed, 501 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 40fa982..d700e9e 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -5,11 +5,55 @@
 #include <stdio.h>
 #include <fcntl.h>
 #include <stdlib.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <limits.h>
+#include <unistd.h>
+#include <sys/un.h>
+#include <errno.h>
+#include <pthread.h>
+
+#include <rte_log.h>
 #include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_lcore.h>
+#include <rte_common.h>
 
+#include "eal_private.h"
 #include "eal_filesystem.h"
 #include "eal_internal_cfg.h"
 
+#define MAX_SECONDARY_PROCS	8
+#define MAX_ACTION_NAME_LEN	64
+#define MAX_UNIX_PATH_LEN	104
+#define MAX_MSG_LENGTH		1024
+#define SCM_MAX_FD		253 /* The max amount of fds */
+
+static int mp_fd = -1;
+static char *mp_sec_sockets[MAX_SECONDARY_PROCS];
+static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER;
+
+struct action_entry {
+	TAILQ_ENTRY(action_entry) next;      /**< Next attached action entry */
+
+#define MAX_ACTION_NAME_LEN	64
+	char action_name[MAX_ACTION_NAME_LEN];
+	rte_eal_mp_t action;
+};
+
+/** Double linked list of actions. */
+TAILQ_HEAD(action_entry_list, action_entry);
+
+static struct action_entry_list action_entry_list =
+	TAILQ_HEAD_INITIALIZER(action_entry_list);
+
+struct mp_msghdr {
+	char action_name[MAX_ACTION_NAME_LEN];
+	int fds_num;
+	int len_params;
+	char params[0];
+} __rte_packed;
+
 int
 rte_eal_primary_proc_alive(const char *config_file_path)
 {
@@ -31,3 +75,347 @@ rte_eal_primary_proc_alive(const char *config_file_path)
 
 	return !!ret;
 }
+
+static struct action_entry *
+find_action_entry_by_name(const char *name)
+{
+	int len = strlen(name);
+	struct action_entry *entry;
+
+	TAILQ_FOREACH(entry, &action_entry_list, next) {
+		if (strncmp(entry->action_name, name, len) == 0)
+			break;
+	}
+
+	return entry;
+}
+
+int
+rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action)
+{
+	struct action_entry *entry = malloc(sizeof(struct action_entry));
+
+	if (entry == NULL) {
+		rte_errno = -ENOMEM;
+		return -1;
+	}
+
+	if (strlen(action_name) > MAX_ACTION_NAME_LEN) {
+		rte_errno = -E2BIG;
+		return -1;
+	}
+
+	pthread_mutex_lock(&mp_mutex_action);
+	if (find_action_entry_by_name(action_name) != NULL) {
+		free(entry);
+		rte_errno = -EEXIST;
+		return -1;
+	}
+	strncpy(entry->action_name, action_name, MAX_ACTION_NAME_LEN);
+	entry->action = action;
+	TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
+	pthread_mutex_unlock(&mp_mutex_action);
+	return 0;
+}
+
+void
+rte_eal_mp_action_unregister(const char *name)
+{
+	struct action_entry *entry;
+
+	pthread_mutex_lock(&mp_mutex_action);
+	entry = find_action_entry_by_name(name);
+	TAILQ_REMOVE(&action_entry_list, entry, next);
+	free(entry);
+	pthread_mutex_unlock(&mp_mutex_action);
+}
+
+static int
+read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
+{
+	int ret;
+	struct iovec iov;
+	struct msghdr msgh;
+	size_t fdsize = fds_num * sizeof(int);
+	char control[CMSG_SPACE(fdsize)];
+	struct cmsghdr *cmsg;
+
+	memset(&msgh, 0, sizeof(msgh));
+	iov.iov_base = buf;
+	iov.iov_len  = buflen;
+
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	ret = recvmsg(fd, &msgh, 0);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "recvmsg failed, %s\n", strerror(errno));
+		return -1;
+	}
+
+	if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
+		RTE_LOG(ERR, EAL, "truncted msg\n");
+		return -1;
+	}
+
+	/* read auxiliary FDs if any */
+	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+		if ((cmsg->cmsg_level == SOL_SOCKET) &&
+			(cmsg->cmsg_type == SCM_RIGHTS)) {
+			memcpy(fds, CMSG_DATA(cmsg), fdsize);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int
+process_msg(struct mp_msghdr *hdr, int len, int fds[])
+{
+	int ret;
+	int params_len;
+	struct action_entry *entry;
+
+	RTE_LOG(DEBUG, EAL, "msg: %s\n", hdr->action_name);
+
+	pthread_mutex_lock(&mp_mutex_action);
+	entry = find_action_entry_by_name(hdr->action_name);
+	if (entry == NULL) {
+		RTE_LOG(ERR, EAL, "cannot find action by: %s\n",
+			hdr->action_name);
+		pthread_mutex_unlock(&mp_mutex_action);
+		return -1;
+	}
+
+	params_len = len - sizeof(struct mp_msghdr);
+	ret = entry->action(hdr->params, params_len, fds, hdr->fds_num);
+	pthread_mutex_unlock(&mp_mutex_action);
+	return ret;
+
+}
+
+static void *
+mp_handle(void *arg __rte_unused)
+{
+	int len;
+	int fds[SCM_MAX_FD];
+	char buf[MAX_MSG_LENGTH];
+
+	while (1) {
+		len = read_msg(mp_fd, buf, MAX_MSG_LENGTH, fds, SCM_MAX_FD);
+		if (len > 0)
+			process_msg((struct mp_msghdr *)buf, len, fds);
+	}
+
+	return NULL;
+}
+
+static inline const char *
+get_unix_path(int is_server)
+{
+	static char unix_path[MAX_UNIX_PATH_LEN];
+	const char *prefix = eal_mp_unix_path();
+	const char *suffix = (is_server) ? "" : "_c";
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		snprintf(unix_path, MAX_UNIX_PATH_LEN, "%s%s", prefix, suffix);
+	else
+		snprintf(unix_path, MAX_UNIX_PATH_LEN, "%s%s_%d",
+			 prefix, suffix, getpid());
+	return unix_path;
+}
+
+static int
+open_unix_fd(int is_server)
+{
+	int fd;
+	struct sockaddr_un un;
+
+	fd = socket(AF_UNIX, SOCK_DGRAM, 0);
+	if (fd < 0) {
+		RTE_LOG(ERR, EAL, "failed to create unix socket\n");
+		return -1;
+	}
+
+	memset(&un, 0, sizeof(un));
+	un.sun_family = AF_UNIX;
+	snprintf(un.sun_path, MAX_UNIX_PATH_LEN, "%s",
+		 get_unix_path(is_server));
+	unlink(un.sun_path); /* May still exist since last run */
+	if (bind(fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
+		RTE_LOG(ERR, EAL, "failed to bind %s: %s\n",
+			un.sun_path, strerror(errno));
+		close(fd);
+		return -1;
+	}
+
+	RTE_LOG(INFO, EAL, "bind to %s\n", un.sun_path);
+	return fd;
+}
+
+int
+rte_eal_mp_channel_init(void)
+{
+	pthread_t tid;
+	char thread_name[RTE_MAX_THREAD_NAME_LEN];
+
+	mp_fd = open_unix_fd(1);
+	if (mp_fd < 0)
+		return -1;
+
+	if (pthread_create(&tid, NULL, mp_handle, NULL) < 0) {
+		RTE_LOG(ERR, EAL, "failed to create mp handle thead: %s\n",
+			strerror(errno));
+		goto error;
+	}
+
+	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "rte_mp_handle");
+	if (rte_thread_setname(tid, thread_name) < 0) {
+		RTE_LOG(ERR, EAL, "failed to set thead name\n");
+		goto error;
+	}
+
+	return 0;
+error:
+	close(mp_fd);
+	mp_fd = -1;
+	return -1;
+}
+
+static inline struct mp_msghdr *
+format_msg(const char *act_name, const void *p, int len_params, int fds_num)
+{
+	int len_msg;
+	struct mp_msghdr *msg;
+
+	len_msg = sizeof(struct mp_msghdr) + len_params;
+	if (len_msg > MAX_MSG_LENGTH) {
+		RTE_LOG(ERR, EAL, "Message is too long\n");
+		rte_errno = -EINVAL;
+		return NULL;
+	}
+
+	msg = malloc(len_msg);
+	if (!msg) {
+		RTE_LOG(ERR, EAL, "Cannot alloc memory for msg\n");
+		rte_errno = -ENOMEM;
+		return NULL;
+	}
+	memset(msg, 0, len_msg);
+	strcpy(msg->action_name, act_name);
+	msg->fds_num = fds_num;
+	msg->len_params = len_params;
+	memcpy(msg->params, p, len_params);
+	return msg;
+}
+
+static int
+send_msg(int fd, const char *dst_path, struct mp_msghdr *msg, int fds[])
+{
+	int ret;
+	struct msghdr msgh;
+	struct iovec iov;
+	size_t fd_size = msg->fds_num * sizeof(int);
+	char control[CMSG_SPACE(fd_size)];
+	struct cmsghdr *cmsg;
+	struct sockaddr_un dst;
+
+	memset(&dst, 0, sizeof(dst));
+	dst.sun_family = AF_UNIX;
+	snprintf(dst.sun_path, MAX_UNIX_PATH_LEN, "%s", dst_path);
+
+	memset(&msgh, 0, sizeof(msgh));
+	memset(control, 0, sizeof(control));
+
+	iov.iov_base = (uint8_t *)msg;
+	iov.iov_len = sizeof(struct mp_msghdr) + msg->len_params;
+
+	msgh.msg_name = &dst;
+	msgh.msg_namelen = sizeof(dst);
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	cmsg = CMSG_FIRSTHDR(&msgh);
+	cmsg->cmsg_len = CMSG_LEN(fd_size);
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	memcpy(CMSG_DATA(cmsg), fds, fd_size);
+
+	do {
+		ret = sendmsg(fd, &msgh, 0);
+	} while (ret < 0 && errno == EINTR);
+
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "failed to send msg: %s\n", strerror(errno));
+
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+			RTE_LOG(ERR, EAL, "secondary process (%s) exited\n",
+				dst_path);
+		else if (!rte_eal_primary_proc_alive(NULL))
+			RTE_LOG(ERR, EAL, "primary process exited\n");
+
+		return 0;
+	}
+
+	return 1;
+}
+
+static int
+mp_send(const char *action_name,
+	const void *params,
+	int len_params,
+	int fds[],
+	int fds_num)
+{
+	int i;
+	int n = 0;
+	int sockfd;
+	struct mp_msghdr *msg;
+
+	if (fds_num > SCM_MAX_FD) {
+		RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", SCM_MAX_FD);
+		rte_errno = -E2BIG;
+		return 0;
+	}
+
+	msg = format_msg(action_name, params, len_params, fds_num);
+	if (msg == NULL)
+		return 0;
+
+	if ((sockfd = open_unix_fd(0)) < 0) {
+		free(msg);
+		return 0;
+	}
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+		/* broadcast to all secondaries */
+		for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
+			if (mp_sec_sockets[i] == NULL)
+				continue;
+
+			n += send_msg(sockfd, mp_sec_sockets[i], msg, fds);
+		}
+	} else
+		n += send_msg(sockfd, eal_mp_unix_path(), msg, fds);
+
+	free(msg);
+	close(sockfd);
+	return n;
+}
+
+int
+rte_eal_mp_sendmsg(const char *action_name,
+		   const void *params,
+		   int len_params,
+		   int fds[],
+		   int fds_num)
+{
+	RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", action_name);
+	return mp_send(action_name, params, len_params, fds, fds_num);
+}
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index e8959eb..e95399b 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -38,6 +38,23 @@ eal_runtime_config_path(void)
 	return buffer;
 }
 
+/** Path of primary/secondary communication unix socket file. */
+#define MP_UNIX_PATH_FMT "%s/.%s_unix"
+static inline const char *
+eal_mp_unix_path(void)
+{
+	static char buffer[PATH_MAX]; /* static so auto-zeroed */
+	const char *directory = default_config_dir;
+	const char *home_dir = getenv("HOME");
+
+	if (getuid() != 0 && home_dir != NULL)
+		directory = home_dir;
+	snprintf(buffer, sizeof(buffer) - 1, MP_UNIX_PATH_FMT,
+		 directory, internal_config.hugefile_prefix);
+
+	return buffer;
+}
+
 /** Path of hugepage info file. */
 #define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info"
 
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index c46dd8f..e36e3b5 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -195,4 +195,14 @@ int rte_eal_hugepage_attach(void);
  */
 struct rte_bus *rte_bus_find_by_device_name(const char *str);
 
+/**
+ * Create the unix channel for primary/secondary communication.
+ *
+ * @return
+ *   0 on success;
+ *   (<0) on failure.
+ */
+
+int rte_eal_mp_channel_init(void);
+
 #endif /* _EAL_PRIVATE_H_ */
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 02fa109..9884c0b 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -186,6 +186,75 @@ int rte_eal_init(int argc, char **argv);
 int rte_eal_primary_proc_alive(const char *config_file_path);
 
 /**
+ * Action function typedef used by other components.
+ *
+ * As we create unix socket channel for primary/secondary communication, use
+ * this function typedef to register action for coming messages.
+ */
+typedef int (*rte_eal_mp_t)(const void *params, int len,
+			    int fds[], int fds_num);
+
+/**
+ * Register an action function for primary/secondary communication.
+ *
+ * Call this function to register an action, if the calling component wants
+ * to response the messages from the corresponding component in its primary
+ * process or secondary processes.
+ *
+ * @param action_name
+ *   The action_name argument plays as the nonredundant key to find the action.
+ *
+ * @param action
+ *   The action argument is the function pointer to the action function.
+ *
+ * @return
+ *  - 0 on success.
+ *  - (<0) on failure.
+ */
+int rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action);
+
+/**
+ * Unregister an action function for primary/secondary communication.
+ *
+ * Call this function to unregister an action  if the calling component does
+ * not want to response the messages from the corresponding component in its
+ * primary process or secondary processes.
+ *
+ * @param action_name
+ *   The action_name argument plays as the nonredundant key to find the action.
+ *
+ */
+void rte_eal_mp_action_unregister(const char *name);
+
+/**
+ * Send a message to the peer process.
+ *
+ * This function will send a message which will be responsed by the action
+ * identified by action_name of the process on the other side.
+ *
+ * @param action_name
+ *   The action_name argument is used to identify which action will be used.
+ *
+ * @param params
+ *   The params argument contains the customized message.
+ *
+ * @param len_params
+ *   The len_params argument is the length of the customized message.
+ *
+ * @param fds
+ *   The fds argument is an array of fds sent with sendmsg.
+ *
+ * @param fds_num
+ *   The fds_num argument is number of fds to be sent with sendmsg.
+ *
+ * @return
+ *  - Returns the number of messages being sent successfully.
+ */
+int
+rte_eal_mp_sendmsg(const char *action_name, const void *params,
+		   int len_params, int fds[], int fds_num);
+
+/**
  * Usage function typedef used by the application usage function.
  *
  * Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 229eec9..f231724 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -896,6 +896,14 @@ rte_eal_init(int argc, char **argv)
 
 	eal_check_mem_on_local_socket();
 
+	if (rte_eal_mp_channel_init() < 0) {
+		rte_eal_init_alert("failed to init mp channel\n");
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+			rte_errno = EFAULT;
+			return -1;
+		}
+	}
+
 	eal_thread_init_master(rte_config.master_lcore);
 
 	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index f4f46c1..5dacde5 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -235,4 +235,13 @@ EXPERIMENTAL {
 	rte_service_set_stats_enable;
 	rte_service_start_with_defaults;
 
+} DPDK_17.08;
+
+DPDK_18.02 {
+	global:
+
+	rte_eal_mp_action_register;
+	rte_eal_mp_action_unregister;
+	rte_eal_mp_sendmsg;
+
 } DPDK_17.11;
-- 
2.7.4



More information about the dev mailing list