[dpdk-dev,2/5] gso/lib: add TCP/IPv4 GSO support

Message ID 1503584144-63181-3-git-send-email-jiayu.hu@intel.com (mailing list archive)
State Superseded, archived

Checks

Context                Check     Description
ci/Intel-compilation   fail      Compilation issues
ci/checkpatch          success   coding style OK

Commit Message

Hu, Jiayu Aug. 24, 2017, 2:15 p.m. UTC
  This patch adds GSO support for TCP/IPv4 packets. Supported packets
may include a single VLAN tag. TCP/IPv4 GSO assumes that all input
packets have correct checksums, and doesn't update checksums for output
packets (the responsibility for this lies with the application).
Additionally, TCP/IPv4 GSO doesn't process IP fragmented packets.

TCP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one indirect
MBUF, to organize an output packet. Note that we refer to these two
chained MBUFs as a two-segment MBUF. The direct MBUF stores the packet
header, while the indirect mbuf simply points to a location within the
original packet's payload. Consequently, use of the GSO library requires
multi-segment MBUF support in the TX functions of the NIC driver.

If a packet is GSOed, TCP/IPv4 GSO reduces its MBUF refcnt by 1. As a
result, when all of its GSOed segments are freed, the packet is freed
automatically.

Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
---
 lib/librte_eal/common/include/rte_log.h |   1 +
 lib/librte_gso/Makefile                 |   2 +
 lib/librte_gso/gso_common.c             | 270 ++++++++++++++++++++++++++++++++
 lib/librte_gso/gso_common.h             | 120 ++++++++++++++
 lib/librte_gso/gso_tcp.c                |  82 ++++++++++
 lib/librte_gso/gso_tcp.h                |  73 +++++++++
 lib/librte_gso/rte_gso.c                |  44 +++++-
 lib/librte_gso/rte_gso.h                |   3 +
 8 files changed, 593 insertions(+), 2 deletions(-)
 create mode 100644 lib/librte_gso/gso_common.c
 create mode 100644 lib/librte_gso/gso_common.h
 create mode 100644 lib/librte_gso/gso_tcp.c
 create mode 100644 lib/librte_gso/gso_tcp.h
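
For illustration, a minimal usage sketch (not part of the patch) of how an application might drive the API described above; the gso_ctx contents, port_id and queue_id are assumed to be set up elsewhere:

#include <rte_common.h>
#include <rte_ethdev.h>
#include <rte_gso.h>

/* Illustrative only: assumes gso_ctx was filled with direct/indirect
 * mempools, gso_types = RTE_GSO_TCP_IPV4 and a non-zero gso_size. */
static void
tx_with_gso(struct rte_mbuf *pkt, struct rte_gso_ctx gso_ctx,
		uint8_t port_id, uint16_t queue_id)
{
	struct rte_mbuf *segs[64];
	int nb;

	nb = rte_gso_segment(pkt, gso_ctx, segs, RTE_DIM(segs));
	if (nb > 1) {
		/* 'pkt' had its refcnt decremented by the library, so freeing
		 * the two-segment outputs (e.g. after TX) also releases it.
		 * The tx_burst return value is ignored here for brevity. */
		rte_eth_tx_burst(port_id, queue_id, segs, (uint16_t)nb);
	} else if (nb == 1) {
		/* no segmentation performed; send the original packet */
		rte_eth_tx_burst(port_id, queue_id, &pkt, 1);
	}
	/* nb < 0 indicates -EINVAL or -ENOMEM */
}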
  

Comments

Ananyev, Konstantin Aug. 30, 2017, 1:38 a.m. UTC | #1
> -----Original Message-----
> From: Hu, Jiayu
> Sent: Thursday, August 24, 2017 3:16 PM
> To: dev@dpdk.org
> Cc: Kavanagh, Mark B <mark.b.kavanagh@intel.com>; Ananyev, Konstantin <konstantin.ananyev@intel.com>; Tan, Jianfeng
> <jianfeng.tan@intel.com>; Hu, Jiayu <jiayu.hu@intel.com>
> Subject: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
> 
> This patch adds GSO support for TCP/IPv4 packets. Supported packets
> may include a single VLAN tag. TCP/IPv4 GSO assumes that all input
> packets have correct checksums, and doesn't update checksums for output
> packets (the responsibility for this lies with the application).
> Additionally, TCP/IPv4 GSO doesn't process IP fragmented packets.
> 
> TCP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one indirect
> MBUF, to organize an output packet. Note that we refer to these two
> chained MBUFs as a two-segment MBUF. The direct MBUF stores the packet
> header, while the indirect mbuf simply points to a location within the
> original packet's payload. Consequently, use of the GSO library requires
> multi-segment MBUF support in the TX functions of the NIC driver.
> 
> If a packet is GSOed, TCP/IPv4 GSO reduces its MBUF refcnt by 1. As a
> result, when all of its GSOed segments are freed, the packet is freed
> automatically.
> 
> Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
> ---
>  lib/librte_eal/common/include/rte_log.h |   1 +
>  lib/librte_gso/Makefile                 |   2 +
>  lib/librte_gso/gso_common.c             | 270 ++++++++++++++++++++++++++++++++
>  lib/librte_gso/gso_common.h             | 120 ++++++++++++++
>  lib/librte_gso/gso_tcp.c                |  82 ++++++++++
>  lib/librte_gso/gso_tcp.h                |  73 +++++++++
>  lib/librte_gso/rte_gso.c                |  44 +++++-
>  lib/librte_gso/rte_gso.h                |   3 +
>  8 files changed, 593 insertions(+), 2 deletions(-)
>  create mode 100644 lib/librte_gso/gso_common.c
>  create mode 100644 lib/librte_gso/gso_common.h
>  create mode 100644 lib/librte_gso/gso_tcp.c
>  create mode 100644 lib/librte_gso/gso_tcp.h
> 
> diff --git a/lib/librte_eal/common/include/rte_log.h b/lib/librte_eal/common/include/rte_log.h
> index ec8dba7..2fa1199 100644
> --- a/lib/librte_eal/common/include/rte_log.h
> +++ b/lib/librte_eal/common/include/rte_log.h
> @@ -87,6 +87,7 @@ extern struct rte_logs rte_logs;
>  #define RTE_LOGTYPE_CRYPTODEV 17 /**< Log related to cryptodev. */
>  #define RTE_LOGTYPE_EFD       18 /**< Log related to EFD. */
>  #define RTE_LOGTYPE_EVENTDEV  19 /**< Log related to eventdev. */
> +#define RTE_LOGTYPE_GSO       20 /**< Log related to GSO. */
> 
>  /* these log types can be used in an application */
>  #define RTE_LOGTYPE_USER1     24 /**< User-defined log type 1. */
> diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile
> index aeaacbc..0f8e38f 100644
> --- a/lib/librte_gso/Makefile
> +++ b/lib/librte_gso/Makefile
> @@ -42,6 +42,8 @@ LIBABIVER := 1
> 
>  #source files
>  SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c
> +SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_common.c
> +SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tcp.c
> 
>  # install this header file
>  SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h
> diff --git a/lib/librte_gso/gso_common.c b/lib/librte_gso/gso_common.c
> new file mode 100644
> index 0000000..2b54fbd
> --- /dev/null
> +++ b/lib/librte_gso/gso_common.c
> @@ -0,0 +1,270 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2017 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include <stdbool.h>
> +#include <string.h>
> +
> +#include <rte_malloc.h>
> +
> +#include <rte_ether.h>
> +#include <rte_ip.h>
> +#include <rte_tcp.h>
> +
> +#include "gso_common.h"
> +
> +static inline void
> +hdr_segment_init(struct rte_mbuf *hdr_segment, struct rte_mbuf *pkt,
> +		uint16_t pkt_hdr_offset)
> +{
> +	/* copy mbuf metadata */
> +	hdr_segment->nb_segs = 1;
> +	hdr_segment->port = pkt->port;
> +	hdr_segment->ol_flags = pkt->ol_flags;
> +	hdr_segment->packet_type = pkt->packet_type;
> +	hdr_segment->pkt_len = pkt_hdr_offset;
> +	hdr_segment->data_len = pkt_hdr_offset;
> +	hdr_segment->tx_offload = pkt->tx_offload;
> +	/* copy packet header */
> +	rte_memcpy(rte_pktmbuf_mtod(hdr_segment, char *),
> +			rte_pktmbuf_mtod(pkt, char *),
> +			pkt_hdr_offset);
> +}
> +
> +static inline void
> +free_gso_segment(struct rte_mbuf **pkts, uint16_t nb_pkts)
> +{
> +	uint16_t i;
> +
> +	for (i = 0; i < nb_pkts; i++) {
> +		rte_pktmbuf_detach(pkts[i]->next);

I don't think you need to call detach() here explicitly.
Just rte_pktmbuf_free(pkts[i]) should do I think.
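
For reference, a sketch of what the simplified helper could look like once the explicit detach is dropped:

static inline void
free_gso_segment(struct rte_mbuf **pkts, uint16_t nb_pkts)
{
	uint16_t i;

	/* rte_pktmbuf_free() frees every segment in the chain and detaches
	 * indirect mbufs internally, so no explicit detach is needed. */
	for (i = 0; i < nb_pkts; i++) {
		rte_pktmbuf_free(pkts[i]);
		pkts[i] = NULL;
	}
}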

> +		rte_pktmbuf_free(pkts[i]);
> +		pkts[i] = NULL;
> +	}
> +}
> +
> +int
> +gso_do_segment(struct rte_mbuf *pkt,
> +		uint16_t pkt_hdr_offset,
> +		uint16_t pyld_unit_size,
> +		struct rte_mempool *direct_pool,
> +		struct rte_mempool *indirect_pool,
> +		struct rte_mbuf **pkts_out,
> +		uint16_t nb_pkts_out)
> +{
> +	struct rte_mbuf *pkt_in;
> +	struct rte_mbuf *hdr_segment, *pyld_segment;
> +	uint32_t pkt_in_pyld_off;
> +	uint16_t pkt_in_segment_len, pkt_out_segment_len;
> +	uint16_t nb_segs;
> +	bool pkt_in_segment_processed;
> +
> +	pkt_in_pyld_off = pkt->data_off + pkt_hdr_offset;
> +	pkt_in = pkt;
> +	nb_segs = 0;
> +
> +	while (pkt_in) {
> +		pkt_in_segment_processed = false;
> +		pkt_in_segment_len = pkt_in->data_off + pkt_in->data_len;
> +
> +		while (!pkt_in_segment_processed) {
> +			if (unlikely(nb_segs >= nb_pkts_out)) {
> +				free_gso_segment(pkts_out, nb_segs);
> +				return -EINVAL;
> +			}
> +
> +			/* allocate direct mbuf */
> +			hdr_segment = rte_pktmbuf_alloc(direct_pool);
> +			if (unlikely(hdr_segment == NULL)) {
> +				free_gso_segment(pkts_out, nb_segs);
> +				return -ENOMEM;
> +			}
> +
> +			/* allocate indirect mbuf */
> +			pyld_segment = rte_pktmbuf_alloc(indirect_pool);
> +			if (unlikely(pyld_segment == NULL)) {
> +				rte_pktmbuf_free(hdr_segment);
> +				free_gso_segment(pkts_out, nb_segs);
> +				return -ENOMEM;
> +			}

So, if I understand correctly each new packet would always contain just one data segment?
Why several data segments couldn't be chained together (if sum of their data_len <= mss)?
In a same way as done here:
http://dpdk.org/browse/dpdk/tree/lib/librte_ip_frag/rte_ipv4_fragmentation.c#n93
or here:
https://gerrit.fd.io/r/gitweb?p=tldk.git;a=blob;f=lib/libtle_l4p/tcp_tx_seg.h;h=a8d2425597a7ad6f598aa4bb7fcd7f1da74305f0;hb=HEAD#l23
?
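
To illustrate the chaining idea being suggested, a rough sketch follows; the helper name and signature are hypothetical, not taken from the patch or the linked code. For the first input segment, *in_off would start just after the packet header:

#include <errno.h>
#include <rte_common.h>
#include <rte_mbuf.h>

/* Append indirect mbufs referencing consecutive input segments to one
 * output packet until 'pyld_unit_size' bytes of payload are collected.
 * 'pkt_in' and 'in_off' track the current input segment and the offset
 * of the first unconsumed payload byte within it. */
static int
append_payload(struct rte_mbuf *hdr_segment, struct rte_mbuf **pkt_in,
		uint16_t *in_off, uint16_t pyld_unit_size,
		struct rte_mempool *indirect_pool)
{
	struct rte_mbuf *prev = hdr_segment;
	uint16_t left = pyld_unit_size;

	while (left > 0 && *pkt_in != NULL) {
		struct rte_mbuf *pyld = rte_pktmbuf_alloc(indirect_pool);
		uint16_t len;

		if (pyld == NULL)
			return -ENOMEM;

		/* reference the input segment's data, no copy */
		rte_pktmbuf_attach(pyld, *pkt_in);
		pyld->data_off = (*pkt_in)->data_off + *in_off;
		len = RTE_MIN(left,
			(uint16_t)((*pkt_in)->data_len - *in_off));
		pyld->data_len = len;

		/* chain it behind the header (or previous payload) mbuf */
		prev->next = pyld;
		prev = pyld;
		hdr_segment->pkt_len += len;
		hdr_segment->nb_segs++;

		left -= len;
		*in_off += len;
		if (*in_off == (*pkt_in)->data_len) {
			/* input segment fully consumed, move to the next */
			*pkt_in = (*pkt_in)->next;
			*in_off = 0;
		}
	}
	return 0;
}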

> +
> +			/* copy packet header */
> +			hdr_segment_init(hdr_segment, pkt, pkt_hdr_offset);
> +
> +			/* attach payload mbuf to current packet segment */
> +			rte_pktmbuf_attach(pyld_segment, pkt_in);
> +
> +			hdr_segment->next = pyld_segment;
> +			pkts_out[nb_segs++] = hdr_segment;
> +
> +			/* calculate payload length */
> +			pkt_out_segment_len = pyld_unit_size;
> +			if (pkt_in_pyld_off + pkt_out_segment_len >
> +					pkt_in_segment_len) {
> +				pkt_out_segment_len = pkt_in_segment_len -
> +					pkt_in_pyld_off;
> +			}
> +
> +			/* update payload segment */
> +			pyld_segment->data_off = pkt_in_pyld_off;
> +			pyld_segment->data_len = pkt_out_segment_len;
> +
> +			/* update header segment */
> +			hdr_segment->pkt_len += pyld_segment->data_len;
> +			hdr_segment->nb_segs++;
> +
> +			/* update pkt_in_pyld_off */
> +			pkt_in_pyld_off += pkt_out_segment_len;
> +			if (pkt_in_pyld_off == pkt_in_segment_len)
> +				pkt_in_segment_processed = true;
> +		}
> +
> +		/* 'pkt_in' may contain numerous segments */
> +		pkt_in = pkt_in->next;
> +		if (pkt_in != NULL)
> +			pkt_in_pyld_off = pkt_in->data_off;
> +	}
> +	return nb_segs;
> +}
> +
> +static inline void
> +parse_ipv4(struct ipv4_hdr *ipv4_hdr, struct rte_mbuf *pkt)
> +{
> +	struct tcp_hdr *tcp_hdr;
> +
> +	switch (ipv4_hdr->next_proto_id) {
> +	case IPPROTO_TCP:
> +		pkt->packet_type |= RTE_PTYPE_L4_TCP;
> +		pkt->l3_len = IPv4_HDR_LEN(ipv4_hdr);
> +		tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
> +		pkt->l4_len = TCP_HDR_LEN(tcp_hdr);
> +		break;
> +	}
> +}
> +
> +static inline void
> +parse_ethernet(struct ether_hdr *eth_hdr, struct rte_mbuf *pkt)
> +{
> +	struct ipv4_hdr *ipv4_hdr;
> +	struct vlan_hdr *vlan_hdr;
> +	uint16_t ethertype;
> +
> +	ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
> +	if (ethertype == ETHER_TYPE_VLAN) {
> +		vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
> +		pkt->l2_len = sizeof(struct vlan_hdr);
> +		pkt->packet_type |= RTE_PTYPE_L2_ETHER_VLAN;
> +		ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
> +	}
> +
> +	switch (ethertype) {
> +	case ETHER_TYPE_IPv4:
> +		if (IS_VLAN_PKT(pkt)) {
> +			pkt->packet_type |= RTE_PTYPE_L3_IPV4;
> +		} else {
> +			pkt->packet_type |= RTE_PTYPE_L2_ETHER;
> +			pkt->packet_type |= RTE_PTYPE_L3_IPV4;
> +		}
> +		pkt->l2_len += sizeof(struct ether_hdr);
> +		ipv4_hdr = (struct ipv4_hdr *) ((char *)eth_hdr +
> +				pkt->l2_len);
> +		parse_ipv4(ipv4_hdr, pkt);
> +		break;
> +	}
> +}
> +
> +void
> +gso_parse_packet(struct rte_mbuf *pkt)

There is a function rte_net_get_ptype() that is supposed to provide similar functionality.
So we probably don't need to create a new SW parse function here; it would be better
to reuse (and update if needed) the existing one.
Again, the user might already have the l2/l3/l4/..._len and packet_type fields set up.
So it is better to keep SW packet parsing out of the scope of this library.
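
For example, a caller could fill those fields with the existing helper before handing the packet to the GSO library (an illustrative sketch, not patch code):

#include <rte_net.h>

static void
fill_pkt_meta(struct rte_mbuf *m)
{
	struct rte_net_hdr_lens hdr_lens;

	/* classify the packet and record the header lengths */
	m->packet_type = rte_net_get_ptype(m, &hdr_lens,
			RTE_PTYPE_L2_MASK | RTE_PTYPE_L3_MASK |
			RTE_PTYPE_L4_MASK);
	m->l2_len = hdr_lens.l2_len;
	m->l3_len = hdr_lens.l3_len;
	m->l4_len = hdr_lens.l4_len;
}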

> +{
> +	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
> +
> +	pkt->packet_type = pkt->tx_offload = 0;
> +	parse_ethernet(eth_hdr, pkt);
> +}
> +
> +static inline void
> +update_ipv4_header(char *base, uint16_t offset, uint16_t length, uint16_t id)
> +{
> +	struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)(base + offset);
> +
> +	ipv4_hdr->total_length = rte_cpu_to_be_16(length - offset);
> +	ipv4_hdr->packet_id = rte_cpu_to_be_16(id);
> +}
> +
> +static inline void
> +update_tcp_header(char *base, uint16_t offset, uint32_t sent_seq,
> +	uint8_t non_tail)
> +{
> +	struct tcp_hdr *tcp_hdr = (struct tcp_hdr *)(base + offset);
> +
> +	tcp_hdr->sent_seq = rte_cpu_to_be_32(sent_seq);
> +	/* clean FIN and PSH for non-tail segments */
> +	if (non_tail)
> +		tcp_hdr->tcp_flags &= (~(TCP_HDR_PSH_MASK | TCP_HDR_FIN_MASK));
> +}
> +
> +void
> +gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
> +		struct rte_mbuf **out_segments)
> +{
> +	struct ipv4_hdr *ipv4_hdr;
> +	struct tcp_hdr *tcp_hdr;
> +	struct rte_mbuf *seg;
> +	uint32_t sent_seq;
> +	uint16_t offset, i;
> +	uint16_t tail_seg_idx = nb_segments - 1, id;
> +
> +	switch (pkt->packet_type) {
> +	case ETHER_VLAN_IPv4_TCP_PKT:
> +	case ETHER_IPv4_TCP_PKT:

Might be worth putting the code below in a separate function:
update_inner_tcp_hdr(..) or so.
Then you can reuse it for tunneled cases too.

> +		ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
> +				pkt->l2_len);
> +		tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
> +		id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
> +		sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
> +
> +		for (i = 0; i < nb_segments; i++) {
> +			seg = out_segments[i];
> +
> +			offset = seg->l2_len;
> +			update_ipv4_header(rte_pktmbuf_mtod(seg, char *),
> +					offset, seg->pkt_len, id);
> +			id++;

Who would be responsible for making sure that we don't end up with consecutive packets carrying the same IPv4 id?
Would it be the upper layer that forms the packet, the gso library, or ...?

> +
> +			offset += seg->l3_len;
> +			update_tcp_header(rte_pktmbuf_mtod(seg, char *),
> +					offset, sent_seq, i < tail_seg_idx);
> +			sent_seq += seg->next->data_len;
> +		}
> +		break;
> +	}
> +}
> diff --git a/lib/librte_gso/gso_common.h b/lib/librte_gso/gso_common.h
> new file mode 100644
> index 0000000..d750041
> --- /dev/null
> +++ b/lib/librte_gso/gso_common.h
> @@ -0,0 +1,120 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2017 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _GSO_COMMON_H_
> +#define _GSO_COMMON_H_
> +
> +#include <stdint.h>
> +#include <rte_mbuf.h>
> +
> +#define IPV4_HDR_DF_SHIFT 14
> +#define IPV4_HDR_DF_MASK (1 << IPV4_HDR_DF_SHIFT)
> +#define IPv4_HDR_LEN(iph) ((iph->version_ihl & 0x0f) * 4)
> +
> +#define TCP_HDR_PSH_MASK ((uint8_t)0x08)
> +#define TCP_HDR_FIN_MASK ((uint8_t)0x01)
> +#define TCP_HDR_LEN(tcph) ((tcph->data_off & 0xf0) >> 2)
> +
> +#define ETHER_IPv4_PKT (RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4)
> +/* Supported packet types */
> +/* TCP/IPv4 packet. */
> +#define ETHER_IPv4_TCP_PKT (ETHER_IPv4_PKT | RTE_PTYPE_L4_TCP)
> +
> +/* TCP/IPv4 packet with VLAN tag. */
> +#define ETHER_VLAN_IPv4_TCP_PKT (RTE_PTYPE_L2_ETHER_VLAN | \
> +		RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP)
> +
> +#define IS_VLAN_PKT(pkt) ((pkt->packet_type & RTE_PTYPE_L2_ETHER_VLAN) == \
> +		RTE_PTYPE_L2_ETHER_VLAN)
> +
> +/**
> + * Internal function which parses a packet, setting outer_l2/l3_len and
> + * l2/l3/l4_len and packet_type.
> + *
> + * @param pkt
> + *  Packet to parse.
> + */
> +void gso_parse_packet(struct rte_mbuf *pkt);
> +
> +/**
> + * Internal function which updates relevant packet headers, following
> + * segmentation. This is required to update, for example, the IPv4
> + * 'total_length' field, to reflect the reduced length of the now-
> + * segmented packet.
> + *
> + * @param pkt
> + *  The original packet.
> + * @param nb_segments
> + *  The number of GSO segments into which pkt was split.
> + * @param out_segements
> + *  Pointer array used for storing mbuf addresses for GSO segments.
> + */
> +void gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
> +		struct rte_mbuf **out_segments);
> +
> +/**
> + * Internal function which divides the input packet into small segments.
> + * Each of the newly-created segments is organized as a two-segment mbuf,
> + * where the first segment is a standard mbuf, which stores a copy of
> + * packet header, and the second is an indirect mbuf which points to a
> + * section of data in the input packet.
> + *
> + * @param pkt
> + *  Packet to segment.
> + * @param pkt_hdr_offset
> + *  Packet header offset, measured in byte.
> + * @param pyld_unit_size
> + *  The max payload length of a GSO segment.
> + * @param direct_pool
> + *  MBUF pool used for allocating direct buffers for output segments.
> + * @param indirect_pool
> + *  MBUF pool used for allocating indirect buffers for output segments.
> + * @param pkts_out
> + *  Pointer array used to keep the mbuf addresses of output segments.
> + * @param nb_pkts_out
> + *  The max number of items that pkts_out can keep.
> + *
> + * @return
> + *  - The number of segments created in the event of success.
> + *  - If no GSO is performed, return 1.
> + *  - If available memory in mempools is insufficient, return -ENOMEM.
> + *  - -EINVAL for invalid parameters
> + */
> +int gso_do_segment(struct rte_mbuf *pkt,
> +		uint16_t pkt_hdr_offset,
> +		uint16_t pyld_unit_size,
> +		struct rte_mempool *direct_pool,
> +		struct rte_mempool *indirect_pool,
> +		struct rte_mbuf **pkts_out,
> +		uint16_t nb_pkts_out);
> +#endif
> diff --git a/lib/librte_gso/gso_tcp.c b/lib/librte_gso/gso_tcp.c
> new file mode 100644
> index 0000000..9d5fc30
> --- /dev/null
> +++ b/lib/librte_gso/gso_tcp.c
> @@ -0,0 +1,82 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2017 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +
> +#include <rte_ether.h>
> +#include <rte_ip.h>
> +#include <rte_tcp.h>
> +
> +#include "gso_common.h"
> +#include "gso_tcp.h"
> +
> +int
> +gso_tcp4_segment(struct rte_mbuf *pkt,
> +		uint16_t gso_size,
> +		struct rte_mempool *direct_pool,
> +		struct rte_mempool *indirect_pool,
> +		struct rte_mbuf **pkts_out,
> +		uint16_t nb_pkts_out)
> +{
> +	struct ether_hdr *eth_hdr;
> +	struct ipv4_hdr *ipv4_hdr;
> +	uint16_t tcp_dl;
> +	uint16_t pyld_unit_size;
> +	uint16_t hdr_offset;
> +	int ret = 1;
> +
> +	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
> +	ipv4_hdr = (struct ipv4_hdr *)((char *)eth_hdr + pkt->l2_len);
> +
> +	/* don't process fragmented packet */
> +	if ((ipv4_hdr->fragment_offset &
> +				rte_cpu_to_be_16(IPV4_HDR_DF_MASK)) == 0)
> +		return ret;
> +
> +	tcp_dl = rte_be_to_cpu_16(ipv4_hdr->total_length) -
> +		pkt->l3_len - pkt->l4_len;
> +	/* don't process packet without data */
> +	if (tcp_dl == 0)
> +		return ret;
> +
> +	hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
> +	pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
> +
> +	/* segment the payload */
> +	ret = gso_do_segment(pkt, hdr_offset, pyld_unit_size, direct_pool,
> +			indirect_pool, pkts_out, nb_pkts_out);
> +
> +	if (ret > 1)
> +		gso_update_pkt_headers(pkt, ret, pkts_out);
> +
> +	return ret;
> +}
> diff --git a/lib/librte_gso/gso_tcp.h b/lib/librte_gso/gso_tcp.h
> new file mode 100644
> index 0000000..f291ccb
> --- /dev/null
> +++ b/lib/librte_gso/gso_tcp.h
> @@ -0,0 +1,73 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2017 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _GSO_TCP_H_
> +#define _GSO_TCP_H_
> +
> +#include <stdint.h>
> +#include <rte_mbuf.h>
> +
> +/**
> + * Segment an IPv4/TCP packet. This function assumes the input packet has
> + * correct checksums and doesn't update checksums for GSO segment.
> + * Furthermore, it doesn't process IP fragment packets.
> + *
> + * @param pkt
> + *  The packet mbuf to segment.
> + * @param gso_size
> + *  The max length of a GSO segment, measured in bytes.
> + * @param direct_pool
> + *  MBUF pool used for allocating direct buffers for output segments.
> + * @param indirect_pool
> + *  MBUF pool used for allocating indirect buffers for output segments.
> + * @param pkts_out
> + *  Pointer array, which is used to store mbuf addresses of GSO segments.
> + *  Caller should guarantee that 'pkts_out' is sufficiently large to store
> + *  all GSO segments.
> + * @param nb_pkts_out
> + *  The max number of items that 'pkts_out' can keep.
> + *
> + * @return
> + *   - The number of GSO segments on success.
> + *   - Return 1 if no GSO is performed.
> + *   - Return -ENOMEM if available memory in mempools is insufficient.
> + *   - Return -EINVAL for invalid parameters.
> + */
> +int gso_tcp4_segment(struct rte_mbuf *pkt,
> +		uint16_t gso_size,
> +		struct rte_mempool *direct_pool,
> +		struct rte_mempool *indirect_pool,
> +		struct rte_mbuf **pkts_out,
> +		uint16_t nb_pkts_out);
> +
> +#endif
> diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
> index b81afce..fac95f2 100644
> --- a/lib/librte_gso/rte_gso.c
> +++ b/lib/librte_gso/rte_gso.c
> @@ -31,17 +31,57 @@
>   *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
>   */
> 
> +#include <rte_log.h>
> +
>  #include "rte_gso.h"
> +#include "gso_common.h"
> +#include "gso_tcp.h"
> 
>  int
>  rte_gso_segment(struct rte_mbuf *pkt,
>  		struct rte_gso_ctx gso_ctx,
>  		struct rte_mbuf **pkts_out,
> -		uint16_t nb_pkts_out __rte_unused)
> +		uint16_t nb_pkts_out)
>  {
> +	struct rte_mempool *direct_pool, *indirect_pool;
> +	struct rte_mbuf *pkt_seg;
> +	uint16_t nb_segments, gso_size;
> +
>  	if (pkt == NULL || pkts_out == NULL || gso_ctx.direct_pool ==
>  			NULL || gso_ctx.indirect_pool == NULL)
>  		return -EINVAL;

Probably we don't need to check the gso_ctx values for each incoming packet.
If you feel it is necessary - create a new function rte_gso_ctx_check() that
could be called just once per ctx.
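
Something along these lines, for example (the function name follows the suggestion above and the exact checks are assumptions, not part of the patch):

#include <errno.h>
#include <rte_gso.h>

static inline int
rte_gso_ctx_check(const struct rte_gso_ctx *ctx)
{
	/* validate the context once, instead of once per packet */
	if (ctx == NULL || ctx->direct_pool == NULL ||
			ctx->indirect_pool == NULL || ctx->gso_size == 0)
		return -EINVAL;
	return 0;
}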

> 
> -	return 1;
> +	if ((gso_ctx.gso_types & RTE_GSO_TCP_IPV4) == 0 ||
> +			gso_ctx.gso_size >= pkt->pkt_len ||
> +			gso_ctx.gso_size == 0)


The first and third conditions seem redundant.

> +		return 1;


I think you forgot here:
pkts_out[0] = pkt;


> +
> +	pkt_seg = pkt;
> +	gso_size = gso_ctx.gso_size;
> +	direct_pool = gso_ctx.direct_pool;
> +	indirect_pool = gso_ctx.indirect_pool;
> +
> +	/* Parse packet headers to determine how to segment 'pkt' */
> +	gso_parse_packet(pkt);


I don't think we need to parse the packet here.
Instead, assume that the user has already filled the packet_type and l2/l3/..._len fields correctly.

> +
> +	switch (pkt->packet_type) {
> +	case ETHER_VLAN_IPv4_TCP_PKT:
> +	case ETHER_IPv4_TCP_PKT:
> +		nb_segments = gso_tcp4_segment(pkt, gso_size,
> +				direct_pool, indirect_pool,
> +				pkts_out, nb_pkts_out);
> +		break;
> +	default:
> +		RTE_LOG(WARNING, GSO, "Unsupported packet type\n");
> +		nb_segments = 1;
> +	}
> +
> +	if (nb_segments > 1) {
> +		while (pkt_seg) {
> +			rte_mbuf_refcnt_update(pkt_seg, -1);
> +			pkt_seg = pkt_seg->next;
> +		}
> +	}
> +
> +	return nb_segments;
>  }
> diff --git a/lib/librte_gso/rte_gso.h b/lib/librte_gso/rte_gso.h
> index 5a8389a..77853fa 100644
> --- a/lib/librte_gso/rte_gso.h
> +++ b/lib/librte_gso/rte_gso.h
> @@ -46,6 +46,9 @@ extern "C" {
>  #include <stdint.h>
>  #include <rte_mbuf.h>
> 
> +#define RTE_GSO_TCP_IPV4 (1ULL << 0)
> +/**< GSO flag for TCP/IPv4 packets (containing optional VLAN tag) */
> +
>  /**
>   * GSO context structure.
>   */
> --
> 2.7.4
  
Hu, Jiayu Aug. 30, 2017, 2:55 a.m. UTC | #2
Hi Konstantin,

Thanks for your important suggestions. My feedbacks are inline.

On Wed, Aug 30, 2017 at 09:38:33AM +0800, Ananyev, Konstantin wrote:
> 
> 
> > -----Original Message-----
> > From: Hu, Jiayu
> > Sent: Thursday, August 24, 2017 3:16 PM
> > To: dev@dpdk.org
> > Cc: Kavanagh, Mark B <mark.b.kavanagh@intel.com>; Ananyev, Konstantin <konstantin.ananyev@intel.com>; Tan, Jianfeng
> > <jianfeng.tan@intel.com>; Hu, Jiayu <jiayu.hu@intel.com>
> > Subject: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
> > 
> > This patch adds GSO support for TCP/IPv4 packets. Supported packets
> > may include a single VLAN tag. TCP/IPv4 GSO assumes that all input
> > packets have correct checksums, and doesn't update checksums for output
> > packets (the responsibility for this lies with the application).
> > Additionally, TCP/IPv4 GSO doesn't process IP fragmented packets.
> > 
> > TCP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one indirect
> > MBUF, to organize an output packet. Note that we refer to these two
> > chained MBUFs as a two-segment MBUF. The direct MBUF stores the packet
> > header, while the indirect mbuf simply points to a location within the
> > original packet's payload. Consequently, use of the GSO library requires
> > multi-segment MBUF support in the TX functions of the NIC driver.
> > 
> > If a packet is GSOed, TCP/IPv4 GSO reduces its MBUF refcnt by 1. As a
> > result, when all of its GSOed segments are freed, the packet is freed
> > automatically.
> > 
> > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
> > ---
> >  lib/librte_eal/common/include/rte_log.h |   1 +
> >  lib/librte_gso/Makefile                 |   2 +
> >  lib/librte_gso/gso_common.c             | 270 ++++++++++++++++++++++++++++++++
> >  lib/librte_gso/gso_common.h             | 120 ++++++++++++++
> >  lib/librte_gso/gso_tcp.c                |  82 ++++++++++
> >  lib/librte_gso/gso_tcp.h                |  73 +++++++++
> >  lib/librte_gso/rte_gso.c                |  44 +++++-
> >  lib/librte_gso/rte_gso.h                |   3 +
> >  8 files changed, 593 insertions(+), 2 deletions(-)
> >  create mode 100644 lib/librte_gso/gso_common.c
> >  create mode 100644 lib/librte_gso/gso_common.h
> >  create mode 100644 lib/librte_gso/gso_tcp.c
> >  create mode 100644 lib/librte_gso/gso_tcp.h
> > 
> > diff --git a/lib/librte_eal/common/include/rte_log.h b/lib/librte_eal/common/include/rte_log.h
> > index ec8dba7..2fa1199 100644
> > --- a/lib/librte_eal/common/include/rte_log.h
> > +++ b/lib/librte_eal/common/include/rte_log.h
> > @@ -87,6 +87,7 @@ extern struct rte_logs rte_logs;
> >  #define RTE_LOGTYPE_CRYPTODEV 17 /**< Log related to cryptodev. */
> >  #define RTE_LOGTYPE_EFD       18 /**< Log related to EFD. */
> >  #define RTE_LOGTYPE_EVENTDEV  19 /**< Log related to eventdev. */
> > +#define RTE_LOGTYPE_GSO       20 /**< Log related to GSO. */
> > 
> >  /* these log types can be used in an application */
> >  #define RTE_LOGTYPE_USER1     24 /**< User-defined log type 1. */
> > diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile
> > index aeaacbc..0f8e38f 100644
> > --- a/lib/librte_gso/Makefile
> > +++ b/lib/librte_gso/Makefile
> > @@ -42,6 +42,8 @@ LIBABIVER := 1
> > 
> >  #source files
> >  SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c
> > +SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_common.c
> > +SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tcp.c
> > 
> >  # install this header file
> >  SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h
> > diff --git a/lib/librte_gso/gso_common.c b/lib/librte_gso/gso_common.c
> > new file mode 100644
> > index 0000000..2b54fbd
> > --- /dev/null
> > +++ b/lib/librte_gso/gso_common.c
> > @@ -0,0 +1,270 @@
> > +/*-
> > + *   BSD LICENSE
> > + *
> > + *   Copyright(c) 2017 Intel Corporation. All rights reserved.
> > + *   All rights reserved.
> > + *
> > + *   Redistribution and use in source and binary forms, with or without
> > + *   modification, are permitted provided that the following conditions
> > + *   are met:
> > + *
> > + *     * Redistributions of source code must retain the above copyright
> > + *       notice, this list of conditions and the following disclaimer.
> > + *     * Redistributions in binary form must reproduce the above copyright
> > + *       notice, this list of conditions and the following disclaimer in
> > + *       the documentation and/or other materials provided with the
> > + *       distribution.
> > + *     * Neither the name of Intel Corporation nor the names of its
> > + *       contributors may be used to endorse or promote products derived
> > + *       from this software without specific prior written permission.
> > + *
> > + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> > + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> > + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> > + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> > + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> > + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> > + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> > + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> > + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> > + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> > + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> > + */
> > +
> > +#include <stdbool.h>
> > +#include <string.h>
> > +
> > +#include <rte_malloc.h>
> > +
> > +#include <rte_ether.h>
> > +#include <rte_ip.h>
> > +#include <rte_tcp.h>
> > +
> > +#include "gso_common.h"
> > +
> > +static inline void
> > +hdr_segment_init(struct rte_mbuf *hdr_segment, struct rte_mbuf *pkt,
> > +		uint16_t pkt_hdr_offset)
> > +{
> > +	/* copy mbuf metadata */
> > +	hdr_segment->nb_segs = 1;
> > +	hdr_segment->port = pkt->port;
> > +	hdr_segment->ol_flags = pkt->ol_flags;
> > +	hdr_segment->packet_type = pkt->packet_type;
> > +	hdr_segment->pkt_len = pkt_hdr_offset;
> > +	hdr_segment->data_len = pkt_hdr_offset;
> > +	hdr_segment->tx_offload = pkt->tx_offload;
> > +	/* copy packet header */
> > +	rte_memcpy(rte_pktmbuf_mtod(hdr_segment, char *),
> > +			rte_pktmbuf_mtod(pkt, char *),
> > +			pkt_hdr_offset);
> > +}
> > +
> > +static inline void
> > +free_gso_segment(struct rte_mbuf **pkts, uint16_t nb_pkts)
> > +{
> > +	uint16_t i;
> > +
> > +	for (i = 0; i < nb_pkts; i++) {
> > +		rte_pktmbuf_detach(pkts[i]->next);
> 
> I don't think you need to call detach() here explicitly.
> Just rte_pktmbuf_free(pkts[i]) should do I think.

Yes, rte_pktmbuf_free() is enough. I will modify it. Thanks.

> 
> > +		rte_pktmbuf_free(pkts[i]);
> > +		pkts[i] = NULL;
> > +	}
> > +}
> > +
> > +int
> > +gso_do_segment(struct rte_mbuf *pkt,
> > +		uint16_t pkt_hdr_offset,
> > +		uint16_t pyld_unit_size,
> > +		struct rte_mempool *direct_pool,
> > +		struct rte_mempool *indirect_pool,
> > +		struct rte_mbuf **pkts_out,
> > +		uint16_t nb_pkts_out)
> > +{
> > +	struct rte_mbuf *pkt_in;
> > +	struct rte_mbuf *hdr_segment, *pyld_segment;
> > +	uint32_t pkt_in_pyld_off;
> > +	uint16_t pkt_in_segment_len, pkt_out_segment_len;
> > +	uint16_t nb_segs;
> > +	bool pkt_in_segment_processed;
> > +
> > +	pkt_in_pyld_off = pkt->data_off + pkt_hdr_offset;
> > +	pkt_in = pkt;
> > +	nb_segs = 0;
> > +
> > +	while (pkt_in) {
> > +		pkt_in_segment_processed = false;
> > +		pkt_in_segment_len = pkt_in->data_off + pkt_in->data_len;
> > +
> > +		while (!pkt_in_segment_processed) {
> > +			if (unlikely(nb_segs >= nb_pkts_out)) {
> > +				free_gso_segment(pkts_out, nb_segs);
> > +				return -EINVAL;
> > +			}
> > +
> > +			/* allocate direct mbuf */
> > +			hdr_segment = rte_pktmbuf_alloc(direct_pool);
> > +			if (unlikely(hdr_segment == NULL)) {
> > +				free_gso_segment(pkts_out, nb_segs);
> > +				return -ENOMEM;
> > +			}
> > +
> > +			/* allocate indirect mbuf */
> > +			pyld_segment = rte_pktmbuf_alloc(indirect_pool);
> > +			if (unlikely(pyld_segment == NULL)) {
> > +				rte_pktmbuf_free(hdr_segment);
> > +				free_gso_segment(pkts_out, nb_segs);
> > +				return -ENOMEM;
> > +			}
> 
> So, if I understand correctly each new packet would always contain just one data segment?
> Why several data segments couldn't be chained together (if sum of their data_len <= mss)?
> In a same way as done here:
> http://dpdk.org/browse/dpdk/tree/lib/librte_ip_frag/rte_ipv4_fragmentation.c#n93
> or here:
> https://gerrit.fd.io/r/gitweb?p=tldk.git;a=blob;f=lib/libtle_l4p/tcp_tx_seg.h;h=a8d2425597a7ad6f598aa4bb7fcd7f1da74305f0;hb=HEAD#l23
> ?

Oh, yes. I can chain these data segments when their total length is less than the GSO segsz.
I will change it in the next patch. Thanks very much.

> 
> > +
> > +			/* copy packet header */
> > +			hdr_segment_init(hdr_segment, pkt, pkt_hdr_offset);
> > +
> > +			/* attach payload mbuf to current packet segment */
> > +			rte_pktmbuf_attach(pyld_segment, pkt_in);
> > +
> > +			hdr_segment->next = pyld_segment;
> > +			pkts_out[nb_segs++] = hdr_segment;
> > +
> > +			/* calculate payload length */
> > +			pkt_out_segment_len = pyld_unit_size;
> > +			if (pkt_in_pyld_off + pkt_out_segment_len >
> > +					pkt_in_segment_len) {
> > +				pkt_out_segment_len = pkt_in_segment_len -
> > +					pkt_in_pyld_off;
> > +			}
> > +
> > +			/* update payload segment */
> > +			pyld_segment->data_off = pkt_in_pyld_off;
> > +			pyld_segment->data_len = pkt_out_segment_len;
> > +
> > +			/* update header segment */
> > +			hdr_segment->pkt_len += pyld_segment->data_len;
> > +			hdr_segment->nb_segs++;
> > +
> > +			/* update pkt_in_pyld_off */
> > +			pkt_in_pyld_off += pkt_out_segment_len;
> > +			if (pkt_in_pyld_off == pkt_in_segment_len)
> > +				pkt_in_segment_processed = true;
> > +		}
> > +
> > +		/* 'pkt_in' may contain numerous segments */
> > +		pkt_in = pkt_in->next;
> > +		if (pkt_in != NULL)
> > +			pkt_in_pyld_off = pkt_in->data_off;
> > +	}
> > +	return nb_segs;
> > +}
> > +
> > +static inline void
> > +parse_ipv4(struct ipv4_hdr *ipv4_hdr, struct rte_mbuf *pkt)
> > +{
> > +	struct tcp_hdr *tcp_hdr;
> > +
> > +	switch (ipv4_hdr->next_proto_id) {
> > +	case IPPROTO_TCP:
> > +		pkt->packet_type |= RTE_PTYPE_L4_TCP;
> > +		pkt->l3_len = IPv4_HDR_LEN(ipv4_hdr);
> > +		tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
> > +		pkt->l4_len = TCP_HDR_LEN(tcp_hdr);
> > +		break;
> > +	}
> > +}
> > +
> > +static inline void
> > +parse_ethernet(struct ether_hdr *eth_hdr, struct rte_mbuf *pkt)
> > +{
> > +	struct ipv4_hdr *ipv4_hdr;
> > +	struct vlan_hdr *vlan_hdr;
> > +	uint16_t ethertype;
> > +
> > +	ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
> > +	if (ethertype == ETHER_TYPE_VLAN) {
> > +		vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
> > +		pkt->l2_len = sizeof(struct vlan_hdr);
> > +		pkt->packet_type |= RTE_PTYPE_L2_ETHER_VLAN;
> > +		ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
> > +	}
> > +
> > +	switch (ethertype) {
> > +	case ETHER_TYPE_IPv4:
> > +		if (IS_VLAN_PKT(pkt)) {
> > +			pkt->packet_type |= RTE_PTYPE_L3_IPV4;
> > +		} else {
> > +			pkt->packet_type |= RTE_PTYPE_L2_ETHER;
> > +			pkt->packet_type |= RTE_PTYPE_L3_IPV4;
> > +		}
> > +		pkt->l2_len += sizeof(struct ether_hdr);
> > +		ipv4_hdr = (struct ipv4_hdr *) ((char *)eth_hdr +
> > +				pkt->l2_len);
> > +		parse_ipv4(ipv4_hdr, pkt);
> > +		break;
> > +	}
> > +}
> > +
> > +void
> > +gso_parse_packet(struct rte_mbuf *pkt)
> 
> There is a function rte_net_get_ptype() that is supposed to provide similar functionality.
> So we probably don't need to create a new SW parse function here; it would be better
> to reuse (and update if needed) the existing one.
> Again, the user might already have the l2/l3/l4/..._len and packet_type fields set up.
> So it is better to keep SW packet parsing out of the scope of this library.

Hmm, I know we have discussed this design choice in the GRO library, and I also think it's
better to reuse these values.

But from the perspective of OVS, it may add extra overhead, since OVS doesn't parse every
packet originally. Maybe @Mark can give us more input from the OVS point of view.

> 
> > +{
> > +	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
> > +
> > +	pkt->packet_type = pkt->tx_offload = 0;
> > +	parse_ethernet(eth_hdr, pkt);
> > +}
> > +
> > +static inline void
> > +update_ipv4_header(char *base, uint16_t offset, uint16_t length, uint16_t id)
> > +{
> > +	struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)(base + offset);
> > +
> > +	ipv4_hdr->total_length = rte_cpu_to_be_16(length - offset);
> > +	ipv4_hdr->packet_id = rte_cpu_to_be_16(id);
> > +}
> > +
> > +static inline void
> > +update_tcp_header(char *base, uint16_t offset, uint32_t sent_seq,
> > +	uint8_t non_tail)
> > +{
> > +	struct tcp_hdr *tcp_hdr = (struct tcp_hdr *)(base + offset);
> > +
> > +	tcp_hdr->sent_seq = rte_cpu_to_be_32(sent_seq);
> > +	/* clean FIN and PSH for non-tail segments */
> > +	if (non_tail)
> > +		tcp_hdr->tcp_flags &= (~(TCP_HDR_PSH_MASK | TCP_HDR_FIN_MASK));
> > +}
> > +
> > +void
> > +gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
> > +		struct rte_mbuf **out_segments)
> > +{
> > +	struct ipv4_hdr *ipv4_hdr;
> > +	struct tcp_hdr *tcp_hdr;
> > +	struct rte_mbuf *seg;
> > +	uint32_t sent_seq;
> > +	uint16_t offset, i;
> > +	uint16_t tail_seg_idx = nb_segments - 1, id;
> > +
> > +	switch (pkt->packet_type) {
> > +	case ETHER_VLAN_IPv4_TCP_PKT:
> > +	case ETHER_IPv4_TCP_PKT:
> 
> Might be worth putting the code below in a separate function:
> update_inner_tcp_hdr(..) or so.
> Then you can reuse it for tunneled cases too.

Yes, I will modify it in the next patch. Thanks.

> 
> > +		ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
> > +				pkt->l2_len);
> > +		tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
> > +		id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
> > +		sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
> > +
> > +		for (i = 0; i < nb_segments; i++) {
> > +			seg = out_segments[i];
> > +
> > +			offset = seg->l2_len;
> > +			update_ipv4_header(rte_pktmbuf_mtod(seg, char *),
> > +					offset, seg->pkt_len, id);
> > +			id++;
> 
> Who would be responsible for making sure that we don't end up with consecutive packets carrying the same IPv4 id?
> Would it be the upper layer that forms the packet, the gso library, or ...?

Oh yes, I ignored this important issue. I don't think applications can guarantee it.
I will check how Linux handles this and try to figure out a way. Thanks for the reminder.

> 
> > +
> > +			offset += seg->l3_len;
> > +			update_tcp_header(rte_pktmbuf_mtod(seg, char *),
> > +					offset, sent_seq, i < tail_seg_idx);
> > +			sent_seq += seg->next->data_len;
> > +		}
> > +		break;
> > +	}
> > +}
> > diff --git a/lib/librte_gso/gso_common.h b/lib/librte_gso/gso_common.h
> > new file mode 100644
> > index 0000000..d750041
> > --- /dev/null
> > +++ b/lib/librte_gso/gso_common.h
> > @@ -0,0 +1,120 @@
> > +/*-
> > + *   BSD LICENSE
> > + *
> > + *   Copyright(c) 2017 Intel Corporation. All rights reserved.
> > + *   All rights reserved.
> > + *
> > + *   Redistribution and use in source and binary forms, with or without
> > + *   modification, are permitted provided that the following conditions
> > + *   are met:
> > + *
> > + *     * Redistributions of source code must retain the above copyright
> > + *       notice, this list of conditions and the following disclaimer.
> > + *     * Redistributions in binary form must reproduce the above copyright
> > + *       notice, this list of conditions and the following disclaimer in
> > + *       the documentation and/or other materials provided with the
> > + *       distribution.
> > + *     * Neither the name of Intel Corporation nor the names of its
> > + *       contributors may be used to endorse or promote products derived
> > + *       from this software without specific prior written permission.
> > + *
> > + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> > + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> > + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> > + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> > + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> > + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> > + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> > + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> > + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> > + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> > + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> > + */
> > +
> > +#ifndef _GSO_COMMON_H_
> > +#define _GSO_COMMON_H_
> > +
> > +#include <stdint.h>
> > +#include <rte_mbuf.h>
> > +
> > +#define IPV4_HDR_DF_SHIFT 14
> > +#define IPV4_HDR_DF_MASK (1 << IPV4_HDR_DF_SHIFT)
> > +#define IPv4_HDR_LEN(iph) ((iph->version_ihl & 0x0f) * 4)
> > +
> > +#define TCP_HDR_PSH_MASK ((uint8_t)0x08)
> > +#define TCP_HDR_FIN_MASK ((uint8_t)0x01)
> > +#define TCP_HDR_LEN(tcph) ((tcph->data_off & 0xf0) >> 2)
> > +
> > +#define ETHER_IPv4_PKT (RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4)
> > +/* Supported packet types */
> > +/* TCP/IPv4 packet. */
> > +#define ETHER_IPv4_TCP_PKT (ETHER_IPv4_PKT | RTE_PTYPE_L4_TCP)
> > +
> > +/* TCP/IPv4 packet with VLAN tag. */
> > +#define ETHER_VLAN_IPv4_TCP_PKT (RTE_PTYPE_L2_ETHER_VLAN | \
> > +		RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP)
> > +
> > +#define IS_VLAN_PKT(pkt) ((pkt->packet_type & RTE_PTYPE_L2_ETHER_VLAN) == \
> > +		RTE_PTYPE_L2_ETHER_VLAN)
> > +
> > +/**
> > + * Internal function which parses a packet, setting outer_l2/l3_len and
> > + * l2/l3/l4_len and packet_type.
> > + *
> > + * @param pkt
> > + *  Packet to parse.
> > + */
> > +void gso_parse_packet(struct rte_mbuf *pkt);
> > +
> > +/**
> > + * Internal function which updates relevant packet headers, following
> > + * segmentation. This is required to update, for example, the IPv4
> > + * 'total_length' field, to reflect the reduced length of the now-
> > + * segmented packet.
> > + *
> > + * @param pkt
> > + *  The original packet.
> > + * @param nb_segments
> > + *  The number of GSO segments into which pkt was split.
> > + * @param out_segements
> > + *  Pointer array used for storing mbuf addresses for GSO segments.
> > + */
> > +void gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
> > +		struct rte_mbuf **out_segments);
> > +
> > +/**
> > + * Internal function which divides the input packet into small segments.
> > + * Each of the newly-created segments is organized as a two-segment mbuf,
> > + * where the first segment is a standard mbuf, which stores a copy of
> > + * packet header, and the second is an indirect mbuf which points to a
> > + * section of data in the input packet.
> > + *
> > + * @param pkt
> > + *  Packet to segment.
> > + * @param pkt_hdr_offset
> > + *  Packet header offset, measured in byte.
> > + * @param pyld_unit_size
> > + *  The max payload length of a GSO segment.
> > + * @param direct_pool
> > + *  MBUF pool used for allocating direct buffers for output segments.
> > + * @param indirect_pool
> > + *  MBUF pool used for allocating indirect buffers for output segments.
> > + * @param pkts_out
> > + *  Pointer array used to keep the mbuf addresses of output segments.
> > + * @param nb_pkts_out
> > + *  The max number of items that pkts_out can keep.
> > + *
> > + * @return
> > + *  - The number of segments created in the event of success.
> > + *  - If no GSO is performed, return 1.
> > + *  - If available memory in mempools is insufficient, return -ENOMEM.
> > + *  - -EINVAL for invalid parameters
> > + */
> > +int gso_do_segment(struct rte_mbuf *pkt,
> > +		uint16_t pkt_hdr_offset,
> > +		uint16_t pyld_unit_size,
> > +		struct rte_mempool *direct_pool,
> > +		struct rte_mempool *indirect_pool,
> > +		struct rte_mbuf **pkts_out,
> > +		uint16_t nb_pkts_out);
> > +#endif
> > diff --git a/lib/librte_gso/gso_tcp.c b/lib/librte_gso/gso_tcp.c
> > new file mode 100644
> > index 0000000..9d5fc30
> > --- /dev/null
> > +++ b/lib/librte_gso/gso_tcp.c
> > @@ -0,0 +1,82 @@
> > +/*-
> > + *   BSD LICENSE
> > + *
> > + *   Copyright(c) 2017 Intel Corporation. All rights reserved.
> > + *   All rights reserved.
> > + *
> > + *   Redistribution and use in source and binary forms, with or without
> > + *   modification, are permitted provided that the following conditions
> > + *   are met:
> > + *
> > + *     * Redistributions of source code must retain the above copyright
> > + *       notice, this list of conditions and the following disclaimer.
> > + *     * Redistributions in binary form must reproduce the above copyright
> > + *       notice, this list of conditions and the following disclaimer in
> > + *       the documentation and/or other materials provided with the
> > + *       distribution.
> > + *     * Neither the name of Intel Corporation nor the names of its
> > + *       contributors may be used to endorse or promote products derived
> > + *       from this software without specific prior written permission.
> > + *
> > + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> > + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> > + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> > + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> > + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> > + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> > + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> > + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> > + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> > + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> > + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> > + */
> > +
> > +
> > +#include <rte_ether.h>
> > +#include <rte_ip.h>
> > +#include <rte_tcp.h>
> > +
> > +#include "gso_common.h"
> > +#include "gso_tcp.h"
> > +
> > +int
> > +gso_tcp4_segment(struct rte_mbuf *pkt,
> > +		uint16_t gso_size,
> > +		struct rte_mempool *direct_pool,
> > +		struct rte_mempool *indirect_pool,
> > +		struct rte_mbuf **pkts_out,
> > +		uint16_t nb_pkts_out)
> > +{
> > +	struct ether_hdr *eth_hdr;
> > +	struct ipv4_hdr *ipv4_hdr;
> > +	uint16_t tcp_dl;
> > +	uint16_t pyld_unit_size;
> > +	uint16_t hdr_offset;
> > +	int ret = 1;
> > +
> > +	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
> > +	ipv4_hdr = (struct ipv4_hdr *)((char *)eth_hdr + pkt->l2_len);
> > +
> > +	/* don't process fragmented packet */
> > +	if ((ipv4_hdr->fragment_offset &
> > +				rte_cpu_to_be_16(IPV4_HDR_DF_MASK)) == 0)
> > +		return ret;
> > +
> > +	tcp_dl = rte_be_to_cpu_16(ipv4_hdr->total_length) -
> > +		pkt->l3_len - pkt->l4_len;
> > +	/* don't process packet without data */
> > +	if (tcp_dl == 0)
> > +		return ret;
> > +
> > +	hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
> > +	pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
> > +
> > +	/* segment the payload */
> > +	ret = gso_do_segment(pkt, hdr_offset, pyld_unit_size, direct_pool,
> > +			indirect_pool, pkts_out, nb_pkts_out);
> > +
> > +	if (ret > 1)
> > +		gso_update_pkt_headers(pkt, ret, pkts_out);
> > +
> > +	return ret;
> > +}
> > diff --git a/lib/librte_gso/gso_tcp.h b/lib/librte_gso/gso_tcp.h
> > new file mode 100644
> > index 0000000..f291ccb
> > --- /dev/null
> > +++ b/lib/librte_gso/gso_tcp.h
> > @@ -0,0 +1,73 @@
> > +/*-
> > + *   BSD LICENSE
> > + *
> > + *   Copyright(c) 2017 Intel Corporation. All rights reserved.
> > + *   All rights reserved.
> > + *
> > + *   Redistribution and use in source and binary forms, with or without
> > + *   modification, are permitted provided that the following conditions
> > + *   are met:
> > + *
> > + *     * Redistributions of source code must retain the above copyright
> > + *       notice, this list of conditions and the following disclaimer.
> > + *     * Redistributions in binary form must reproduce the above copyright
> > + *       notice, this list of conditions and the following disclaimer in
> > + *       the documentation and/or other materials provided with the
> > + *       distribution.
> > + *     * Neither the name of Intel Corporation nor the names of its
> > + *       contributors may be used to endorse or promote products derived
> > + *       from this software without specific prior written permission.
> > + *
> > + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> > + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> > + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> > + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> > + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> > + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> > + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> > + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> > + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> > + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> > + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> > + */
> > +
> > +#ifndef _GSO_TCP_H_
> > +#define _GSO_TCP_H_
> > +
> > +#include <stdint.h>
> > +#include <rte_mbuf.h>
> > +
> > +/**
> > + * Segment an IPv4/TCP packet. This function assumes the input packet has
> > + * correct checksums and doesn't update checksums for GSO segment.
> > + * Furthermore, it doesn't process IP fragment packets.
> > + *
> > + * @param pkt
> > + *  The packet mbuf to segment.
> > + * @param gso_size
> > + *  The max length of a GSO segment, measured in bytes.
> > + * @param direct_pool
> > + *  MBUF pool used for allocating direct buffers for output segments.
> > + * @param indirect_pool
> > + *  MBUF pool used for allocating indirect buffers for output segments.
> > + * @param pkts_out
> > + *  Pointer array, which is used to store mbuf addresses of GSO segments.
> > + *  Caller should guarantee that 'pkts_out' is sufficiently large to store
> > + *  all GSO segments.
> > + * @param nb_pkts_out
> > + *  The max number of items that 'pkts_out' can keep.
> > + *
> > + * @return
> > + *   - The number of GSO segments on success.
> > + *   - Return 1 if no GSO is performed.
> > + *   - Return -ENOMEM if available memory in mempools is insufficient.
> > + *   - Return -EINVAL for invalid parameters.
> > + */
> > +int gso_tcp4_segment(struct rte_mbuf *pkt,
> > +		uint16_t gso_size,
> > +		struct rte_mempool *direct_pool,
> > +		struct rte_mempool *indirect_pool,
> > +		struct rte_mbuf **pkts_out,
> > +		uint16_t nb_pkts_out);
> > +
> > +#endif
> > diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
> > index b81afce..fac95f2 100644
> > --- a/lib/librte_gso/rte_gso.c
> > +++ b/lib/librte_gso/rte_gso.c
> > @@ -31,17 +31,57 @@
> >   *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> >   */
> > 
> > +#include <rte_log.h>
> > +
> >  #include "rte_gso.h"
> > +#include "gso_common.h"
> > +#include "gso_tcp.h"
> > 
> >  int
> >  rte_gso_segment(struct rte_mbuf *pkt,
> >  		struct rte_gso_ctx gso_ctx,
> >  		struct rte_mbuf **pkts_out,
> > -		uint16_t nb_pkts_out __rte_unused)
> > +		uint16_t nb_pkts_out)
> >  {
> > +	struct rte_mempool *direct_pool, *indirect_pool;
> > +	struct rte_mbuf *pkt_seg;
> > +	uint16_t nb_segments, gso_size;
> > +
> >  	if (pkt == NULL || pkts_out == NULL || gso_ctx.direct_pool ==
> >  			NULL || gso_ctx.indirect_pool == NULL)
> >  		return -EINVAL;
> 
> Probably we don't need to check gso_ctx values for each incoming packet.
> If you feel it is necessary - create  new function rte_gso_ctx_check() that
> could be performed just once per ctx.

Agree. I will change it. Thanks.
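For illustration, a minimal sketch of such a one-time check (the function name and the
exact set of checks follow the suggestion above and are hypothetical, not part of this
patch):

	/* Hypothetical helper: validate a gso_ctx once, instead of per packet. */
	static inline int
	rte_gso_ctx_check(const struct rte_gso_ctx *ctx)
	{
		if (ctx == NULL || ctx->direct_pool == NULL ||
				ctx->indirect_pool == NULL ||
				ctx->gso_size == 0 ||
				(ctx->gso_types & RTE_GSO_TCP_IPV4) == 0)
			return -EINVAL;
		return 0;
	}

rte_gso_segment() would then only keep the cheap per-packet checks (pkt and pkts_out
being non-NULL, and gso_size vs. pkt_len).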

> 
> > 
> > -	return 1;
> > +	if ((gso_ctx.gso_types & RTE_GSO_TCP_IPV4) == 0 ||
> > +			gso_ctx.gso_size >= pkt->pkt_len ||
> > +			gso_ctx.gso_size == 0)
> 
> 
> First and third condition seems redundant.

The reason to check gso_ctx.gso_types here is that we don't perform
GSO if applications haven't set RTE_GSO_TCP_IPV4 in gso_ctx.gso_types,
even if the input packet is TCP/IPv4. And if gso_ctx.gso_size is 0,
we don't need to execute the following code. So should we still
remove these two conditions?

> 
> > +		return 1;
> 
> 
> I think you forgot here:
> pkts_out[0] = pkt;

But why should we keep the input packet in the output array? Currently, if
GSO is not performed, no packets are stored in pkts_out[]. Applications
can detect this from the return value of 1 from rte_gso_segment().
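A usage sketch of the semantics described above (assuming pkts_out[] is left untouched
when 1 is returned; 'port' and 'queue' are illustrative variables, and pkts_out[] is
sized by the application):

	ret = rte_gso_segment(pkt, gso_ctx, pkts_out, RTE_DIM(pkts_out));
	if (ret > 1) {
		/* pkt was segmented; send the GSO segments instead. */
		rte_eth_tx_burst(port, queue, pkts_out, ret);
	} else if (ret == 1) {
		/* no GSO performed; the original mbuf is still owned by the caller. */
		rte_eth_tx_burst(port, queue, &pkt, 1);
	} else {
		/* ret < 0: -EINVAL or -ENOMEM, drop the packet. */
		rte_pktmbuf_free(pkt);
	}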

> 
> 
> > +
> > +	pkt_seg = pkt;
> > +	gso_size = gso_ctx.gso_size;
> > +	direct_pool = gso_ctx.direct_pool;
> > +	indirect_pool = gso_ctx.indirect_pool;
> > +
> > +	/* Parse packet headers to determine how to segment 'pkt' */
> > +	gso_parse_packet(pkt);
> 
> 
> I don't think we need to parse packet here.
>  Instead assume that user already filled packet_type and l2/l3/..._len fields correctly.

Hmm, I see it. Thanks.

Thanks,
Jiayu
  
Hu, Jiayu Aug. 30, 2017, 9:03 a.m. UTC | #3
Hi Konstantin,

On Wed, Aug 30, 2017 at 09:38:33AM +0800, Ananyev, Konstantin wrote:
> 
> 
> > -----Original Message-----
> > From: Hu, Jiayu
> > Sent: Thursday, August 24, 2017 3:16 PM
> > To: dev@dpdk.org
> > Cc: Kavanagh, Mark B <mark.b.kavanagh@intel.com>; Ananyev, Konstantin <konstantin.ananyev@intel.com>; Tan, Jianfeng
> > <jianfeng.tan@intel.com>; Hu, Jiayu <jiayu.hu@intel.com>
> > Subject: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
> > 
> > This patch adds GSO support for TCP/IPv4 packets. Supported packets
> > may include a single VLAN tag. TCP/IPv4 GSO assumes that all input
> > packets have correct checksums, and doesn't update checksums for output
> > packets (the responsibility for this lies with the application).
> > Additionally, TCP/IPv4 GSO doesn't process IP fragmented packets.
> > 
> > TCP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one indrect
> > MBUF, to organize an output packet. Note that we refer to these two
> > chained MBUFs as a two-segment MBUF. The direct MBUF stores the packet
> > header, while the indirect mbuf simply points to a location within the
> > original packet's payload. Consequently, use of the GSO library requires
> > multi-segment MBUF support in the TX functions of the NIC driver.
> > 
> > If a packet is GSOed, TCP/IPv4 GSO reduces its MBUF refcnt by 1. As a
> > result, when all of its GSOed segments are freed, the packet is freed
> > automatically.
> > 
> > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
> > ---
> >  int
> >  rte_gso_segment(struct rte_mbuf *pkt,
> >  		struct rte_gso_ctx gso_ctx,
> >  		struct rte_mbuf **pkts_out,
> > -		uint16_t nb_pkts_out __rte_unused)
> > +		uint16_t nb_pkts_out)
> >  {
> > +	struct rte_mempool *direct_pool, *indirect_pool;
> > +	struct rte_mbuf *pkt_seg;
> > +	uint16_t nb_segments, gso_size;
> > +
> >  	if (pkt == NULL || pkts_out == NULL || gso_ctx.direct_pool ==
> >  			NULL || gso_ctx.indirect_pool == NULL)
> >  		return -EINVAL;
> 
> Probably we don't need to check gso_ctx values for each incoming packet.
> If you feel it is necessary - create  new function rte_gso_ctx_check() that
> could be performed just once per ctx.
> 
> > 
> > -	return 1;
> > +	if ((gso_ctx.gso_types & RTE_GSO_TCP_IPV4) == 0 ||
> > +			gso_ctx.gso_size >= pkt->pkt_len ||
> > +			gso_ctx.gso_size == 0)
> 
> 
> First and third condition seems redundant.

Yes, we don't need the first and the third check here. Please ignore the redundant
reply in the previous mail.

Thanks,
Jiayu
  
Mark Kavanagh Aug. 30, 2017, 9:25 a.m. UTC | #4
>From: Hu, Jiayu
>Sent: Wednesday, August 30, 2017 3:56 AM
>To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
>Cc: dev@dpdk.org; Kavanagh, Mark B <mark.b.kavanagh@intel.com>; Tan, Jianfeng
><jianfeng.tan@intel.com>
>Subject: Re: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
>
>Hi Konstantin,
>
>Thanks for your important suggestions. My feedbacks are inline.
>
>On Wed, Aug 30, 2017 at 09:38:33AM +0800, Ananyev, Konstantin wrote:
>>
>>
>> > -----Original Message-----
>> > From: Hu, Jiayu
>> > Sent: Thursday, August 24, 2017 3:16 PM
>> > To: dev@dpdk.org
>> > Cc: Kavanagh, Mark B <mark.b.kavanagh@intel.com>; Ananyev, Konstantin
><konstantin.ananyev@intel.com>; Tan, Jianfeng
>> > <jianfeng.tan@intel.com>; Hu, Jiayu <jiayu.hu@intel.com>
>> > Subject: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
>> >
>> > This patch adds GSO support for TCP/IPv4 packets. Supported packets
>> > may include a single VLAN tag. TCP/IPv4 GSO assumes that all input
>> > packets have correct checksums, and doesn't update checksums for output
>> > packets (the responsibility for this lies with the application).
>> > Additionally, TCP/IPv4 GSO doesn't process IP fragmented packets.
>> >
>> > TCP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one indrect
>> > MBUF, to organize an output packet. Note that we refer to these two
>> > chained MBUFs as a two-segment MBUF. The direct MBUF stores the packet
>> > header, while the indirect mbuf simply points to a location within the
>> > original packet's payload. Consequently, use of the GSO library requires
>> > multi-segment MBUF support in the TX functions of the NIC driver.
>> >
>> > If a packet is GSOed, TCP/IPv4 GSO reduces its MBUF refcnt by 1. As a
>> > result, when all of its GSOed segments are freed, the packet is freed
>> > automatically.
>> >
>> > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
>> > Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
>> > ---
>> >  lib/librte_eal/common/include/rte_log.h |   1 +
>> >  lib/librte_gso/Makefile                 |   2 +
>> >  lib/librte_gso/gso_common.c             | 270
>++++++++++++++++++++++++++++++++
>> >  lib/librte_gso/gso_common.h             | 120 ++++++++++++++
>> >  lib/librte_gso/gso_tcp.c                |  82 ++++++++++
>> >  lib/librte_gso/gso_tcp.h                |  73 +++++++++
>> >  lib/librte_gso/rte_gso.c                |  44 +++++-
>> >  lib/librte_gso/rte_gso.h                |   3 +
>> >  8 files changed, 593 insertions(+), 2 deletions(-)
>> >  create mode 100644 lib/librte_gso/gso_common.c
>> >  create mode 100644 lib/librte_gso/gso_common.h
>> >  create mode 100644 lib/librte_gso/gso_tcp.c
>> >  create mode 100644 lib/librte_gso/gso_tcp.h
>> >
>> > diff --git a/lib/librte_eal/common/include/rte_log.h
>b/lib/librte_eal/common/include/rte_log.h
>> > index ec8dba7..2fa1199 100644
>> > --- a/lib/librte_eal/common/include/rte_log.h
>> > +++ b/lib/librte_eal/common/include/rte_log.h
>> > @@ -87,6 +87,7 @@ extern struct rte_logs rte_logs;
>> >  #define RTE_LOGTYPE_CRYPTODEV 17 /**< Log related to cryptodev. */
>> >  #define RTE_LOGTYPE_EFD       18 /**< Log related to EFD. */
>> >  #define RTE_LOGTYPE_EVENTDEV  19 /**< Log related to eventdev. */
>> > +#define RTE_LOGTYPE_GSO       20 /**< Log related to GSO. */
>> >
>> >  /* these log types can be used in an application */
>> >  #define RTE_LOGTYPE_USER1     24 /**< User-defined log type 1. */
>> > diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile
>> > index aeaacbc..0f8e38f 100644
>> > --- a/lib/librte_gso/Makefile
>> > +++ b/lib/librte_gso/Makefile
>> > @@ -42,6 +42,8 @@ LIBABIVER := 1
>> >
>> >  #source files
>> >  SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c
>> > +SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_common.c
>> > +SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tcp.c
>> >
>> >  # install this header file
>> >  SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h
>> > diff --git a/lib/librte_gso/gso_common.c b/lib/librte_gso/gso_common.c
>> > new file mode 100644
>> > index 0000000..2b54fbd
>> > --- /dev/null
>> > +++ b/lib/librte_gso/gso_common.c
>> > @@ -0,0 +1,270 @@
>> > +/*-
>> > + *   BSD LICENSE
>> > + *
>> > + *   Copyright(c) 2017 Intel Corporation. All rights reserved.
>> > + *   All rights reserved.
>> > + *
>> > + *   Redistribution and use in source and binary forms, with or without
>> > + *   modification, are permitted provided that the following conditions
>> > + *   are met:
>> > + *
>> > + *     * Redistributions of source code must retain the above copyright
>> > + *       notice, this list of conditions and the following disclaimer.
>> > + *     * Redistributions in binary form must reproduce the above
>copyright
>> > + *       notice, this list of conditions and the following disclaimer in
>> > + *       the documentation and/or other materials provided with the
>> > + *       distribution.
>> > + *     * Neither the name of Intel Corporation nor the names of its
>> > + *       contributors may be used to endorse or promote products derived
>> > + *       from this software without specific prior written permission.
>> > + *
>> > + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
>> > + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
>> > + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
>FOR
>> > + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
>> > + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
>INCIDENTAL,
>> > + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
>> > + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
>USE,
>> > + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
>ANY
>> > + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
>> > + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
>USE
>> > + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
>> > + */
>> > +
>> > +#include <stdbool.h>
>> > +#include <string.h>
>> > +
>> > +#include <rte_malloc.h>
>> > +
>> > +#include <rte_ether.h>
>> > +#include <rte_ip.h>
>> > +#include <rte_tcp.h>
>> > +
>> > +#include "gso_common.h"
>> > +
>> > +static inline void
>> > +hdr_segment_init(struct rte_mbuf *hdr_segment, struct rte_mbuf *pkt,
>> > +		uint16_t pkt_hdr_offset)
>> > +{
>> > +	/* copy mbuf metadata */
>> > +	hdr_segment->nb_segs = 1;
>> > +	hdr_segment->port = pkt->port;
>> > +	hdr_segment->ol_flags = pkt->ol_flags;
>> > +	hdr_segment->packet_type = pkt->packet_type;
>> > +	hdr_segment->pkt_len = pkt_hdr_offset;
>> > +	hdr_segment->data_len = pkt_hdr_offset;
>> > +	hdr_segment->tx_offload = pkt->tx_offload;
>> > +	/* copy packet header */
>> > +	rte_memcpy(rte_pktmbuf_mtod(hdr_segment, char *),
>> > +			rte_pktmbuf_mtod(pkt, char *),
>> > +			pkt_hdr_offset);
>> > +}
>> > +
>> > +static inline void
>> > +free_gso_segment(struct rte_mbuf **pkts, uint16_t nb_pkts)
>> > +{
>> > +	uint16_t i;
>> > +
>> > +	for (i = 0; i < nb_pkts; i++) {
>> > +		rte_pktmbuf_detach(pkts[i]->next);
>>
>> I don't think you need to call detach() here explicitly.
>> Just rte_pktmbuf_free(pkts[i]) should do I think.
>
>Yes, rte_pktmbuf_free() is enough. I will modify it. Thanks.
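The simplified helper would then look roughly like this (a sketch of the agreed change,
not the final code):

	static inline void
	free_gso_segment(struct rte_mbuf **pkts, uint16_t nb_pkts)
	{
		uint16_t i;

		/* rte_pktmbuf_free() walks the whole chain, so the indirect
		 * payload mbuf is detached and freed together with the header. */
		for (i = 0; i < nb_pkts; i++) {
			rte_pktmbuf_free(pkts[i]);
			pkts[i] = NULL;
		}
	}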
>
>>
>> > +		rte_pktmbuf_free(pkts[i]);
>> > +		pkts[i] = NULL;
>> > +	}
>> > +}
>> > +
>> > +int
>> > +gso_do_segment(struct rte_mbuf *pkt,
>> > +		uint16_t pkt_hdr_offset,
>> > +		uint16_t pyld_unit_size,
>> > +		struct rte_mempool *direct_pool,
>> > +		struct rte_mempool *indirect_pool,
>> > +		struct rte_mbuf **pkts_out,
>> > +		uint16_t nb_pkts_out)
>> > +{
>> > +	struct rte_mbuf *pkt_in;
>> > +	struct rte_mbuf *hdr_segment, *pyld_segment;
>> > +	uint32_t pkt_in_pyld_off;
>> > +	uint16_t pkt_in_segment_len, pkt_out_segment_len;
>> > +	uint16_t nb_segs;
>> > +	bool pkt_in_segment_processed;
>> > +
>> > +	pkt_in_pyld_off = pkt->data_off + pkt_hdr_offset;
>> > +	pkt_in = pkt;
>> > +	nb_segs = 0;
>> > +
>> > +	while (pkt_in) {
>> > +		pkt_in_segment_processed = false;
>> > +		pkt_in_segment_len = pkt_in->data_off + pkt_in->data_len;
>> > +
>> > +		while (!pkt_in_segment_processed) {
>> > +			if (unlikely(nb_segs >= nb_pkts_out)) {
>> > +				free_gso_segment(pkts_out, nb_segs);
>> > +				return -EINVAL;
>> > +			}
>> > +
>> > +			/* allocate direct mbuf */
>> > +			hdr_segment = rte_pktmbuf_alloc(direct_pool);
>> > +			if (unlikely(hdr_segment == NULL)) {
>> > +				free_gso_segment(pkts_out, nb_segs);
>> > +				return -ENOMEM;
>> > +			}
>> > +
>> > +			/* allocate indirect mbuf */
>> > +			pyld_segment = rte_pktmbuf_alloc(indirect_pool);
>> > +			if (unlikely(pyld_segment == NULL)) {
>> > +				rte_pktmbuf_free(hdr_segment);
>> > +				free_gso_segment(pkts_out, nb_segs);
>> > +				return -ENOMEM;
>> > +			}
>>
>> So, if I understand correctly each new packet would always contain just one
>data segment?
>> Why several data segments couldn't be chained together (if sum of their
>data_len <= mss)?
>> In a same way as done here:
>>
>http://dpdk.org/browse/dpdk/tree/lib/librte_ip_frag/rte_ipv4_fragmentation.c#n
>93
>> or here:
>>
>https://gerrit.fd.io/r/gitweb?p=tldk.git;a=blob;f=lib/libtle_l4p/tcp_tx_seg.h;
>h=a8d2425597a7ad6f598aa4bb7fcd7f1da74305f0;hb=HEAD#l23
>> ?
>
>Oh, yes. I can chain these data segments when their total length is less than
>the GSO segsz.
>I will change it in the next patch. Thanks very much.
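A rough sketch of that approach, modelled on the fragmentation code linked above (the
variable names pyld_left, in_off and prev are illustrative, not from this patch, and
error handling on the allocation failure path is omitted):

	/* Chain several indirect mbufs behind one header mbuf until the
	 * payload budget for this GSO segment is used up. */
	while (pyld_left > 0 && pkt_in != NULL) {
		pyld_segment = rte_pktmbuf_alloc(indirect_pool);
		if (pyld_segment == NULL)
			return -ENOMEM;
		rte_pktmbuf_attach(pyld_segment, pkt_in);

		len = RTE_MIN(pyld_left, pkt_in->data_len - in_off);
		pyld_segment->data_off = pkt_in->data_off + in_off;
		pyld_segment->data_len = len;

		prev->next = pyld_segment;
		prev = pyld_segment;
		hdr_segment->nb_segs++;
		hdr_segment->pkt_len += len;

		pyld_left -= len;
		in_off += len;
		if (in_off == pkt_in->data_len) {
			/* move on to the next segment of the input packet */
			pkt_in = pkt_in->next;
			in_off = 0;
		}
	}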
>
>>
>> > +
>> > +			/* copy packet header */
>> > +			hdr_segment_init(hdr_segment, pkt, pkt_hdr_offset);
>> > +
>> > +			/* attach payload mbuf to current packet segment */
>> > +			rte_pktmbuf_attach(pyld_segment, pkt_in);
>> > +
>> > +			hdr_segment->next = pyld_segment;
>> > +			pkts_out[nb_segs++] = hdr_segment;
>> > +
>> > +			/* calculate payload length */
>> > +			pkt_out_segment_len = pyld_unit_size;
>> > +			if (pkt_in_pyld_off + pkt_out_segment_len >
>> > +					pkt_in_segment_len) {
>> > +				pkt_out_segment_len = pkt_in_segment_len -
>> > +					pkt_in_pyld_off;
>> > +			}
>> > +
>> > +			/* update payload segment */
>> > +			pyld_segment->data_off = pkt_in_pyld_off;
>> > +			pyld_segment->data_len = pkt_out_segment_len;
>> > +
>> > +			/* update header segment */
>> > +			hdr_segment->pkt_len += pyld_segment->data_len;
>> > +			hdr_segment->nb_segs++;
>> > +
>> > +			/* update pkt_in_pyld_off */
>> > +			pkt_in_pyld_off += pkt_out_segment_len;
>> > +			if (pkt_in_pyld_off == pkt_in_segment_len)
>> > +				pkt_in_segment_processed = true;
>> > +		}
>> > +
>> > +		/* 'pkt_in' may contain numerous segments */
>> > +		pkt_in = pkt_in->next;
>> > +		if (pkt_in != NULL)
>> > +			pkt_in_pyld_off = pkt_in->data_off;
>> > +	}
>> > +	return nb_segs;
>> > +}
>> > +
>> > +static inline void
>> > +parse_ipv4(struct ipv4_hdr *ipv4_hdr, struct rte_mbuf *pkt)
>> > +{
>> > +	struct tcp_hdr *tcp_hdr;
>> > +
>> > +	switch (ipv4_hdr->next_proto_id) {
>> > +	case IPPROTO_TCP:
>> > +		pkt->packet_type |= RTE_PTYPE_L4_TCP;
>> > +		pkt->l3_len = IPv4_HDR_LEN(ipv4_hdr);
>> > +		tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
>> > +		pkt->l4_len = TCP_HDR_LEN(tcp_hdr);
>> > +		break;
>> > +	}
>> > +}
>> > +
>> > +static inline void
>> > +parse_ethernet(struct ether_hdr *eth_hdr, struct rte_mbuf *pkt)
>> > +{
>> > +	struct ipv4_hdr *ipv4_hdr;
>> > +	struct vlan_hdr *vlan_hdr;
>> > +	uint16_t ethertype;
>> > +
>> > +	ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
>> > +	if (ethertype == ETHER_TYPE_VLAN) {
>> > +		vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
>> > +		pkt->l2_len = sizeof(struct vlan_hdr);
>> > +		pkt->packet_type |= RTE_PTYPE_L2_ETHER_VLAN;
>> > +		ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
>> > +	}
>> > +
>> > +	switch (ethertype) {
>> > +	case ETHER_TYPE_IPv4:
>> > +		if (IS_VLAN_PKT(pkt)) {
>> > +			pkt->packet_type |= RTE_PTYPE_L3_IPV4;
>> > +		} else {
>> > +			pkt->packet_type |= RTE_PTYPE_L2_ETHER;
>> > +			pkt->packet_type |= RTE_PTYPE_L3_IPV4;
>> > +		}
>> > +		pkt->l2_len += sizeof(struct ether_hdr);
>> > +		ipv4_hdr = (struct ipv4_hdr *) ((char *)eth_hdr +
>> > +				pkt->l2_len);
>> > +		parse_ipv4(ipv4_hdr, pkt);
>> > +		break;
>> > +	}
>> > +}
>> > +
>> > +void
>> > +gso_parse_packet(struct rte_mbuf *pkt)
>>
>> There is a function rte_net_get_ptype() that supposed to provide similar
>functionality.
>> So we probably don't need to create a new SW parse function here, instead
>would be better
>> to reuse (and update if needed) an existing one.
>> Again user already might have l2/l3/l4.../_len and packet_type setuped.
>> So better to keep SW packet parsing out of scope of that library.
>
>Hmm, I know we have discussed this design choice in the GRO library, and I
>also think it's
>better to reuse these values.
>
>But from the perspective of OVS, it may add extra overhead, since OVS doesn't
>parse every
>packet originally. Maybe @Mark can give us more inputs from the view of OVS.

Hi Jiayu, Konstantin

For GSO, the application needs to know:
- the packet type (as it only currently supports TCP/IPv4, VxLAN, GRE packets)
- the l2/3/4_lens, etc. (in order to replicate the original packet's headers across outgoing segments)

For this, we can use the rte_net_get_ptype function, as per Konstantin's suggestion, as it provides both - thanks Konstantin!

WRT the extra overhead in OvS: TSO is the de facto standard, and GSO is provided purely as a fallback option. As such, and since the additional packet parsing is a necessity in order to facilitate GSO, the additional overhead is IMO acceptable.
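For reference, a sketch of the pre-parsing an application could do with the existing
helper, so that the GSO code can rely on packet_type and the l2/l3/l4_len fields being
set (this is only an illustration of the call, not code from this patch set):

	#include <rte_net.h>

	struct rte_net_hdr_lens hdr_lens;

	/* parse L2/L3/L4 headers and record their lengths */
	pkt->packet_type = rte_net_get_ptype(pkt, &hdr_lens, RTE_PTYPE_ALL_MASK);
	pkt->l2_len = hdr_lens.l2_len;
	pkt->l3_len = hdr_lens.l3_len;
	pkt->l4_len = hdr_lens.l4_len;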

Thanks,
Mark

>
>>
>> > +{
>> > +	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
>> > +
>> > +	pkt->packet_type = pkt->tx_offload = 0;
>> > +	parse_ethernet(eth_hdr, pkt);
>> > +}
>> > +
>> > +static inline void
>> > +update_ipv4_header(char *base, uint16_t offset, uint16_t length, uint16_t
>id)
>> > +{
>> > +	struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)(base + offset);
>> > +
>> > +	ipv4_hdr->total_length = rte_cpu_to_be_16(length - offset);
>> > +	ipv4_hdr->packet_id = rte_cpu_to_be_16(id);
>> > +}
>> > +
>> > +static inline void
>> > +update_tcp_header(char *base, uint16_t offset, uint32_t sent_seq,
>> > +	uint8_t non_tail)
>> > +{
>> > +	struct tcp_hdr *tcp_hdr = (struct tcp_hdr *)(base + offset);
>> > +
>> > +	tcp_hdr->sent_seq = rte_cpu_to_be_32(sent_seq);
>> > +	/* clean FIN and PSH for non-tail segments */
>> > +	if (non_tail)
>> > +		tcp_hdr->tcp_flags &= (~(TCP_HDR_PSH_MASK | TCP_HDR_FIN_MASK));
>> > +}
>> > +
>> > +void
>> > +gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
>> > +		struct rte_mbuf **out_segments)
>> > +{
>> > +	struct ipv4_hdr *ipv4_hdr;
>> > +	struct tcp_hdr *tcp_hdr;
>> > +	struct rte_mbuf *seg;
>> > +	uint32_t sent_seq;
>> > +	uint16_t offset, i;
>> > +	uint16_t tail_seg_idx = nb_segments - 1, id;
>> > +
>> > +	switch (pkt->packet_type) {
>> > +	case ETHER_VLAN_IPv4_TCP_PKT:
>> > +	case ETHER_IPv4_TCP_PKT:
>>
>> Might be worth to put code below in a separate function:
>> update_inner_tcp_hdr(..) or so.
>> Then you can reuse it for tunneled cases too.
>
>Yes, I will modify it in the next patch. Thanks.
>
>>
>> > +		ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
>> > +				pkt->l2_len);
>> > +		tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
>> > +		id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
>> > +		sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
>> > +
>> > +		for (i = 0; i < nb_segments; i++) {
>> > +			seg = out_segments[i];
>> > +
>> > +			offset = seg->l2_len;
>> > +			update_ipv4_header(rte_pktmbuf_mtod(seg, char *),
>> > +					offset, seg->pkt_len, id);
>> > +			id++;
>>
>> Who would be responsible to make sure that we wouldn't have consecutive
>packets with the IPV4 id?
>> Would be the upper layer that forms the packet or gso library or ...?
>
>Oh yes. I ignored this important issue. I don't think applications can
>guarantee it.
>I will check the design of Linux and try to figure out a way. Thanks for the
>reminder.
>
>>
>> > +
>> > +			offset += seg->l3_len;
>> > +			update_tcp_header(rte_pktmbuf_mtod(seg, char *),
>> > +					offset, sent_seq, i < tail_seg_idx);
>> > +			sent_seq += seg->next->data_len;
>> > +		}
>> > +		break;
>> > +	}
>> > +}
>> > diff --git a/lib/librte_gso/gso_common.h b/lib/librte_gso/gso_common.h
>> > new file mode 100644
>> > index 0000000..d750041
>> > --- /dev/null
>> > +++ b/lib/librte_gso/gso_common.h
>> > @@ -0,0 +1,120 @@
>> > +/*-
>> > + *   BSD LICENSE
>> > + *
>> > + *   Copyright(c) 2017 Intel Corporation. All rights reserved.
>> > + *   All rights reserved.
>> > + *
>> > + *   Redistribution and use in source and binary forms, with or without
>> > + *   modification, are permitted provided that the following conditions
>> > + *   are met:
>> > + *
>> > + *     * Redistributions of source code must retain the above copyright
>> > + *       notice, this list of conditions and the following disclaimer.
>> > + *     * Redistributions in binary form must reproduce the above
>copyright
>> > + *       notice, this list of conditions and the following disclaimer in
>> > + *       the documentation and/or other materials provided with the
>> > + *       distribution.
>> > + *     * Neither the name of Intel Corporation nor the names of its
>> > + *       contributors may be used to endorse or promote products derived
>> > + *       from this software without specific prior written permission.
>> > + *
>> > + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
>> > + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
>> > + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
>FOR
>> > + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
>> > + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
>INCIDENTAL,
>> > + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
>> > + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
>USE,
>> > + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
>ANY
>> > + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
>> > + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
>USE
>> > + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
>> > + */
>> > +
>> > +#ifndef _GSO_COMMON_H_
>> > +#define _GSO_COMMON_H_
>> > +
>> > +#include <stdint.h>
>> > +#include <rte_mbuf.h>
>> > +
>> > +#define IPV4_HDR_DF_SHIFT 14
>> > +#define IPV4_HDR_DF_MASK (1 << IPV4_HDR_DF_SHIFT)
>> > +#define IPv4_HDR_LEN(iph) ((iph->version_ihl & 0x0f) * 4)
>> > +
>> > +#define TCP_HDR_PSH_MASK ((uint8_t)0x08)
>> > +#define TCP_HDR_FIN_MASK ((uint8_t)0x01)
>> > +#define TCP_HDR_LEN(tcph) ((tcph->data_off & 0xf0) >> 2)
>> > +
>> > +#define ETHER_IPv4_PKT (RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4)
>> > +/* Supported packet types */
>> > +/* TCP/IPv4 packet. */
>> > +#define ETHER_IPv4_TCP_PKT (ETHER_IPv4_PKT | RTE_PTYPE_L4_TCP)
>> > +
>> > +/* TCP/IPv4 packet with VLAN tag. */
>> > +#define ETHER_VLAN_IPv4_TCP_PKT (RTE_PTYPE_L2_ETHER_VLAN | \
>> > +		RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP)
>> > +
>> > +#define IS_VLAN_PKT(pkt) ((pkt->packet_type & RTE_PTYPE_L2_ETHER_VLAN) ==
>\
>> > +		RTE_PTYPE_L2_ETHER_VLAN)
>> > +
>> > +/**
>> > + * Internal function which parses a packet, setting outer_l2/l3_len and
>> > + * l2/l3/l4_len and packet_type.
>> > + *
>> > + * @param pkt
>> > + *  Packet to parse.
>> > + */
>> > +void gso_parse_packet(struct rte_mbuf *pkt);
>> > +
>> > +/**
>> > + * Internal function which updates relevant packet headers, following
>> > + * segmentation. This is required to update, for example, the IPv4
>> > + * 'total_length' field, to reflect the reduced length of the now-
>> > + * segmented packet.
>> > + *
>> > + * @param pkt
>> > + *  The original packet.
>> > + * @param nb_segments
>> > + *  The number of GSO segments into which pkt was split.
>> > + * @param out_segements
>> > + *  Pointer array used for storing mbuf addresses for GSO segments.
>> > + */
>> > +void gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
>> > +		struct rte_mbuf **out_segments);
>> > +
>> > +/**
>> > + * Internal function which divides the input packet into small segments.
>> > + * Each of the newly-created segments is organized as a two-segment mbuf,
>> > + * where the first segment is a standard mbuf, which stores a copy of
>> > + * packet header, and the second is an indirect mbuf which points to a
>> > + * section of data in the input packet.
>> > + *
>> > + * @param pkt
>> > + *  Packet to segment.
>> > + * @param pkt_hdr_offset
>> > + *  Packet header offset, measured in byte.
>> > + * @param pyld_unit_size
>> > + *  The max payload length of a GSO segment.
>> > + * @param direct_pool
>> > + *  MBUF pool used for allocating direct buffers for output segments.
>> > + * @param indirect_pool
>> > + *  MBUF pool used for allocating indirect buffers for output segments.
>> > + * @param pkts_out
>> > + *  Pointer array used to keep the mbuf addresses of output segments.
>> > + * @param nb_pkts_out
>> > + *  The max number of items that pkts_out can keep.
>> > + *
>> > + * @return
>> > + *  - The number of segments created in the event of success.
>> > + *  - If no GSO is performed, return 1.
>> > + *  - If available memory in mempools is insufficient, return -ENOMEM.
>> > + *  - -EINVAL for invalid parameters
>> > + */
>> > +int gso_do_segment(struct rte_mbuf *pkt,
>> > +		uint16_t pkt_hdr_offset,
>> > +		uint16_t pyld_unit_size,
>> > +		struct rte_mempool *direct_pool,
>> > +		struct rte_mempool *indirect_pool,
>> > +		struct rte_mbuf **pkts_out,
>> > +		uint16_t nb_pkts_out);
>> > +#endif
>> > diff --git a/lib/librte_gso/gso_tcp.c b/lib/librte_gso/gso_tcp.c
>> > new file mode 100644
>> > index 0000000..9d5fc30
>> > --- /dev/null
>> > +++ b/lib/librte_gso/gso_tcp.c
>> > @@ -0,0 +1,82 @@
>> > +/*-
>> > + *   BSD LICENSE
>> > + *
>> > + *   Copyright(c) 2017 Intel Corporation. All rights reserved.
>> > + *   All rights reserved.
>> > + *
>> > + *   Redistribution and use in source and binary forms, with or without
>> > + *   modification, are permitted provided that the following conditions
>> > + *   are met:
>> > + *
>> > + *     * Redistributions of source code must retain the above copyright
>> > + *       notice, this list of conditions and the following disclaimer.
>> > + *     * Redistributions in binary form must reproduce the above
>copyright
>> > + *       notice, this list of conditions and the following disclaimer in
>> > + *       the documentation and/or other materials provided with the
>> > + *       distribution.
>> > + *     * Neither the name of Intel Corporation nor the names of its
>> > + *       contributors may be used to endorse or promote products derived
>> > + *       from this software without specific prior written permission.
>> > + *
>> > + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
>> > + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
>> > + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
>FOR
>> > + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
>> > + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
>INCIDENTAL,
>> > + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
>> > + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
>USE,
>> > + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
>ANY
>> > + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
>> > + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
>USE
>> > + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
>> > + */
>> > +
>> > +
>> > +#include <rte_ether.h>
>> > +#include <rte_ip.h>
>> > +#include <rte_tcp.h>
>> > +
>> > +#include "gso_common.h"
>> > +#include "gso_tcp.h"
>> > +
>> > +int
>> > +gso_tcp4_segment(struct rte_mbuf *pkt,
>> > +		uint16_t gso_size,
>> > +		struct rte_mempool *direct_pool,
>> > +		struct rte_mempool *indirect_pool,
>> > +		struct rte_mbuf **pkts_out,
>> > +		uint16_t nb_pkts_out)
>> > +{
>> > +	struct ether_hdr *eth_hdr;
>> > +	struct ipv4_hdr *ipv4_hdr;
>> > +	uint16_t tcp_dl;
>> > +	uint16_t pyld_unit_size;
>> > +	uint16_t hdr_offset;
>> > +	int ret = 1;
>> > +
>> > +	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
>> > +	ipv4_hdr = (struct ipv4_hdr *)((char *)eth_hdr + pkt->l2_len);
>> > +
>> > +	/* don't process fragmented packet */
>> > +	if ((ipv4_hdr->fragment_offset &
>> > +				rte_cpu_to_be_16(IPV4_HDR_DF_MASK)) == 0)
>> > +		return ret;
>> > +
>> > +	tcp_dl = rte_be_to_cpu_16(ipv4_hdr->total_length) -
>> > +		pkt->l3_len - pkt->l4_len;
>> > +	/* don't process packet without data */
>> > +	if (tcp_dl == 0)
>> > +		return ret;
>> > +
>> > +	hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
>> > +	pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
>> > +
>> > +	/* segment the payload */
>> > +	ret = gso_do_segment(pkt, hdr_offset, pyld_unit_size, direct_pool,
>> > +			indirect_pool, pkts_out, nb_pkts_out);
>> > +
>> > +	if (ret > 1)
>> > +		gso_update_pkt_headers(pkt, ret, pkts_out);
>> > +
>> > +	return ret;
>> > +}
>> > diff --git a/lib/librte_gso/gso_tcp.h b/lib/librte_gso/gso_tcp.h
>> > new file mode 100644
>> > index 0000000..f291ccb
>> > --- /dev/null
>> > +++ b/lib/librte_gso/gso_tcp.h
>> > @@ -0,0 +1,73 @@
>> > +/*-
>> > + *   BSD LICENSE
>> > + *
>> > + *   Copyright(c) 2017 Intel Corporation. All rights reserved.
>> > + *   All rights reserved.
>> > + *
>> > + *   Redistribution and use in source and binary forms, with or without
>> > + *   modification, are permitted provided that the following conditions
>> > + *   are met:
>> > + *
>> > + *     * Redistributions of source code must retain the above copyright
>> > + *       notice, this list of conditions and the following disclaimer.
>> > + *     * Redistributions in binary form must reproduce the above
>copyright
>> > + *       notice, this list of conditions and the following disclaimer in
>> > + *       the documentation and/or other materials provided with the
>> > + *       distribution.
>> > + *     * Neither the name of Intel Corporation nor the names of its
>> > + *       contributors may be used to endorse or promote products derived
>> > + *       from this software without specific prior written permission.
>> > + *
>> > + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
>> > + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
>> > + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
>FOR
>> > + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
>> > + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
>INCIDENTAL,
>> > + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
>> > + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
>USE,
>> > + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
>ANY
>> > + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
>> > + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
>USE
>> > + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
>> > + */
>> > +
>> > +#ifndef _GSO_TCP_H_
>> > +#define _GSO_TCP_H_
>> > +
>> > +#include <stdint.h>
>> > +#include <rte_mbuf.h>
>> > +
>> > +/**
>> > + * Segment an IPv4/TCP packet. This function assumes the input packet has
>> > + * correct checksums and doesn't update checksums for GSO segment.
>> > + * Furthermore, it doesn't process IP fragment packets.
>> > + *
>> > + * @param pkt
>> > + *  The packet mbuf to segment.
>> > + * @param gso_size
>> > + *  The max length of a GSO segment, measured in bytes.
>> > + * @param direct_pool
>> > + *  MBUF pool used for allocating direct buffers for output segments.
>> > + * @param indirect_pool
>> > + *  MBUF pool used for allocating indirect buffers for output segments.
>> > + * @param pkts_out
>> > + *  Pointer array, which is used to store mbuf addresses of GSO segments.
>> > + *  Caller should guarantee that 'pkts_out' is sufficiently large to
>store
>> > + *  all GSO segments.
>> > + * @param nb_pkts_out
>> > + *  The max number of items that 'pkts_out' can keep.
>> > + *
>> > + * @return
>> > + *   - The number of GSO segments on success.
>> > + *   - Return 1 if no GSO is performed.
>> > + *   - Return -ENOMEM if available memory in mempools is insufficient.
>> > + *   - Return -EINVAL for invalid parameters.
>> > + */
>> > +int gso_tcp4_segment(struct rte_mbuf *pkt,
>> > +		uint16_t gso_size,
>> > +		struct rte_mempool *direct_pool,
>> > +		struct rte_mempool *indirect_pool,
>> > +		struct rte_mbuf **pkts_out,
>> > +		uint16_t nb_pkts_out);
>> > +
>> > +#endif
>> > diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
>> > index b81afce..fac95f2 100644
>> > --- a/lib/librte_gso/rte_gso.c
>> > +++ b/lib/librte_gso/rte_gso.c
>> > @@ -31,17 +31,57 @@
>> >   *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
>> >   */
>> >
>> > +#include <rte_log.h>
>> > +
>> >  #include "rte_gso.h"
>> > +#include "gso_common.h"
>> > +#include "gso_tcp.h"
>> >
>> >  int
>> >  rte_gso_segment(struct rte_mbuf *pkt,
>> >  		struct rte_gso_ctx gso_ctx,
>> >  		struct rte_mbuf **pkts_out,
>> > -		uint16_t nb_pkts_out __rte_unused)
>> > +		uint16_t nb_pkts_out)
>> >  {
>> > +	struct rte_mempool *direct_pool, *indirect_pool;
>> > +	struct rte_mbuf *pkt_seg;
>> > +	uint16_t nb_segments, gso_size;
>> > +
>> >  	if (pkt == NULL || pkts_out == NULL || gso_ctx.direct_pool ==
>> >  			NULL || gso_ctx.indirect_pool == NULL)
>> >  		return -EINVAL;
>>
>> Probably we don't need to check gso_ctx values for each incoming packet.
>> If you feel it is necessary - create  new function rte_gso_ctx_check() that
>> could be performed just once per ctx.
>
>Agree. I will change it. Thanks.
>
>>
>> >
>> > -	return 1;
>> > +	if ((gso_ctx.gso_types & RTE_GSO_TCP_IPV4) == 0 ||
>> > +			gso_ctx.gso_size >= pkt->pkt_len ||
>> > +			gso_ctx.gso_size == 0)
>>
>>
>> First and third condition seems redundant.
>
>The reason to check gso_ctx.gso_types here is that we don't perform
>GSO if applications don't set RTE_GSO_TCP_IPV4 to gso_ctx.gso_types,
>even the input packet is TCP/IPv4. And if gso_ctx.gso_size is 0,
>we don't need to execute the following codes. So we still need to
>remove these two conditions?
>
>>
>> > +		return 1;
>>
>>
>> I think you forgot here:
>> pkts_out[0] = pkt;
>
>But why should we keep the input packet in the output array? Currently, if
>GSO is not performed, no packets will be kept in pkts_out[]. Applications
>can know it by the return value 1 of rte_gso_segment().
>
>>
>>
>> > +
>> > +	pkt_seg = pkt;
>> > +	gso_size = gso_ctx.gso_size;
>> > +	direct_pool = gso_ctx.direct_pool;
>> > +	indirect_pool = gso_ctx.indirect_pool;
>> > +
>> > +	/* Parse packet headers to determine how to segment 'pkt' */
>> > +	gso_parse_packet(pkt);
>>
>>
>> I don't think we need to parse packet here.
>>  Instead assume that user already filled packet_type and l2/l3/..._len
>fields correctly.
>
>Hmm, I see it. Thanks.
>
>Thanks,
>Jiayu
  
Ananyev, Konstantin Aug. 30, 2017, 9:39 a.m. UTC | #5
Hi Mark,

> >> > +
> >> > +void
> >> > +gso_parse_packet(struct rte_mbuf *pkt)
> >>
> >> There is a function rte_net_get_ptype() that supposed to provide similar
> >functionality.
> >> So we probably don't need to create a new SW parse function here, instead
> >would be better
> >> to reuse (and update if needed) an existing one.
> >> Again user already might have l2/l3/l4.../_len and packet_type setuped.
> >> So better to keep SW packet parsing out of scope of that library.
> >
> >Hmm, I know we have discussed this design choice in the GRO library, and I
> >also think it's
> >better to reuse these values.
> >
> >But from the perspective of OVS, it may add extra overhead, since OVS doesn't
> >parse every
> >packet originally. Maybe @Mark can give us more inputs from the view of OVS.
> 
> Hi Jiayu, Konstantin
> 
> For GSO, the application needs to know:
> - the packet type (as it only currently supports TCP/IPv4, VxLAN, GRE packets)
> - the l2/3/4_lens, etc. (in order to replicate the original packet's headers across outgoing segments)
> 
> For this, we can use the rte_net_get_ptype function, as per Konstantin's suggestion, as it provides both - thanks Konstantin!
> 
> WRT the extra overhead in OvS: TSO is the defacto standard, and GSO is provided purely as a fallback option. As such, and since the
> additional packet parsing is a necessity in order to facilitate GSO, the additional overhead is IMO acceptable.

As I remember, for TSO in DPDK user still have to provide l2/l3/l4_len and mss information to the PMD.
So unless user knows these value straightway (user creates a packet himself) some packet processing will be unavailable anyway.
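For comparison, a sketch of the per-packet metadata an application already fills in for
hardware TSO (flag and field names as in rte_mbuf.h of this release; 'mss' is an
application-chosen value, and option-less IP/TCP headers are assumed):

	pkt->ol_flags |= PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
			PKT_TX_TCP_CKSUM | PKT_TX_TCP_SEG;
	pkt->l2_len = sizeof(struct ether_hdr);
	pkt->l3_len = sizeof(struct ipv4_hdr);	/* no IP options assumed */
	pkt->l4_len = sizeof(struct tcp_hdr);	/* no TCP options assumed */
	pkt->tso_segsz = mss;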
Konstantin
> 
> Thanks,
> Mark
>
  
Ananyev, Konstantin Aug. 30, 2017, 9:59 a.m. UTC | #6
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Ananyev, Konstantin
> Sent: Wednesday, August 30, 2017 10:39 AM
> To: Kavanagh, Mark B <mark.b.kavanagh@intel.com>; Hu, Jiayu <jiayu.hu@intel.com>
> Cc: dev@dpdk.org; Tan, Jianfeng <jianfeng.tan@intel.com>
> Subject: Re: [dpdk-dev] [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
> 
> Hi Mark,
> 
> > >> > +
> > >> > +void
> > >> > +gso_parse_packet(struct rte_mbuf *pkt)
> > >>
> > >> There is a function rte_net_get_ptype() that supposed to provide similar
> > >functionality.
> > >> So we probably don't need to create a new SW parse function here, instead
> > >would be better
> > >> to reuse (and update if needed) an existing one.
> > >> Again user already might have l2/l3/l4.../_len and packet_type setuped.
> > >> So better to keep SW packet parsing out of scope of that library.
> > >
> > >Hmm, I know we have discussed this design choice in the GRO library, and I
> > >also think it's
> > >better to reuse these values.
> > >
> > >But from the perspective of OVS, it may add extra overhead, since OVS doesn't
> > >parse every
> > >packet originally. Maybe @Mark can give us more inputs from the view of OVS.
> >
> > Hi Jiayu, Konstantin
> >
> > For GSO, the application needs to know:
> > - the packet type (as it only currently supports TCP/IPv4, VxLAN, GRE packets)
> > - the l2/3/4_lens, etc. (in order to replicate the original packet's headers across outgoing segments)
> >
> > For this, we can use the rte_net_get_ptype function, as per Konstantin's suggestion, as it provides both - thanks Konstantin!
> >
> > WRT the extra overhead in OvS: TSO is the defacto standard, and GSO is provided purely as a fallback option. As such, and since the
> > additional packet parsing is a necessity in order to facilitate GSO, the additional overhead is IMO acceptable.
> 
> As I remember, for TSO in DPDK user still have to provide l2/l3/l4_len and mss information to the PMD.
> So unless user knows these value straightway (user creates a packet himself) some packet processing will be unavailable anyway.
> Konstantin

s/unavailable/unavoidable/
sorry for bad typing.
Konstantin

> >
> > Thanks,
> > Mark
> >
  
Mark Kavanagh Aug. 30, 2017, 1:27 p.m. UTC | #7
>From: Ananyev, Konstantin
>Sent: Wednesday, August 30, 2017 10:59 AM
>To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Kavanagh, Mark B
><mark.b.kavanagh@intel.com>; Hu, Jiayu <jiayu.hu@intel.com>
>Cc: dev@dpdk.org; Tan, Jianfeng <jianfeng.tan@intel.com>
>Subject: RE: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
>
>
>
>> -----Original Message-----
>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Ananyev, Konstantin
>> Sent: Wednesday, August 30, 2017 10:39 AM
>> To: Kavanagh, Mark B <mark.b.kavanagh@intel.com>; Hu, Jiayu
><jiayu.hu@intel.com>
>> Cc: dev@dpdk.org; Tan, Jianfeng <jianfeng.tan@intel.com>
>> Subject: Re: [dpdk-dev] [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
>>
>> Hi Mark,
>>
>> > >> > +
>> > >> > +void
>> > >> > +gso_parse_packet(struct rte_mbuf *pkt)
>> > >>
>> > >> There is a function rte_net_get_ptype() that supposed to provide
>similar
>> > >functionality.
>> > >> So we probably don't need to create a new SW parse function here,
>instead
>> > >would be better
>> > >> to reuse (and update if needed) an existing one.
>> > >> Again user already might have l2/l3/l4.../_len and packet_type setuped.
>> > >> So better to keep SW packet parsing out of scope of that library.
>> > >
>> > >Hmm, I know we have discussed this design choice in the GRO library, and
>I
>> > >also think it's
>> > >better to reuse these values.
>> > >
>> > >But from the perspective of OVS, it may add extra overhead, since OVS
>doesn't
>> > >parse every
>> > >packet originally. Maybe @Mark can give us more inputs from the view of
>OVS.
>> >
>> > Hi Jiayu, Konstantin
>> >
>> > For GSO, the application needs to know:
>> > - the packet type (as it only currently supports TCP/IPv4, VxLAN, GRE
>packets)
>> > - the l2/3/4_lens, etc. (in order to replicate the original packet's
>headers across outgoing segments)
>> >
>> > For this, we can use the rte_net_get_ptype function, as per Konstantin's
>suggestion, as it provides both - thanks Konstantin!
>> >
>> > WRT the extra overhead in OvS: TSO is the defacto standard, and GSO is
>provided purely as a fallback option. As such, and since the
>> > additional packet parsing is a necessity in order to facilitate GSO, the
>additional overhead is IMO acceptable.
>>
>> As I remember, for TSO in DPDK user still have to provide l2/l3/l4_len and
>mss information to the PMD.

Yes, that's correct. 

>> So unless user knows these value straightway (user creates a packet himself)
>some packet processing will be unavailable anyway.

That's also correct. Currently, packets that originate in a VM and are marked for TSO have the l2_len, etc. fields populated by the 'parse_ethernet' function, which is called in the rte_vhost_dequeue_burst call stack, so that particular overhead is already implicit in the TSO case.

>> Konstantin
>
>s/unavailable/unavoidable/
>sorry for bad typing.
>Konstantin
>
>> >
>> > Thanks,
>> > Mark
>> >
  
Hu, Jiayu Sept. 4, 2017, 3:31 a.m. UTC | #8
Hi Konstantin,

About the IP identifier, I checked the Linux code and have some feedback inline.

On Wed, Aug 30, 2017 at 09:38:33AM +0800, Ananyev, Konstantin wrote:
> 
> 
> > -----Original Message-----
> > From: Hu, Jiayu
> > Sent: Thursday, August 24, 2017 3:16 PM
> > To: dev@dpdk.org
> > Cc: Kavanagh, Mark B <mark.b.kavanagh@intel.com>; Ananyev, Konstantin <konstantin.ananyev@intel.com>; Tan, Jianfeng
> > <jianfeng.tan@intel.com>; Hu, Jiayu <jiayu.hu@intel.com>
> > Subject: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
> > 
> > This patch adds GSO support for TCP/IPv4 packets. Supported packets
> > may include a single VLAN tag. TCP/IPv4 GSO assumes that all input
> > packets have correct checksums, and doesn't update checksums for output
> > packets (the responsibility for this lies with the application).
> > Additionally, TCP/IPv4 GSO doesn't process IP fragmented packets.
> > 
> > TCP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one indrect
> > MBUF, to organize an output packet. Note that we refer to these two
> > chained MBUFs as a two-segment MBUF. The direct MBUF stores the packet
> > header, while the indirect mbuf simply points to a location within the
> > original packet's payload. Consequently, use of the GSO library requires
> > multi-segment MBUF support in the TX functions of the NIC driver.
> > 
> > If a packet is GSOed, TCP/IPv4 GSO reduces its MBUF refcnt by 1. As a
> > result, when all of its GSOed segments are freed, the packet is freed
> > automatically.
> > 
> > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
> > ---
> > +void
> > +gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
> > +		struct rte_mbuf **out_segments)
> > +{
> > +	struct ipv4_hdr *ipv4_hdr;
> > +	struct tcp_hdr *tcp_hdr;
> > +	struct rte_mbuf *seg;
> > +	uint32_t sent_seq;
> > +	uint16_t offset, i;
> > +	uint16_t tail_seg_idx = nb_segments - 1, id;
> > +
> > +	switch (pkt->packet_type) {
> > +	case ETHER_VLAN_IPv4_TCP_PKT:
> > +	case ETHER_IPv4_TCP_PKT:
> 
> Might be worth to put code below in a separate function:
> update_inner_tcp_hdr(..) or so.
> Then you can reuse it for tunneled cases too.
> 
> > +		ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
> > +				pkt->l2_len);
> > +		tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
> > +		id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
> > +		sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
> > +
> > +		for (i = 0; i < nb_segments; i++) {
> > +			seg = out_segments[i];
> > +
> > +			offset = seg->l2_len;
> > +			update_ipv4_header(rte_pktmbuf_mtod(seg, char *),
> > +					offset, seg->pkt_len, id);
> > +			id++;
> 
> Who would be responsible to make sure that we wouldn't have consecutive packets with the IPV4 id?
> Would be the upper layer that forms the packet or gso library or ...?

Linux supports two kinds of IP identifier: fixed and incremental, and which one to use
depends on the upper protocol modules. Specifically, if the protocol module wants fixed
identifiers, it sets SKB_GSO_TCP_FIXEDID in skb->gso_type, and inet_gso_segment() then
keeps the identifiers the same. Otherwise, all segments get incremental identifiers. The
reason for this design is that some protocols may choose fixed IP identifiers, like TCP
(per RFC 791). This design also shows that Linux ignores the issue of repeated IP
identifiers.

From the perspective of DPDK, we need to solve two problems. One is whether to ignore the
issue of repeated IP identifiers. The other is whether the GSO library should provide an
interface that lets applications choose between fixed and incremental identifiers, or
simply always use incremental IP identifiers.

Do you have any suggestions?

Thanks,
Jiayu

> 
> > +
> > +			offset += seg->l3_len;
> > +			update_tcp_header(rte_pktmbuf_mtod(seg, char *),
> > +					offset, sent_seq, i < tail_seg_idx);
> > +			sent_seq += seg->next->data_len;
> > +		}
> > +		break;
> > +	}
> > +}
> > --
> > 2.7.4
  
Ananyev, Konstantin Sept. 4, 2017, 9:54 a.m. UTC | #9
Hi Jiayu,

> -----Original Message-----
> From: Hu, Jiayu
> Sent: Monday, September 4, 2017 4:32 AM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Cc: dev@dpdk.org; Kavanagh, Mark B <mark.b.kavanagh@intel.com>; Tan, Jianfeng <jianfeng.tan@intel.com>
> Subject: Re: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
> 
> Hi Konstantin,
> 
> About the IP identifier, I check the linux codes and have some feedbacks inline.
> 
> On Wed, Aug 30, 2017 at 09:38:33AM +0800, Ananyev, Konstantin wrote:
> >
> >
> > > -----Original Message-----
> > > From: Hu, Jiayu
> > > Sent: Thursday, August 24, 2017 3:16 PM
> > > To: dev@dpdk.org
> > > Cc: Kavanagh, Mark B <mark.b.kavanagh@intel.com>; Ananyev, Konstantin <konstantin.ananyev@intel.com>; Tan, Jianfeng
> > > <jianfeng.tan@intel.com>; Hu, Jiayu <jiayu.hu@intel.com>
> > > Subject: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
> > >
> > > This patch adds GSO support for TCP/IPv4 packets. Supported packets
> > > may include a single VLAN tag. TCP/IPv4 GSO assumes that all input
> > > packets have correct checksums, and doesn't update checksums for output
> > > packets (the responsibility for this lies with the application).
> > > Additionally, TCP/IPv4 GSO doesn't process IP fragmented packets.
> > >
> > > TCP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one indrect
> > > MBUF, to organize an output packet. Note that we refer to these two
> > > chained MBUFs as a two-segment MBUF. The direct MBUF stores the packet
> > > header, while the indirect mbuf simply points to a location within the
> > > original packet's payload. Consequently, use of the GSO library requires
> > > multi-segment MBUF support in the TX functions of the NIC driver.
> > >
> > > If a packet is GSOed, TCP/IPv4 GSO reduces its MBUF refcnt by 1. As a
> > > result, when all of its GSOed segments are freed, the packet is freed
> > > automatically.
> > >
> > > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > > Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
> > > ---
> > > +void
> > > +gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
> > > +		struct rte_mbuf **out_segments)
> > > +{
> > > +	struct ipv4_hdr *ipv4_hdr;
> > > +	struct tcp_hdr *tcp_hdr;
> > > +	struct rte_mbuf *seg;
> > > +	uint32_t sent_seq;
> > > +	uint16_t offset, i;
> > > +	uint16_t tail_seg_idx = nb_segments - 1, id;
> > > +
> > > +	switch (pkt->packet_type) {
> > > +	case ETHER_VLAN_IPv4_TCP_PKT:
> > > +	case ETHER_IPv4_TCP_PKT:
> >
> > Might be worth to put code below in a separate function:
> > update_inner_tcp_hdr(..) or so.
> > Then you can reuse it for tunneled cases too.
> >
> > > +		ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
> > > +				pkt->l2_len);
> > > +		tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
> > > +		id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
> > > +		sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
> > > +
> > > +		for (i = 0; i < nb_segments; i++) {
> > > +			seg = out_segments[i];
> > > +
> > > +			offset = seg->l2_len;
> > > +			update_ipv4_header(rte_pktmbuf_mtod(seg, char *),
> > > +					offset, seg->pkt_len, id);
> > > +			id++;
> >
> > Who would be responsible to make sure that we wouldn't have consecutive packets with the IPV4 id?
> > Would be the upper layer that forms the packet or gso library or ...?
> 
> Linux supports two kinds of IP identifier: fixed identifier and incremental identifier, and
> which one to use depends on upper protocol modules. Specifically, if the protocol module
> wants fixed identifiers, it will set SKB_GSO_TCP_FIXEDID to skb->gso_type, and then
> inet_gso_segment() will keep identifiers the same. Otherwise, all segments will have
> incremental identifiers. The reason for this design is that some protocols may choose fixed
> IP identifiers, like TCP (from RFC791). This design also shows that linux ignores the issue
> of repeated IP identifiers.
> 
> From the perspective of DPDK, we need to solve two problems. One is if ignore the issue of
> repeated IP identifiers. The other is if the GSO library provides an interface to upper
> applications to enable them to choose fixed or incremental identifiers, or simply uses
> incremental IP identifiers.
> 
> Do you have any suggestions?


Do the same as Linux?
I.e. add some flag RTE_GSO_IPID_FIXED (or so) into gso_ctx?
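A rough illustration of how such a flag could be consumed in the segmentation path
(RTE_GSO_IPID_FIXED, the 'flag' field and 'ipid_delta' are hypothetical names, not
existing API):

	uint16_t ipid_delta = (gso_ctx.flag & RTE_GSO_IPID_FIXED) ? 0 : 1;

	for (i = 0; i < nb_segments; i++) {
		seg = out_segments[i];
		update_ipv4_header(rte_pktmbuf_mtod(seg, char *),
				seg->l2_len, seg->pkt_len, id);
		id += ipid_delta;	/* fixed: reuse the ID; otherwise increment */
	}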
Konstantin

> 
> Thanks,
> Jiayu
> 
> >
> > > +
> > > +			offset += seg->l3_len;
> > > +			update_tcp_header(rte_pktmbuf_mtod(seg, char *),
> > > +					offset, sent_seq, i < tail_seg_idx);
> > > +			sent_seq += seg->next->data_len;
> > > +		}
> > > +		break;
> > > +	}
> > > +}
> > > --
> > > 2.7.4
  
Hu, Jiayu Sept. 5, 2017, 1:09 a.m. UTC | #10
Hi Konstantin,

> -----Original Message-----
> From: Ananyev, Konstantin
> Sent: Monday, September 4, 2017 5:55 PM
> To: Hu, Jiayu <jiayu.hu@intel.com>
> Cc: dev@dpdk.org; Kavanagh, Mark B <mark.b.kavanagh@intel.com>; Tan,
> Jianfeng <jianfeng.tan@intel.com>
> Subject: RE: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
> 
> Hi Jiayu,
> 
> > -----Original Message-----
> > From: Hu, Jiayu
> > Sent: Monday, September 4, 2017 4:32 AM
> > To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > Cc: dev@dpdk.org; Kavanagh, Mark B <mark.b.kavanagh@intel.com>; Tan,
> Jianfeng <jianfeng.tan@intel.com>
> > Subject: Re: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
> >
> > Hi Konstantin,
> >
> > About the IP identifier, I check the linux codes and have some feedbacks
> inline.
> >
> > On Wed, Aug 30, 2017 at 09:38:33AM +0800, Ananyev, Konstantin wrote:
> > >
> > >
> > > > -----Original Message-----
> > > > From: Hu, Jiayu
> > > > Sent: Thursday, August 24, 2017 3:16 PM
> > > > To: dev@dpdk.org
> > > > Cc: Kavanagh, Mark B <mark.b.kavanagh@intel.com>; Ananyev,
> Konstantin <konstantin.ananyev@intel.com>; Tan, Jianfeng
> > > > <jianfeng.tan@intel.com>; Hu, Jiayu <jiayu.hu@intel.com>
> > > > Subject: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
> > > >
> > > > This patch adds GSO support for TCP/IPv4 packets. Supported packets
> > > > may include a single VLAN tag. TCP/IPv4 GSO assumes that all input
> > > > packets have correct checksums, and doesn't update checksums for
> output
> > > > packets (the responsibility for this lies with the application).
> > > > Additionally, TCP/IPv4 GSO doesn't process IP fragmented packets.
> > > >
> > > > TCP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one
> indrect
> > > > MBUF, to organize an output packet. Note that we refer to these two
> > > > chained MBUFs as a two-segment MBUF. The direct MBUF stores the
> packet
> > > > header, while the indirect mbuf simply points to a location within the
> > > > original packet's payload. Consequently, use of the GSO library requires
> > > > multi-segment MBUF support in the TX functions of the NIC driver.
> > > >
> > > > If a packet is GSOed, TCP/IPv4 GSO reduces its MBUF refcnt by 1. As a
> > > > result, when all of its GSOed segments are freed, the packet is freed
> > > > automatically.
> > > >
> > > > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > > > Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
> > > > ---
> > > > +void
> > > > +gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
> > > > +		struct rte_mbuf **out_segments)
> > > > +{
> > > > +	struct ipv4_hdr *ipv4_hdr;
> > > > +	struct tcp_hdr *tcp_hdr;
> > > > +	struct rte_mbuf *seg;
> > > > +	uint32_t sent_seq;
> > > > +	uint16_t offset, i;
> > > > +	uint16_t tail_seg_idx = nb_segments - 1, id;
> > > > +
> > > > +	switch (pkt->packet_type) {
> > > > +	case ETHER_VLAN_IPv4_TCP_PKT:
> > > > +	case ETHER_IPv4_TCP_PKT:
> > >
> > > Might be worth to put code below in a separate function:
> > > update_inner_tcp_hdr(..) or so.
> > > Then you can reuse it for tunneled cases too.
> > >
> > > > +		ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
> > > > +				pkt->l2_len);
> > > > +		tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
> > > > +		id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
> > > > +		sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
> > > > +
> > > > +		for (i = 0; i < nb_segments; i++) {
> > > > +			seg = out_segments[i];
> > > > +
> > > > +			offset = seg->l2_len;
> > > > +			update_ipv4_header(rte_pktmbuf_mtod(seg, char *),
> > > > +					offset, seg->pkt_len, id);
> > > > +			id++;
> > >
> > > Who would be responsible to make sure that we wouldn't have
> consecutive packets with the IPV4 id?
> > > Would be the upper layer that forms the packet or gso library or ...?
> >
> > Linux supports two kinds of IP identifier: fixed identifier and
> > incremental identifier, and which one to use depends on upper protocol
> > modules. Specifically, if the protocol module wants fixed identifiers,
> > it will set SKB_GSO_TCP_FIXEDID in skb->gso_type, and then
> > inet_gso_segment() will keep identifiers the same. Otherwise, all
> > segments will have incremental identifiers. The reason for this design
> > is that some protocols may choose fixed IP identifiers, like TCP
> > (RFC 791). This design also shows that Linux ignores the issue of
> > repeated IP identifiers.
> >
> > From the perspective of DPDK, we need to solve two problems. One is
> > whether to ignore the issue of repeated IP identifiers. The other is
> > whether the GSO library should provide an interface for upper
> > applications to choose fixed or incremental identifiers, or simply use
> > incremental IP identifiers.
> >
> > Do you have any suggestions?
> 
> 
> Do the same as Linux?
> I.e. add some flag RTE_GSO_IPID_FIXED (or so) into gso_ctx?

OK, I see. We can do that.

In the GRO library, we currently require that IP identifiers be incremental. If we
enable fixed IP identifiers in GSO, it seems we also need to change the GRO library:
ignore the IP identifier when merging packets, and don't update the IP identifier
of the merged packet. What do you think?

Thanks,
Jiayu

> Konstantin
> 
> >
> > Thanks,
> > Jiayu
> >
> > >
> > > > +
> > > > +			offset += seg->l3_len;
> > > > +			update_tcp_header(rte_pktmbuf_mtod(seg, char *),
> > > > +					offset, sent_seq, i < tail_seg_idx);
> > > > +			sent_seq += seg->next->data_len;
> > > > +		}
> > > > +		break;
> > > > +	}
> > > > +}
> > > > --
> > > > 2.7.4
  
Ananyev, Konstantin Sept. 11, 2017, 1:04 p.m. UTC | #11
Hi Jiayu,

> > > Linux supports two kinds of IP identifier: fixed identifier and
> > > incremental identifier, and which one to use depends on upper protocol
> > > modules. Specifically, if the protocol module wants fixed identifiers,
> > > it will set SKB_GSO_TCP_FIXEDID in skb->gso_type, and then
> > > inet_gso_segment() will keep identifiers the same. Otherwise, all
> > > segments will have incremental identifiers. The reason for this design
> > > is that some protocols may choose fixed IP identifiers, like TCP
> > > (RFC 791). This design also shows that Linux ignores the issue of
> > > repeated IP identifiers.
> > >
> > > From the perspective of DPDK, we need to solve two problems. One is
> > > whether to ignore the issue of repeated IP identifiers. The other is
> > > whether the GSO library should provide an interface for upper
> > > applications to choose fixed or incremental identifiers, or simply use
> > > incremental IP identifiers.
> > >
> > > Do you have any suggestions?
> >
> >
> > Do the same as Linux?
> > I.e. add some flag RTE_GSO_IPID_FIXED (or so) into gso_ctx?
> 
> OK, I see. We can do that.
> 
> In the GRO library, we currently require that IP identifiers be incremental. If we
> enable fixed IP identifiers in GSO, it seems we also need to change the GRO library:
> ignore the IP identifier when merging packets, and don't update the IP identifier
> of the merged packet. What do you think?

I suppose we can, if there is a use-case for it.
Konstantin
>
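
As a side note on the fixed-vs-incremental identifier question above, a minimal
sketch of how such a choice could be expressed is shown below. The flag name
RTE_GSO_IPID_FIXED and the helper gso_segment_ipid() are assumptions taken from
this discussion, not part of the patch; the patch itself always increments the
identifier by one per output segment.

#include <stdint.h>

/*
 * Hypothetical flag, following the RTE_GSO_IPID_FIXED suggestion above;
 * it is not defined by this patch.
 */
#define RTE_GSO_IPID_FIXED (1ULL << 1)

/*
 * Sketch: pick the IPv4 identifier of the i-th output segment from the
 * identifier of the input packet. With the fixed-ID behaviour every segment
 * reuses the original identifier (as Linux does for SKB_GSO_TCP_FIXEDID);
 * otherwise identifiers increase by one per segment, which is what
 * gso_update_pkt_headers() in the patch below does today.
 */
static inline uint16_t
gso_segment_ipid(uint16_t ipid_in, uint16_t seg_idx, uint64_t gso_flags)
{
	if (gso_flags & RTE_GSO_IPID_FIXED)
		return ipid_in;
	return ipid_in + seg_idx;
}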
  

Patch

diff --git a/lib/librte_eal/common/include/rte_log.h b/lib/librte_eal/common/include/rte_log.h
index ec8dba7..2fa1199 100644
--- a/lib/librte_eal/common/include/rte_log.h
+++ b/lib/librte_eal/common/include/rte_log.h
@@ -87,6 +87,7 @@  extern struct rte_logs rte_logs;
 #define RTE_LOGTYPE_CRYPTODEV 17 /**< Log related to cryptodev. */
 #define RTE_LOGTYPE_EFD       18 /**< Log related to EFD. */
 #define RTE_LOGTYPE_EVENTDEV  19 /**< Log related to eventdev. */
+#define RTE_LOGTYPE_GSO       20 /**< Log related to GSO. */
 
 /* these log types can be used in an application */
 #define RTE_LOGTYPE_USER1     24 /**< User-defined log type 1. */
diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile
index aeaacbc..0f8e38f 100644
--- a/lib/librte_gso/Makefile
+++ b/lib/librte_gso/Makefile
@@ -42,6 +42,8 @@  LIBABIVER := 1
 
 #source files
 SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_common.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tcp.c
 
 # install this header file
 SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h
diff --git a/lib/librte_gso/gso_common.c b/lib/librte_gso/gso_common.c
new file mode 100644
index 0000000..2b54fbd
--- /dev/null
+++ b/lib/librte_gso/gso_common.c
@@ -0,0 +1,270 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdbool.h>
+#include <string.h>
+
+#include <rte_malloc.h>
+
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+
+#include "gso_common.h"
+
+static inline void
+hdr_segment_init(struct rte_mbuf *hdr_segment, struct rte_mbuf *pkt,
+		uint16_t pkt_hdr_offset)
+{
+	/* copy mbuf metadata */
+	hdr_segment->nb_segs = 1;
+	hdr_segment->port = pkt->port;
+	hdr_segment->ol_flags = pkt->ol_flags;
+	hdr_segment->packet_type = pkt->packet_type;
+	hdr_segment->pkt_len = pkt_hdr_offset;
+	hdr_segment->data_len = pkt_hdr_offset;
+	hdr_segment->tx_offload = pkt->tx_offload;
+	/* copy packet header */
+	rte_memcpy(rte_pktmbuf_mtod(hdr_segment, char *),
+			rte_pktmbuf_mtod(pkt, char *),
+			pkt_hdr_offset);
+}
+
+static inline void
+free_gso_segment(struct rte_mbuf **pkts, uint16_t nb_pkts)
+{
+	uint16_t i;
+
+	for (i = 0; i < nb_pkts; i++) {
+		rte_pktmbuf_detach(pkts[i]->next);
+		rte_pktmbuf_free(pkts[i]);
+		pkts[i] = NULL;
+	}
+}
+
+int
+gso_do_segment(struct rte_mbuf *pkt,
+		uint16_t pkt_hdr_offset,
+		uint16_t pyld_unit_size,
+		struct rte_mempool *direct_pool,
+		struct rte_mempool *indirect_pool,
+		struct rte_mbuf **pkts_out,
+		uint16_t nb_pkts_out)
+{
+	struct rte_mbuf *pkt_in;
+	struct rte_mbuf *hdr_segment, *pyld_segment;
+	uint32_t pkt_in_pyld_off;
+	uint16_t pkt_in_segment_len, pkt_out_segment_len;
+	uint16_t nb_segs;
+	bool pkt_in_segment_processed;
+
+	pkt_in_pyld_off = pkt->data_off + pkt_hdr_offset;
+	pkt_in = pkt;
+	nb_segs = 0;
+
+	while (pkt_in) {
+		pkt_in_segment_processed = false;
+		pkt_in_segment_len = pkt_in->data_off + pkt_in->data_len;
+
+		while (!pkt_in_segment_processed) {
+			if (unlikely(nb_segs >= nb_pkts_out)) {
+				free_gso_segment(pkts_out, nb_segs);
+				return -EINVAL;
+			}
+
+			/* allocate direct mbuf */
+			hdr_segment = rte_pktmbuf_alloc(direct_pool);
+			if (unlikely(hdr_segment == NULL)) {
+				free_gso_segment(pkts_out, nb_segs);
+				return -ENOMEM;
+			}
+
+			/* allocate indirect mbuf */
+			pyld_segment = rte_pktmbuf_alloc(indirect_pool);
+			if (unlikely(pyld_segment == NULL)) {
+				rte_pktmbuf_free(hdr_segment);
+				free_gso_segment(pkts_out, nb_segs);
+				return -ENOMEM;
+			}
+
+			/* copy packet header */
+			hdr_segment_init(hdr_segment, pkt, pkt_hdr_offset);
+
+			/* attach payload mbuf to current packet segment */
+			rte_pktmbuf_attach(pyld_segment, pkt_in);
+
+			hdr_segment->next = pyld_segment;
+			pkts_out[nb_segs++] = hdr_segment;
+
+			/* calculate payload length */
+			pkt_out_segment_len = pyld_unit_size;
+			if (pkt_in_pyld_off + pkt_out_segment_len >
+					pkt_in_segment_len) {
+				pkt_out_segment_len = pkt_in_segment_len -
+					pkt_in_pyld_off;
+			}
+
+			/* update payload segment */
+			pyld_segment->data_off = pkt_in_pyld_off;
+			pyld_segment->data_len = pkt_out_segment_len;
+
+			/* update header segment */
+			hdr_segment->pkt_len += pyld_segment->data_len;
+			hdr_segment->nb_segs++;
+
+			/* update pkt_in_pyld_off */
+			pkt_in_pyld_off += pkt_out_segment_len;
+			if (pkt_in_pyld_off == pkt_in_segment_len)
+				pkt_in_segment_processed = true;
+		}
+
+		/* 'pkt_in' may contain numerous segments */
+		pkt_in = pkt_in->next;
+		if (pkt_in != NULL)
+			pkt_in_pyld_off = pkt_in->data_off;
+	}
+	return nb_segs;
+}
+
+static inline void
+parse_ipv4(struct ipv4_hdr *ipv4_hdr, struct rte_mbuf *pkt)
+{
+	struct tcp_hdr *tcp_hdr;
+
+	switch (ipv4_hdr->next_proto_id) {
+	case IPPROTO_TCP:
+		pkt->packet_type |= RTE_PTYPE_L4_TCP;
+		pkt->l3_len = IPv4_HDR_LEN(ipv4_hdr);
+		tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
+		pkt->l4_len = TCP_HDR_LEN(tcp_hdr);
+		break;
+	}
+}
+
+static inline void
+parse_ethernet(struct ether_hdr *eth_hdr, struct rte_mbuf *pkt)
+{
+	struct ipv4_hdr *ipv4_hdr;
+	struct vlan_hdr *vlan_hdr;
+	uint16_t ethertype;
+
+	ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
+	if (ethertype == ETHER_TYPE_VLAN) {
+		vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
+		pkt->l2_len = sizeof(struct vlan_hdr);
+		pkt->packet_type |= RTE_PTYPE_L2_ETHER_VLAN;
+		ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
+	}
+
+	switch (ethertype) {
+	case ETHER_TYPE_IPv4:
+		if (IS_VLAN_PKT(pkt)) {
+			pkt->packet_type |= RTE_PTYPE_L3_IPV4;
+		} else {
+			pkt->packet_type |= RTE_PTYPE_L2_ETHER;
+			pkt->packet_type |= RTE_PTYPE_L3_IPV4;
+		}
+		pkt->l2_len += sizeof(struct ether_hdr);
+		ipv4_hdr = (struct ipv4_hdr *) ((char *)eth_hdr +
+				pkt->l2_len);
+		parse_ipv4(ipv4_hdr, pkt);
+		break;
+	}
+}
+
+void
+gso_parse_packet(struct rte_mbuf *pkt)
+{
+	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+
+	pkt->packet_type = pkt->tx_offload = 0;
+	parse_ethernet(eth_hdr, pkt);
+}
+
+static inline void
+update_ipv4_header(char *base, uint16_t offset, uint16_t length, uint16_t id)
+{
+	struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)(base + offset);
+
+	ipv4_hdr->total_length = rte_cpu_to_be_16(length - offset);
+	ipv4_hdr->packet_id = rte_cpu_to_be_16(id);
+}
+
+static inline void
+update_tcp_header(char *base, uint16_t offset, uint32_t sent_seq,
+	uint8_t non_tail)
+{
+	struct tcp_hdr *tcp_hdr = (struct tcp_hdr *)(base + offset);
+
+	tcp_hdr->sent_seq = rte_cpu_to_be_32(sent_seq);
+	/* clean FIN and PSH for non-tail segments */
+	if (non_tail)
+		tcp_hdr->tcp_flags &= (~(TCP_HDR_PSH_MASK | TCP_HDR_FIN_MASK));
+}
+
+void
+gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
+		struct rte_mbuf **out_segments)
+{
+	struct ipv4_hdr *ipv4_hdr;
+	struct tcp_hdr *tcp_hdr;
+	struct rte_mbuf *seg;
+	uint32_t sent_seq;
+	uint16_t offset, i;
+	uint16_t tail_seg_idx = nb_segments - 1, id;
+
+	switch (pkt->packet_type) {
+	case ETHER_VLAN_IPv4_TCP_PKT:
+	case ETHER_IPv4_TCP_PKT:
+		ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+				pkt->l2_len);
+		tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
+		id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
+		sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
+
+		for (i = 0; i < nb_segments; i++) {
+			seg = out_segments[i];
+
+			offset = seg->l2_len;
+			update_ipv4_header(rte_pktmbuf_mtod(seg, char *),
+					offset, seg->pkt_len, id);
+			id++;
+
+			offset += seg->l3_len;
+			update_tcp_header(rte_pktmbuf_mtod(seg, char *),
+					offset, sent_seq, i < tail_seg_idx);
+			sent_seq += seg->next->data_len;
+		}
+		break;
+	}
+}
diff --git a/lib/librte_gso/gso_common.h b/lib/librte_gso/gso_common.h
new file mode 100644
index 0000000..d750041
--- /dev/null
+++ b/lib/librte_gso/gso_common.h
@@ -0,0 +1,120 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_COMMON_H_
+#define _GSO_COMMON_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+#define IPV4_HDR_DF_SHIFT 14
+#define IPV4_HDR_DF_MASK (1 << IPV4_HDR_DF_SHIFT)
+#define IPv4_HDR_LEN(iph) ((iph->version_ihl & 0x0f) * 4)
+
+#define TCP_HDR_PSH_MASK ((uint8_t)0x08)
+#define TCP_HDR_FIN_MASK ((uint8_t)0x01)
+#define TCP_HDR_LEN(tcph) ((tcph->data_off & 0xf0) >> 2)
+
+#define ETHER_IPv4_PKT (RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4)
+/* Supported packet types */
+/* TCP/IPv4 packet. */
+#define ETHER_IPv4_TCP_PKT (ETHER_IPv4_PKT | RTE_PTYPE_L4_TCP)
+
+/* TCP/IPv4 packet with VLAN tag. */
+#define ETHER_VLAN_IPv4_TCP_PKT (RTE_PTYPE_L2_ETHER_VLAN | \
+		RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP)
+
+#define IS_VLAN_PKT(pkt) ((pkt->packet_type & RTE_PTYPE_L2_ETHER_VLAN) == \
+		RTE_PTYPE_L2_ETHER_VLAN)
+
+/**
+ * Internal function which parses a packet, setting outer_l2/l3_len and
+ * l2/l3/l4_len and packet_type.
+ *
+ * @param pkt
+ *  Packet to parse.
+ */
+void gso_parse_packet(struct rte_mbuf *pkt);
+
+/**
+ * Internal function which updates relevant packet headers, following
+ * segmentation. This is required to update, for example, the IPv4
+ * 'total_length' field, to reflect the reduced length of the now-
+ * segmented packet.
+ *
+ * @param pkt
+ *  The original packet.
+ * @param nb_segments
+ *  The number of GSO segments into which pkt was split.
+ * @param out_segments
+ *  Pointer array used for storing mbuf addresses for GSO segments.
+ */
+void gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
+		struct rte_mbuf **out_segments);
+
+/**
+ * Internal function which divides the input packet into small segments.
+ * Each of the newly-created segments is organized as a two-segment mbuf,
+ * where the first segment is a standard mbuf, which stores a copy of
+ * packet header, and the second is an indirect mbuf which points to a
+ * section of data in the input packet.
+ *
+ * @param pkt
+ *  Packet to segment.
+ * @param pkt_hdr_offset
+ *  Packet header offset, measured in bytes.
+ * @param pyld_unit_size
+ *  The max payload length of a GSO segment.
+ * @param direct_pool
+ *  MBUF pool used for allocating direct buffers for output segments.
+ * @param indirect_pool
+ *  MBUF pool used for allocating indirect buffers for output segments.
+ * @param pkts_out
+ *  Pointer array used to keep the mbuf addresses of output segments.
+ * @param nb_pkts_out
+ *  The max number of items that pkts_out can keep.
+ *
+ * @return
+ *  - The number of segments created in the event of success.
+ *  - If no GSO is performed, return 1.
+ *  - If available memory in mempools is insufficient, return -ENOMEM.
+ *  - -EINVAL for invalid parameters
+ */
+int gso_do_segment(struct rte_mbuf *pkt,
+		uint16_t pkt_hdr_offset,
+		uint16_t pyld_unit_size,
+		struct rte_mempool *direct_pool,
+		struct rte_mempool *indirect_pool,
+		struct rte_mbuf **pkts_out,
+		uint16_t nb_pkts_out);
+#endif
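
To make the two-segment layout described in the gso_do_segment() comment above
concrete, here is a small, hedged sanity-check sketch of what each entry of
pkts_out looks like after segmentation succeeds. The helper name is
illustrative and not part of the library; it relies only on standard rte_mbuf
fields.

#include <rte_mbuf.h>

/*
 * Illustrative check (not part of the library): every output packet built
 * by gso_do_segment() is a chain of exactly two mbufs. The first (direct)
 * mbuf holds a copy of the packet headers; the second (indirect) mbuf
 * points into the payload of the original packet.
 */
static inline int
gso_out_pkt_is_two_segment(const struct rte_mbuf *out_pkt,
		uint16_t pkt_hdr_offset)
{
	const struct rte_mbuf *hdr = out_pkt;
	const struct rte_mbuf *pyld = out_pkt->next;

	return hdr->nb_segs == 2 &&
		hdr->data_len == pkt_hdr_offset &&   /* header copy only */
		pyld != NULL &&
		RTE_MBUF_INDIRECT(pyld) &&           /* borrows original payload */
		hdr->pkt_len == (uint32_t)(hdr->data_len + pyld->data_len);
}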
diff --git a/lib/librte_gso/gso_tcp.c b/lib/librte_gso/gso_tcp.c
new file mode 100644
index 0000000..9d5fc30
--- /dev/null
+++ b/lib/librte_gso/gso_tcp.c
@@ -0,0 +1,82 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+
+#include "gso_common.h"
+#include "gso_tcp.h"
+
+int
+gso_tcp4_segment(struct rte_mbuf *pkt,
+		uint16_t gso_size,
+		struct rte_mempool *direct_pool,
+		struct rte_mempool *indirect_pool,
+		struct rte_mbuf **pkts_out,
+		uint16_t nb_pkts_out)
+{
+	struct ether_hdr *eth_hdr;
+	struct ipv4_hdr *ipv4_hdr;
+	uint16_t tcp_dl;
+	uint16_t pyld_unit_size;
+	uint16_t hdr_offset;
+	int ret = 1;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)((char *)eth_hdr + pkt->l2_len);
+
+	/* don't process fragmented packet */
+	if ((ipv4_hdr->fragment_offset &
+				rte_cpu_to_be_16(IPV4_HDR_DF_MASK)) == 0)
+		return ret;
+
+	tcp_dl = rte_be_to_cpu_16(ipv4_hdr->total_length) -
+		pkt->l3_len - pkt->l4_len;
+	/* don't process packet without data */
+	if (tcp_dl == 0)
+		return ret;
+
+	hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
+	pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
+
+	/* segment the payload */
+	ret = gso_do_segment(pkt, hdr_offset, pyld_unit_size, direct_pool,
+			indirect_pool, pkts_out, nb_pkts_out);
+
+	if (ret > 1)
+		gso_update_pkt_headers(pkt, ret, pkts_out);
+
+	return ret;
+}
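
For reference, a worked example of the pyld_unit_size computation in
gso_tcp4_segment() above, assuming an untagged TCP/IPv4 packet without IP or
TCP options and gso_size = 1518 (the maximum standard Ethernet frame length,
CRC included); the numbers are illustrative only.

/*
 *   hdr_offset     = l2_len + l3_len + l4_len = 14 + 20 + 20 = 54
 *   pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN
 *                  = 1518 - 54 - 4 = 1460
 *
 * i.e. each output segment carries at most 1460 bytes of TCP payload,
 * matching the conventional TCP MSS for a 1500-byte MTU.
 */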
diff --git a/lib/librte_gso/gso_tcp.h b/lib/librte_gso/gso_tcp.h
new file mode 100644
index 0000000..f291ccb
--- /dev/null
+++ b/lib/librte_gso/gso_tcp.h
@@ -0,0 +1,73 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_TCP_H_
+#define _GSO_TCP_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+/**
+ * Segment an IPv4/TCP packet. This function assumes the input packet has
+ * correct checksums, and doesn't update checksums for output GSO segments.
+ * Furthermore, it doesn't process IP fragmented packets.
+ *
+ * @param pkt
+ *  The packet mbuf to segment.
+ * @param gso_size
+ *  The max length of a GSO segment, measured in bytes.
+ * @param direct_pool
+ *  MBUF pool used for allocating direct buffers for output segments.
+ * @param indirect_pool
+ *  MBUF pool used for allocating indirect buffers for output segments.
+ * @param pkts_out
+ *  Pointer array, which is used to store mbuf addresses of GSO segments.
+ *  Caller should guarantee that 'pkts_out' is sufficiently large to store
+ *  all GSO segments.
+ * @param nb_pkts_out
+ *  The max number of items that 'pkts_out' can keep.
+ *
+ * @return
+ *   - The number of GSO segments on success.
+ *   - Return 1 if no GSO is performed.
+ *   - Return -ENOMEM if available memory in mempools is insufficient.
+ *   - Return -EINVAL for invalid parameters.
+ */
+int gso_tcp4_segment(struct rte_mbuf *pkt,
+		uint16_t gso_size,
+		struct rte_mempool *direct_pool,
+		struct rte_mempool *indirect_pool,
+		struct rte_mbuf **pkts_out,
+		uint16_t nb_pkts_out);
+
+#endif
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index b81afce..fac95f2 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -31,17 +31,57 @@ 
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <rte_log.h>
+
 #include "rte_gso.h"
+#include "gso_common.h"
+#include "gso_tcp.h"
 
 int
 rte_gso_segment(struct rte_mbuf *pkt,
 		struct rte_gso_ctx gso_ctx,
 		struct rte_mbuf **pkts_out,
-		uint16_t nb_pkts_out __rte_unused)
+		uint16_t nb_pkts_out)
 {
+	struct rte_mempool *direct_pool, *indirect_pool;
+	struct rte_mbuf *pkt_seg;
+	uint16_t nb_segments, gso_size;
+
 	if (pkt == NULL || pkts_out == NULL || gso_ctx.direct_pool ==
 			NULL || gso_ctx.indirect_pool == NULL)
 		return -EINVAL;
 
-	return 1;
+	if ((gso_ctx.gso_types & RTE_GSO_TCP_IPV4) == 0 ||
+			gso_ctx.gso_size >= pkt->pkt_len ||
+			gso_ctx.gso_size == 0)
+		return 1;
+
+	pkt_seg = pkt;
+	gso_size = gso_ctx.gso_size;
+	direct_pool = gso_ctx.direct_pool;
+	indirect_pool = gso_ctx.indirect_pool;
+
+	/* Parse packet headers to determine how to segment 'pkt' */
+	gso_parse_packet(pkt);
+
+	switch (pkt->packet_type) {
+	case ETHER_VLAN_IPv4_TCP_PKT:
+	case ETHER_IPv4_TCP_PKT:
+		nb_segments = gso_tcp4_segment(pkt, gso_size,
+				direct_pool, indirect_pool,
+				pkts_out, nb_pkts_out);
+		break;
+	default:
+		RTE_LOG(WARNING, GSO, "Unsupported packet type\n");
+		nb_segments = 1;
+	}
+
+	if (nb_segments > 1) {
+		while (pkt_seg) {
+			rte_mbuf_refcnt_update(pkt_seg, -1);
+			pkt_seg = pkt_seg->next;
+		}
+	}
+
+	return nb_segments;
 }
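
Below is a minimal, hedged usage sketch of the new API, segmenting one
oversized TCP/IPv4 mbuf and transmitting the result on port 0, queue 0. It
assumes the application has already created suitable direct and indirect
mempools, and that the rte_gso_ctx fields shown (direct_pool, indirect_pool,
gso_types, gso_size) are the ones consumed by rte_gso_segment() above; any
other context fields are left zeroed. Checksum preparation of the output
segments is left to the application.

#include <rte_ethdev.h>
#include <rte_mbuf.h>
#include <rte_gso.h>

#define MAX_GSO_SEGS 64

static void
tx_one_gso_pkt(struct rte_mbuf *pkt, struct rte_mempool *direct_pool,
		struct rte_mempool *indirect_pool)
{
	struct rte_mbuf *segs[MAX_GSO_SEGS];
	struct rte_gso_ctx ctx = {
		.direct_pool = direct_pool,
		.indirect_pool = indirect_pool,
		.gso_types = RTE_GSO_TCP_IPV4,
		.gso_size = 1518,	/* max length of each output segment, CRC included */
	};
	uint16_t sent;
	int nb;

	/* the library expects the input packet to carry valid checksums */
	nb = rte_gso_segment(pkt, ctx, segs, MAX_GSO_SEGS);
	if (nb < 0) {
		/* segmentation failed; the input packet is still owned by us */
		rte_pktmbuf_free(pkt);
		return;
	}
	if (nb == 1)
		segs[0] = pkt;	/* no GSO performed: send the packet as-is */

	/* the TX path must support multi-segment mbufs */
	sent = rte_eth_tx_burst(0, 0, segs, (uint16_t)nb);
	while (sent < (uint16_t)nb)
		rte_pktmbuf_free(segs[sent++]);
}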
diff --git a/lib/librte_gso/rte_gso.h b/lib/librte_gso/rte_gso.h
index 5a8389a..77853fa 100644
--- a/lib/librte_gso/rte_gso.h
+++ b/lib/librte_gso/rte_gso.h
@@ -46,6 +46,9 @@  extern "C" {
 #include <stdint.h>
 #include <rte_mbuf.h>
 
+#define RTE_GSO_TCP_IPV4 (1ULL << 0)
+/**< GSO flag for TCP/IPv4 packets (which may include a single VLAN tag) */
+
 /**
  * GSO context structure.
  */