[v2,17/17] net/hinic: optimize tx&rx performance

Message ID 8fa4210f9ba33fe2db2a66f0c16fd01b1c7a57f5.1569421287.git.cloud.wangxiaoyun@huawei.com (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit
Headers
Series Add advanced features for Huawei hinic pmd |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Wangxiaoyun (Cloud) Sept. 25, 2019, 2:30 p.m. UTC
  This patch optimizes packet receive performance
on the Arm platform.

Signed-off-by: Xiaoyun wang <cloud.wangxiaoyun@huawei.com>
---
 drivers/net/hinic/hinic_pmd_rx.c | 17 +++++++++++++++++
 drivers/net/hinic/hinic_pmd_rx.h | 11 +++++++++++
 2 files changed, 28 insertions(+)
  

Comments

Gavin Hu Sept. 27, 2019, 2:08 a.m. UTC | #1
Hi Xiaoyun,

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Xiaoyun wang
> Sent: Wednesday, September 25, 2019 10:31 PM
> To: ferruh.yigit@intel.com
> Cc: dev@dpdk.org; xuanziyang2@huawei.com; shahar.belkar@huawei.com;
> luoxianjun@huawei.com; tanya.brokhman@huawei.com;
> zhouguoyang@huawei.com; wulike1@huawei.com; Xiaoyun wang
> <cloud.wangxiaoyun@huawei.com>
> Subject: [dpdk-dev] [PATCH v2 17/17] net/hinic: optimize tx&rx
> performance
> 
> This patch optimizes receive packets performance
> in arm platform.
> 
> Signed-off-by: Xiaoyun wang <cloud.wangxiaoyun@huawei.com>
> ---
>  drivers/net/hinic/hinic_pmd_rx.c | 17 +++++++++++++++++
>  drivers/net/hinic/hinic_pmd_rx.h | 11 +++++++++++
>  2 files changed, 28 insertions(+)
> 
> diff --git a/drivers/net/hinic/hinic_pmd_rx.c
> b/drivers/net/hinic/hinic_pmd_rx.c
> index 37b4f5c..94071ee 100644
> --- a/drivers/net/hinic/hinic_pmd_rx.c
> +++ b/drivers/net/hinic/hinic_pmd_rx.c
> @@ -950,6 +950,19 @@ void hinic_rx_alloc_pkts(struct hinic_rxq *rxq)
>  	}
>  }
> 
> +#if defined(__ARM64_NEON__)
No NEON intrinsics are used here, so RTE_ARCH_ARM64 may be a better guard.
On the following line, __rte_always_inline is what DPDK commonly uses; the effect is the same.
/Gavin

> +static inline uint32_t __attribute__((always_inline))
> +hinic_read_cqe_status(uintptr_t addr)
> +{
> +	uint32_t val;
> +
> +	asm volatile("ldar %x[val], [%x[addr]]"
> +		: [val] "=r" (val)
> +		: [addr] "r" (addr));
> +	return val;
> +}
> +#endif
I understand your intention is for the read of the status to be observed before the subsequent reads.
This can be achieved with __atomic_load_n(...) using __ATOMIC_ACQUIRE semantics.
This C11-style approach works on all architectures, so you don't need to differentiate between them.
/Gavin
> +
>  u16 hinic_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, u16
> nb_pkts)
>  {
>  	struct rte_mbuf *rxm;
> @@ -972,7 +985,11 @@ u16 hinic_recv_pkts(void *rx_queue, struct
> rte_mbuf **rx_pkts, u16 nb_pkts)
>  	while (pkts < nb_pkts) {
>  		 /* 2. current ci is done */
>  		rx_cqe = &rxq->rx_cqe[sw_ci];
> +#if defined(__X86_64_SSE__)
>  		status = rx_cqe->status;
> +#elif defined(__ARM64_NEON__)
> +		status = hinic_read_cqe_status((uintptr_t)&rxq-
> >rx_cqe[sw_ci]);
> +#endif
>  		if (!HINIC_GET_RX_DONE_BE(status))
>  			break;
> 
> diff --git a/drivers/net/hinic/hinic_pmd_rx.h
> b/drivers/net/hinic/hinic_pmd_rx.h
> index fe2735b..fa27e91 100644
> --- a/drivers/net/hinic/hinic_pmd_rx.h
> +++ b/drivers/net/hinic/hinic_pmd_rx.h
> @@ -28,6 +28,7 @@ struct hinic_rq_ctrl {
>  	u32	ctrl_fmt;
>  };
> 
> +#if defined(__X86_64_SSE__)
>  struct hinic_rq_cqe {
>  	u32 status;
>  	u32 vlan_len;
> @@ -36,6 +37,16 @@ struct hinic_rq_cqe {
> 
>  	u32 rsvd[4];
>  };
> +#elif defined(__ARM64_NEON__)
> +struct hinic_rq_cqe {
> +	u32 status;
> +	u32 vlan_len;
> +	u32 offload_type;
> +	u32 rss_hash;
> +
> +	u32 rsvd[4];
> +} __rte_cache_aligned;
> +#endif
> 
>  struct hinic_rq_cqe_sect {
>  	struct hinic_sge	sge;
> --
> 1.8.3.1
  
Wangxiaoyun (Cloud) Sept. 30, 2019, 2:41 p.m. UTC | #2
Hi Gavin,
      Thanks for your comments.
	
+#if defined(__ARM64_NEON__)

No NEON intrinsics used, maybe RTE_ARCH_ARM64 is better.
In the following line __rte_always_inline is commonly used in DPDK, the effect is same.
/Gavin

This patch does not use NEON intrinsics, but the tx & rx paths use NEON intrinsics
for wqebb big-endian conversion on the Arm platform. To keep things consistent, we use the
__ARM64_NEON__ definition for all intrinsics-based optimizations on Arm.

I understand your intention is the reading of the status is observed before the following reads.
This can be fulfilled by __atomic_load_n(...) with __ATOMIC_ACQUIRE semantics.
This C11 way applies to all the arches, and you don't need the differentiation of arches.
/Gavin

Thanks, I have changed it to __atomic_load_n(...) with __ATOMIC_ACQUIRE semantics and sent a new patch (v3).

Best regards
Xiaoyun Wang
  

在 2019/9/27 10:08, Gavin Hu (Arm Technology China) 写道:
> Hi Xiaoyun,
> 	
>> -----Original Message-----
>> From: dev <dev-bounces@dpdk.org> On Behalf Of Xiaoyun wang
>> Sent: Wednesday, September 25, 2019 10:31 PM
>> To: ferruh.yigit@intel.com
>> Cc: dev@dpdk.org; xuanziyang2@huawei.com; shahar.belkar@huawei.com;
>> luoxianjun@huawei.com; tanya.brokhman@huawei.com;
>> zhouguoyang@huawei.com; wulike1@huawei.com; Xiaoyun wang
>> <cloud.wangxiaoyun@huawei.com>
>> Subject: [dpdk-dev] [PATCH v2 17/17] net/hinic: optimize tx&rx
>> performance
>>
>> This patch optimizes receive packets performance
>> in arm platform.
>>
>> Signed-off-by: Xiaoyun wang <cloud.wangxiaoyun@huawei.com>
>> ---
>>   drivers/net/hinic/hinic_pmd_rx.c | 17 +++++++++++++++++
>>   drivers/net/hinic/hinic_pmd_rx.h | 11 +++++++++++
>>   2 files changed, 28 insertions(+)
>>
>> diff --git a/drivers/net/hinic/hinic_pmd_rx.c
>> b/drivers/net/hinic/hinic_pmd_rx.c
>> index 37b4f5c..94071ee 100644
>> --- a/drivers/net/hinic/hinic_pmd_rx.c
>> +++ b/drivers/net/hinic/hinic_pmd_rx.c
>> @@ -950,6 +950,19 @@ void hinic_rx_alloc_pkts(struct hinic_rxq *rxq)
>>   	}
>>   }
>>
>> +#if defined(__ARM64_NEON__)
> No NEON intrinsics used, maybe RTE_ARCH_ARM64 is better.
> In the following line __rte_always_inline is commonly used in DPDK, the effect is same.
> /Gavin
>
>> +static inline uint32_t __attribute__((always_inline))
>> +hinic_read_cqe_status(uintptr_t addr)
>> +{
>> +	uint32_t val;
>> +
>> +	asm volatile("ldar %x[val], [%x[addr]]"
>> +		: [val] "=r" (val)
>> +		: [addr] "r" (addr));
>> +	return val;
>> +}
>> +#endif
> I understand your intention is the reading of the status is observed before the following reads.
> This can be fulfilled by __atomic_load_n(...) with __ATOMIC_ACQUIRE semantics.
> This C11 way applies to all the arches, and you don't need the differentiation of arches.
> /Gavin
>> +
>>   u16 hinic_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, u16
>> nb_pkts)
>>   {
>>   	struct rte_mbuf *rxm;
>> @@ -972,7 +985,11 @@ u16 hinic_recv_pkts(void *rx_queue, struct
>> rte_mbuf **rx_pkts, u16 nb_pkts)
>>   	while (pkts < nb_pkts) {
>>   		 /* 2. current ci is done */
>>   		rx_cqe = &rxq->rx_cqe[sw_ci];
>> +#if defined(__X86_64_SSE__)
>>   		status = rx_cqe->status;
>> +#elif defined(__ARM64_NEON__)
>> +		status = hinic_read_cqe_status((uintptr_t)&rxq-
>>> rx_cqe[sw_ci]);
>> +#endif
>>   		if (!HINIC_GET_RX_DONE_BE(status))
>>   			break;
>>
>> diff --git a/drivers/net/hinic/hinic_pmd_rx.h
>> b/drivers/net/hinic/hinic_pmd_rx.h
>> index fe2735b..fa27e91 100644
>> --- a/drivers/net/hinic/hinic_pmd_rx.h
>> +++ b/drivers/net/hinic/hinic_pmd_rx.h
>> @@ -28,6 +28,7 @@ struct hinic_rq_ctrl {
>>   	u32	ctrl_fmt;
>>   };
>>
>> +#if defined(__X86_64_SSE__)
>>   struct hinic_rq_cqe {
>>   	u32 status;
>>   	u32 vlan_len;
>> @@ -36,6 +37,16 @@ struct hinic_rq_cqe {
>>
>>   	u32 rsvd[4];
>>   };
>> +#elif defined(__ARM64_NEON__)
>> +struct hinic_rq_cqe {
>> +	u32 status;
>> +	u32 vlan_len;
>> +	u32 offload_type;
>> +	u32 rss_hash;
>> +
>> +	u32 rsvd[4];
>> +} __rte_cache_aligned;
>> +#endif
>>
>>   struct hinic_rq_cqe_sect {
>>   	struct hinic_sge	sge;
>> --
>> 1.8.3.1
>
  

Patch

diff --git a/drivers/net/hinic/hinic_pmd_rx.c b/drivers/net/hinic/hinic_pmd_rx.c
index 37b4f5c..94071ee 100644
--- a/drivers/net/hinic/hinic_pmd_rx.c
+++ b/drivers/net/hinic/hinic_pmd_rx.c
@@ -950,6 +950,19 @@  void hinic_rx_alloc_pkts(struct hinic_rxq *rxq)
 	}
 }
 
+#if defined(__ARM64_NEON__)
+static inline uint32_t __attribute__((always_inline))
+hinic_read_cqe_status(uintptr_t addr)
+{
+	uint32_t val;
+
+	asm volatile("ldar %x[val], [%x[addr]]"
+		: [val] "=r" (val)
+		: [addr] "r" (addr));
+	return val;
+}
+#endif
+
 u16 hinic_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, u16 nb_pkts)
 {
 	struct rte_mbuf *rxm;
@@ -972,7 +985,11 @@  u16 hinic_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, u16 nb_pkts)
 	while (pkts < nb_pkts) {
 		 /* 2. current ci is done */
 		rx_cqe = &rxq->rx_cqe[sw_ci];
+#if defined(__X86_64_SSE__)
 		status = rx_cqe->status;
+#elif defined(__ARM64_NEON__)
+		status = hinic_read_cqe_status((uintptr_t)&rxq->rx_cqe[sw_ci]);
+#endif
 		if (!HINIC_GET_RX_DONE_BE(status))
 			break;
 
diff --git a/drivers/net/hinic/hinic_pmd_rx.h b/drivers/net/hinic/hinic_pmd_rx.h
index fe2735b..fa27e91 100644
--- a/drivers/net/hinic/hinic_pmd_rx.h
+++ b/drivers/net/hinic/hinic_pmd_rx.h
@@ -28,6 +28,7 @@  struct hinic_rq_ctrl {
 	u32	ctrl_fmt;
 };
 
+#if defined(__X86_64_SSE__)
 struct hinic_rq_cqe {
 	u32 status;
 	u32 vlan_len;
@@ -36,6 +37,16 @@  struct hinic_rq_cqe {
 
 	u32 rsvd[4];
 };
+#elif defined(__ARM64_NEON__)
+struct hinic_rq_cqe {
+	u32 status;
+	u32 vlan_len;
+	u32 offload_type;
+	u32 rss_hash;
+
+	u32 rsvd[4];
+} __rte_cache_aligned;
+#endif
 
 struct hinic_rq_cqe_sect {
 	struct hinic_sge	sge;