[RFC] net/ixgbe: fix Tx descriptor status api

Message ID 20180625142057.6296-1-olivier.matz@6wind.com (mailing list archive)
State RFC, archived
Delegated to: Qi Zhang
Headers
Series [RFC] net/ixgbe: fix Tx descriptor status api |

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation success Compilation OK

Commit Message

Olivier Matz June 25, 2018, 2:20 p.m. UTC
  The Tx descriptor status api was not behaving as expected. This API is
used to inspect the content of the descriptors in the Tx ring to
determine the length of the Tx queue.

Since the software advances the tail pointer and the hardware advances
the head pointer, the Tx queue is located before txq->tx_tail in the
ring. Therefore, a call to rte_eth_tx_descriptor_status(..., offset=20)
should inspect the 20th descriptor before the tail, not after.

As before, we still need to take care about only checking descriptors
that have the RS bit.

Additionally, we can avoid an access to the ring if offset is greater or
equal to nb_tx_desc - nb_tx_free.

Fixes: a2919e13d95e ("net/ixgbe: implement descriptor status API")
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Signed-off-by: Didier Pallard <didier.pallard@6wind.com>
---

Hi Wei, Hi Qi,

We also recently found some issues in Tx descriptor status API for ixgbe,
i40, e1000, igb. I'm preparing a clean patchset for all of them.

Here is already the patch for ixgbe, please let me know what you think.

The API comment of rte_eth_tx_descriptor_status() is incorrect and should
be fixed too. The reference descriptor (when offset = 0) is not where the
next packet will be sent, but where the latest packet has been enqueued.

Regards,
Olivier



 drivers/net/ixgbe/ixgbe_rxtx.c | 45 +++++++++++++++++++++++++++++++-----------
 drivers/net/ixgbe/ixgbe_rxtx.h |  1 +
 2 files changed, 34 insertions(+), 12 deletions(-)
  

Comments

Olivier Matz June 25, 2018, 2:30 p.m. UTC | #1
Removing wrong mails from the To and Cc. Sorry.

On Mon, Jun 25, 2018 at 04:20:57PM +0200, Olivier Matz wrote:
> The Tx descriptor status api was not behaving as expected. This API is
> used to inspect the content of the descriptors in the Tx ring to
> determine the length of the Tx queue.
> 
> Since the software advances the tail pointer and the hardware advances
> the head pointer, the Tx queue is located before txq->tx_tail in the
> ring. Therefore, a call to rte_eth_tx_descriptor_status(..., offset=20)
> should inspect the 20th descriptor before the tail, not after.
> 
> As before, we still need to take care about only checking descriptors
> that have the RS bit.
> 
> Additionally, we can avoid an access to the ring if offset is greater or
> equal to nb_tx_desc - nb_tx_free.
> 
> Fixes: a2919e13d95e ("net/ixgbe: implement descriptor status API")
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> Signed-off-by: Didier Pallard <didier.pallard@6wind.com>
> ---
> 
> Hi Wei, Hi Qi,
> 
> We also recently found some issues in Tx descriptor status API for ixgbe,
> i40, e1000, igb. I'm preparing a clean patchset for all of them.
> 
> Here is already the patch for ixgbe, please let me know what you think.
> 
> The API comment of rte_eth_tx_descriptor_status() is incorrect and should
> be fixed too. The reference descriptor (when offset = 0) is not where the
> next packet will be sent, but where the latest packet has been enqueued.
> 
> Regards,
> Olivier
> 
> 
> 
>  drivers/net/ixgbe/ixgbe_rxtx.c | 45 +++++++++++++++++++++++++++++++-----------
>  drivers/net/ixgbe/ixgbe_rxtx.h |  1 +
>  2 files changed, 34 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
> index 3e13d26ae..384587cc6 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx.c
> +++ b/drivers/net/ixgbe/ixgbe_rxtx.c
> @@ -2606,10 +2606,15 @@ ixgbe_dev_tx_queue_setup(struct rte_eth_dev *dev,
>  	    hw->mac.type == ixgbe_mac_X540_vf ||
>  	    hw->mac.type == ixgbe_mac_X550_vf ||
>  	    hw->mac.type == ixgbe_mac_X550EM_x_vf ||
> -	    hw->mac.type == ixgbe_mac_X550EM_a_vf)
> +	    hw->mac.type == ixgbe_mac_X550EM_a_vf) {
>  		txq->tdt_reg_addr = IXGBE_PCI_REG_ADDR(hw, IXGBE_VFTDT(queue_idx));
> -	else
> +		txq->tdh_reg_addr = IXGBE_PCI_REG_ADDR(hw,
> +				IXGBE_VFTDH(queue_idx));
> +	} else {
>  		txq->tdt_reg_addr = IXGBE_PCI_REG_ADDR(hw, IXGBE_TDT(txq->reg_idx));
> +		txq->tdh_reg_addr = IXGBE_PCI_REG_ADDR(hw,
> +				IXGBE_TDH(txq->reg_idx));
> +	}
>  
>  	txq->tx_ring_phys_addr = tz->iova;
>  	txq->tx_ring = (union ixgbe_adv_tx_desc *) tz->addr;
> @@ -3140,22 +3145,38 @@ ixgbe_dev_tx_descriptor_status(void *tx_queue, uint16_t offset)
>  {
>  	struct ixgbe_tx_queue *txq = tx_queue;
>  	volatile uint32_t *status;
> -	uint32_t desc;
> +	int32_t desc, dd;
>  
>  	if (unlikely(offset >= txq->nb_tx_desc))
>  		return -EINVAL;
> +	if (offset >= txq->nb_tx_desc - txq->nb_tx_free)
> +		return RTE_ETH_TX_DESC_DONE;
> +
> +	desc = txq->tx_tail - offset - 1;
> +	if (desc < 0)
> +		desc += txq->nb_tx_desc;
>  
> -	desc = txq->tx_tail + offset;
> -	/* go to next desc that has the RS bit */
> -	desc = ((desc + txq->tx_rs_thresh - 1) / txq->tx_rs_thresh) *
> -		txq->tx_rs_thresh;
> -	if (desc >= txq->nb_tx_desc) {
> -		desc -= txq->nb_tx_desc;
> -		if (desc >= txq->nb_tx_desc)
> -			desc -= txq->nb_tx_desc;
> +	/* offset is too small, no other way than reading PCI reg */
> +	if (unlikely(offset < txq->tx_rs_thresh)) {
> +		int16_t tx_head, queue_size;
> +		tx_head = ixgbe_read_addr(txq->tdh_reg_addr);
> +		queue_size = txq->tx_tail - tx_head;
> +		if (queue_size < 0)
> +			queue_size += txq->nb_tx_desc;
> +		return queue_size > offset ? RTE_ETH_TX_DESC_FULL :
> +			RTE_ETH_TX_DESC_DONE;
>  	}
>  
> -	status = &txq->tx_ring[desc].wb.status;
> +	/* index of the dd bit to look at */
> +	dd = (desc / txq->tx_rs_thresh + 1) * txq->tx_rs_thresh - 1;
> +
> +	/* In full featured mode, RS bit is only set in the last descriptor */
> +	/* of a multisegments packet */
> +	if (!((txq->offloads == 0) &&
> +	      (txq->tx_rs_thresh >= RTE_PMD_IXGBE_TX_MAX_BURST)))
> +		dd = txq->sw_ring[dd].last_id;
> +
> +	status = &txq->tx_ring[dd].wb.status;
>  	if (*status & rte_cpu_to_le_32(IXGBE_ADVTXD_STAT_DD))
>  		return RTE_ETH_TX_DESC_DONE;
>  
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx.h b/drivers/net/ixgbe/ixgbe_rxtx.h
> index 39378f754..384f6324d 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx.h
> +++ b/drivers/net/ixgbe/ixgbe_rxtx.h
> @@ -201,6 +201,7 @@ struct ixgbe_tx_queue {
>  		struct ixgbe_tx_entry_v *sw_ring_v; /**< address of SW ring for vector PMD */
>  	};
>  	volatile uint32_t   *tdt_reg_addr; /**< Address of TDT register. */
> +	volatile uint32_t   *tdh_reg_addr; /**< Address of TDH register. */
>  	uint16_t            nb_tx_desc;    /**< number of TX descriptors. */
>  	uint16_t            tx_tail;       /**< current value of TDT reg. */
>  	/**< Start freeing TX buffers if there are less free descriptors than
> -- 
> 2.11.0
>
  
Zhao1, Wei June 26, 2018, 1:38 a.m. UTC | #2
Hi,  Olivier Matz

 Will you commit fix patch for i40e and ixgbe and em?
And the code " dd = (desc / txq->tx_rs_thresh + 1) * txq->tx_rs_thresh - 1;"
Is only proper for tx function ixgbe_xmit_pkts_simple  and ixgbe_xmit_pkts_vec ().
But not proper for ixgbe_xmit_pkts (), the RS bit set rule is different from all these two. 


> -----Original Message-----
> From: Olivier Matz [mailto:olivier.matz@6wind.com]
> Sent: Monday, June 25, 2018 10:21 PM
> To: Zhao1@6wind.com; Zhao1, Wei <wei.zhao1@intel.com>;
> Zhang@6wind.com; Zhang, Qi Z <qi.z.zhang@intel.com>
> Cc: dev@dpdk.org; Lu@6wind.com; Lu, Wenzhuo <wenzhuo.lu@intel.com>
> Subject: [RFC] net/ixgbe: fix Tx descriptor status api
> 
> The Tx descriptor status api was not behaving as expected. This API is used to
> inspect the content of the descriptors in the Tx ring to determine the length
> of the Tx queue.
> 
> Since the software advances the tail pointer and the hardware advances the
> head pointer, the Tx queue is located before txq->tx_tail in the ring.
> Therefore, a call to rte_eth_tx_descriptor_status(..., offset=20) should
> inspect the 20th descriptor before the tail, not after.
> 
> As before, we still need to take care about only checking descriptors that
> have the RS bit.
> 
> Additionally, we can avoid an access to the ring if offset is greater or equal to
> nb_tx_desc - nb_tx_free.
> 
> Fixes: a2919e13d95e ("net/ixgbe: implement descriptor status API")
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> Signed-off-by: Didier Pallard <didier.pallard@6wind.com>
> ---
> 
> Hi Wei, Hi Qi,
> 
> We also recently found some issues in Tx descriptor status API for ixgbe, i40,
> e1000, igb. I'm preparing a clean patchset for all of them.
> 
> Here is already the patch for ixgbe, please let me know what you think.
> 
> The API comment of rte_eth_tx_descriptor_status() is incorrect and should
> be fixed too. The reference descriptor (when offset = 0) is not where the
> next packet will be sent, but where the latest packet has been enqueued.
> 
> Regards,
> Olivier
> 
> 
> 
>  drivers/net/ixgbe/ixgbe_rxtx.c | 45
> +++++++++++++++++++++++++++++++-----------
>  drivers/net/ixgbe/ixgbe_rxtx.h |  1 +
>  2 files changed, 34 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
> index 3e13d26ae..384587cc6 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx.c
> +++ b/drivers/net/ixgbe/ixgbe_rxtx.c
> @@ -2606,10 +2606,15 @@ ixgbe_dev_tx_queue_setup(struct rte_eth_dev
> *dev,
>  	    hw->mac.type == ixgbe_mac_X540_vf ||
>  	    hw->mac.type == ixgbe_mac_X550_vf ||
>  	    hw->mac.type == ixgbe_mac_X550EM_x_vf ||
> -	    hw->mac.type == ixgbe_mac_X550EM_a_vf)
> +	    hw->mac.type == ixgbe_mac_X550EM_a_vf) {
>  		txq->tdt_reg_addr = IXGBE_PCI_REG_ADDR(hw,
> IXGBE_VFTDT(queue_idx));
> -	else
> +		txq->tdh_reg_addr = IXGBE_PCI_REG_ADDR(hw,
> +				IXGBE_VFTDH(queue_idx));
> +	} else {
>  		txq->tdt_reg_addr = IXGBE_PCI_REG_ADDR(hw,
> IXGBE_TDT(txq->reg_idx));
> +		txq->tdh_reg_addr = IXGBE_PCI_REG_ADDR(hw,
> +				IXGBE_TDH(txq->reg_idx));
> +	}
> 
>  	txq->tx_ring_phys_addr = tz->iova;
>  	txq->tx_ring = (union ixgbe_adv_tx_desc *) tz->addr; @@ -3140,22
> +3145,38 @@ ixgbe_dev_tx_descriptor_status(void *tx_queue, uint16_t
> offset)  {
>  	struct ixgbe_tx_queue *txq = tx_queue;
>  	volatile uint32_t *status;
> -	uint32_t desc;
> +	int32_t desc, dd;
> 
>  	if (unlikely(offset >= txq->nb_tx_desc))
>  		return -EINVAL;
> +	if (offset >= txq->nb_tx_desc - txq->nb_tx_free)
> +		return RTE_ETH_TX_DESC_DONE;
> +
> +	desc = txq->tx_tail - offset - 1;
> +	if (desc < 0)
> +		desc += txq->nb_tx_desc;
> 
> -	desc = txq->tx_tail + offset;
> -	/* go to next desc that has the RS bit */
> -	desc = ((desc + txq->tx_rs_thresh - 1) / txq->tx_rs_thresh) *
> -		txq->tx_rs_thresh;
> -	if (desc >= txq->nb_tx_desc) {
> -		desc -= txq->nb_tx_desc;
> -		if (desc >= txq->nb_tx_desc)
> -			desc -= txq->nb_tx_desc;
> +	/* offset is too small, no other way than reading PCI reg */
> +	if (unlikely(offset < txq->tx_rs_thresh)) {
> +		int16_t tx_head, queue_size;
> +		tx_head = ixgbe_read_addr(txq->tdh_reg_addr);
> +		queue_size = txq->tx_tail - tx_head;
> +		if (queue_size < 0)
> +			queue_size += txq->nb_tx_desc;
> +		return queue_size > offset ? RTE_ETH_TX_DESC_FULL :
> +			RTE_ETH_TX_DESC_DONE;
>  	}
> 
> -	status = &txq->tx_ring[desc].wb.status;
> +	/* index of the dd bit to look at */
> +	dd = (desc / txq->tx_rs_thresh + 1) * txq->tx_rs_thresh - 1;
> +
> +	/* In full featured mode, RS bit is only set in the last descriptor */
> +	/* of a multisegments packet */
> +	if (!((txq->offloads == 0) &&
> +	      (txq->tx_rs_thresh >= RTE_PMD_IXGBE_TX_MAX_BURST)))
> +		dd = txq->sw_ring[dd].last_id;
> +
> +	status = &txq->tx_ring[dd].wb.status;
>  	if (*status & rte_cpu_to_le_32(IXGBE_ADVTXD_STAT_DD))
>  		return RTE_ETH_TX_DESC_DONE;
> 
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx.h b/drivers/net/ixgbe/ixgbe_rxtx.h
> index 39378f754..384f6324d 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx.h
> +++ b/drivers/net/ixgbe/ixgbe_rxtx.h
> @@ -201,6 +201,7 @@ struct ixgbe_tx_queue {
>  		struct ixgbe_tx_entry_v *sw_ring_v; /**< address of SW ring
> for vector PMD */
>  	};
>  	volatile uint32_t   *tdt_reg_addr; /**< Address of TDT register. */
> +	volatile uint32_t   *tdh_reg_addr; /**< Address of TDH register. */
>  	uint16_t            nb_tx_desc;    /**< number of TX descriptors. */
>  	uint16_t            tx_tail;       /**< current value of TDT reg. */
>  	/**< Start freeing TX buffers if there are less free descriptors than
> --
> 2.11.0
  
Olivier Matz June 26, 2018, 8:46 a.m. UTC | #3
Hi Wei,

On Tue, Jun 26, 2018 at 01:38:22AM +0000, Zhao1, Wei wrote:
> Hi,  Olivier Matz
> 
>  Will you commit fix patch for i40e and ixgbe and em?

If you think the patch are relevant, yes :)

Here is a pre-version (last 5 patches):
http://git.droids-corp.org/?p=dpdk.git;a=shortlog;h=refs/heads/tx-desc

It still need to fix checkpatch issues, few more tests, and
rebase on next-net.

> And the code " dd = (desc / txq->tx_rs_thresh + 1) * txq->tx_rs_thresh - 1;"
> Is only proper for tx function ixgbe_xmit_pkts_simple  and ixgbe_xmit_pkts_vec ().
> But not proper for ixgbe_xmit_pkts (), the RS bit set rule is different from all these two. 

Can you please give more detail please?

Note this code, maybe you are talking about this?

+       /* In full featured mode, RS bit is only set in the last descriptor */
+       /* of a multisegments packet */
+       if (!((txq->offloads == 0) &&
+             (txq->tx_rs_thresh >= RTE_PMD_IXGBE_TX_MAX_BURST)))
+               dd = txq->sw_ring[dd].last_id;

Maybe there is something better to test?

Just to ensure we are on the same line, here are some more infos.

===

- sw advances the tail pointer
- hw advances the head pointer
- the software populates the ring with full buffers to be sent by
  the hw
- head points to the in-progress descriptor.
- sw writes new descriptors at tail
- head == tail means that the transmit queue is empty
- when the hw has processed a descriptor, it sets the DD bit if
  the descriptor has the RS (report status) bit.
- the driver never reads the head (needs a pci transaction), instead
it monitors the DD bit of a descriptor that has the RS bit

txq->tx_tail: sw value for tail register
txq->tx_free_thresh: free buffers if count(free descs) < this value
txq->tx_rs_thresh: RS bit is set every rs_thresh descriptor
txq->tx_next_dd: next desc to scan for DD bit
txq->tx_next_rs: next desc to set RS bit
txq->last_desc_cleaned: last descriptor that have been cleaned
txq->nb_tx_free: number of free descriptors

Example:

|----------------------------------------------------------------|
|               D       R       R       R                        |
|        ............xxxxxxxxxxxxxxxxxxxxxxxxx                   |
|        <descs sent><- descs not sent yet  ->                   |
|        ............xxxxxxxxxxxxxxxxxxxxxxxxx                   |
|----------------------------------------------------------------|
        ^last_desc_cleaned=8                    ^next_rs=47
                ^next_dd=15                   ^sw_tail=45
                     ^hw_head=20

                     <----  nb_used  --------->

The hardware is currently processing the descriptor 20
'R' means the descriptor has the RS bit
'D' means the descriptor has the DD + RS bits
'x' are packets in txq (not sent)
'.' are packet already sent but not freed by sw

In this example, we have rs_thres=8. On next call to
ixgbe_tx_free_bufs(), some buffers will be freed.

===

Let's call ixgbe_dev_tx_descriptor_status(10):


- original version:

desc = 45 + 10 = 55
desc = ((55 + 8 - 1) / 8) * 8 = (62 / 8) * 8 = 56

wrong because it goes in the wrong direction, and because
56 does not have the RS bit

- after your patch:

desc = 45 + 10 = 55
desc = (((55 / 8) + 1) * 8) - 1 = (7 * 8) - 1 = 55

wrong because it goes in the wrong direction

- after my patch

desc = 45 - 10 - 1 = 34
desc = (((34 / 8) + 1) * 8) - 1 = (5 * 8) - 1 = 39

looks correct



Regards,
Olivier
  
Zhao1, Wei June 27, 2018, 2:07 a.m. UTC | #4
Hi, Olivier Matz

> -----Original Message-----
> From: Olivier Matz [mailto:olivier.matz@6wind.com]
> Sent: Tuesday, June 26, 2018 4:46 PM
> To: Zhao1, Wei <wei.zhao1@intel.com>
> Cc: Zhang, Qi Z <qi.z.zhang@intel.com>; dev@dpdk.org; Lu, Wenzhuo
> <wenzhuo.lu@intel.com>
> Subject: Re: [RFC] net/ixgbe: fix Tx descriptor status api
> 
> Hi Wei,
> 
> On Tue, Jun 26, 2018 at 01:38:22AM +0000, Zhao1, Wei wrote:
> > Hi,  Olivier Matz
> >
> >  Will you commit fix patch for i40e and ixgbe and em?
> 
> If you think the patch are relevant, yes :)
> 
> Here is a pre-version (last 5 patches):
> http://git.droids-corp.org/?p=dpdk.git;a=shortlog;h=refs/heads/tx-desc
> 
> It still need to fix checkpatch issues, few more tests, and rebase on next-net.
> 
> > And the code " dd = (desc / txq->tx_rs_thresh + 1) * txq->tx_rs_thresh -
> 1;"
> > Is only proper for tx function ixgbe_xmit_pkts_simple  and
> ixgbe_xmit_pkts_vec ().
> > But not proper for ixgbe_xmit_pkts (), the RS bit set rule is different from
> all these two.
> 
> Can you please give more detail please?
> 
> Note this code, maybe you are talking about this?


https://mails.dpdk.org/archives/dev/2018-June/105053.html
This reminder is from zhangqi to my patch, he seems reasonable.
He has find that in tx function  ixgbe_xmit_pkts (), it will deal with some packets with several segments.
In this case, txq->nb_tx_used = 0 will be set after set RS bit even if the last packet span several segements, but 
" dd = (desc / txq->tx_rs_thresh + 1) * txq->tx_rs_thresh -1" does not considering this problem.
Although we can change code to "txq->nb_tx_used = 0;  ------>  txq->nb_tx_used = txq->nb_tx_used - txq->tx_rs_thresh"
But that will cause problem in function ixgbe_xmit_cleanup().
So, I agree with zhangqi for this point.


> 
> +       /* In full featured mode, RS bit is only set in the last descriptor */
> +       /* of a multisegments packet */
> +       if (!((txq->offloads == 0) &&
> +             (txq->tx_rs_thresh >= RTE_PMD_IXGBE_TX_MAX_BURST)))
> +               dd = txq->sw_ring[dd].last_id;
> 
> Maybe there is something better to test?
> 
> Just to ensure we are on the same line, here are some more infos.
> 
> ===
> 
> - sw advances the tail pointer
> - hw advances the head pointer
> - the software populates the ring with full buffers to be sent by
>   the hw
> - head points to the in-progress descriptor.
> - sw writes new descriptors at tail
> - head == tail means that the transmit queue is empty
> - when the hw has processed a descriptor, it sets the DD bit if
>   the descriptor has the RS (report status) bit.
> - the driver never reads the head (needs a pci transaction), instead it
> monitors the DD bit of a descriptor that has the RS bit
> 
> txq->tx_tail: sw value for tail register
> txq->tx_free_thresh: free buffers if count(free descs) < this value
> txq->tx_rs_thresh: RS bit is set every rs_thresh descriptor
> txq->tx_next_dd: next desc to scan for DD bit
> txq->tx_next_rs: next desc to set RS bit
> txq->last_desc_cleaned: last descriptor that have been cleaned
> txq->nb_tx_free: number of free descriptors
> 
> Example:
> 
> |----------------------------------------------------------------|
> |               D       R       R       R                        |
> |        ............xxxxxxxxxxxxxxxxxxxxxxxxx                   |
> |        <descs sent><- descs not sent yet  ->                   |
> |        ............xxxxxxxxxxxxxxxxxxxxxxxxx                   |
> |----------------------------------------------------------------|
>         ^last_desc_cleaned=8                    ^next_rs=47
>                 ^next_dd=15                   ^sw_tail=45
>                      ^hw_head=20
> 
>                      <----  nb_used  --------->
> 
> The hardware is currently processing the descriptor 20 'R' means the
> descriptor has the RS bit 'D' means the descriptor has the DD + RS bits 'x' are
> packets in txq (not sent) '.' are packet already sent but not freed by sw
> 
> In this example, we have rs_thres=8. On next call to ixgbe_tx_free_bufs(),
> some buffers will be freed.
> 
> ===
> 
> Let's call ixgbe_dev_tx_descriptor_status(10):
> 
> 
> - original version:
> 
> desc = 45 + 10 = 55
> desc = ((55 + 8 - 1) / 8) * 8 = (62 / 8) * 8 = 56
> 
> wrong because it goes in the wrong direction, and because
> 56 does not have the RS bit
> 
> - after your patch:
> 
> desc = 45 + 10 = 55
> desc = (((55 / 8) + 1) * 8) - 1 = (7 * 8) - 1 = 55
> 
> wrong because it goes in the wrong direction
> 
> - after my patch
> 
> desc = 45 - 10 - 1 = 34
> desc = (((34 / 8) + 1) * 8) - 1 = (5 * 8) - 1 = 39
> 
> looks correct

It seems you have change the definition of "offset", it is before tail not after.
My patch is based on the old comment in RTE function. If so, your patch will be ok.
BTW, last_desc_cleaned should be 7, not 8.


> 
> 
> 
> Regards,
> Olivier
  
Qi Zhang June 27, 2018, 1:38 p.m. UTC | #5
Hi Oliver

> -----Original Message-----
> From: Olivier Matz [mailto:olivier.matz@6wind.com]
> Sent: Tuesday, June 26, 2018 4:46 PM
> To: Zhao1, Wei <wei.zhao1@intel.com>
> Cc: Zhang, Qi Z <qi.z.zhang@intel.com>; dev@dpdk.org; Lu, Wenzhuo
> <wenzhuo.lu@intel.com>
> Subject: Re: [RFC] net/ixgbe: fix Tx descriptor status api
> 
> Hi Wei,
> 
> On Tue, Jun 26, 2018 at 01:38:22AM +0000, Zhao1, Wei wrote:
> > Hi,  Olivier Matz
> >
> >  Will you commit fix patch for i40e and ixgbe and em?
> 
> If you think the patch are relevant, yes :)
> 
> Here is a pre-version (last 5 patches):
> http://git.droids-corp.org/?p=dpdk.git;a=shortlog;h=refs/heads/tx-desc
> 
> It still need to fix checkpatch issues, few more tests, and rebase on next-net.
> 
> > And the code " dd = (desc / txq->tx_rs_thresh + 1) * txq->tx_rs_thresh - 1;"
> > Is only proper for tx function ixgbe_xmit_pkts_simple  and
> ixgbe_xmit_pkts_vec ().
> > But not proper for ixgbe_xmit_pkts (), the RS bit set rule is different from all
> these two.
> 
> Can you please give more detail please?

I think the problem in ixgbe_xmit_pkts is, we cannot guarantee tx_rs_thresh *n -1 will always get RS bit.
Because the next tx_rs_thresh cycle counted start from the last segment of the packet that cross the boundary, it could be any value. 

So probably we should return -ENOSUP if pkt_tx_burst = ixgbe_xmit_pkts,  or we need to change the method that write RS in ixgbe_xmit_pkts.

Btw, yes, we should look before the tail but not after, you fix is correct.

Thanks
Qi  

> 
> Note this code, maybe you are talking about this?
> 
> +       /* In full featured mode, RS bit is only set in the last descriptor */
> +       /* of a multisegments packet */
> +       if (!((txq->offloads == 0) &&
> +             (txq->tx_rs_thresh >= RTE_PMD_IXGBE_TX_MAX_BURST)))
> +               dd = txq->sw_ring[dd].last_id;
> 
> Maybe there is something better to test?
> 
> Just to ensure we are on the same line, here are some more infos.
> 
> ===
> 
> - sw advances the tail pointer
> - hw advances the head pointer
> - the software populates the ring with full buffers to be sent by
>   the hw
> - head points to the in-progress descriptor.
> - sw writes new descriptors at tail
> - head == tail means that the transmit queue is empty
> - when the hw has processed a descriptor, it sets the DD bit if
>   the descriptor has the RS (report status) bit.
> - the driver never reads the head (needs a pci transaction), instead it monitors
> the DD bit of a descriptor that has the RS bit
> 
> txq->tx_tail: sw value for tail register
> txq->tx_free_thresh: free buffers if count(free descs) < this value
> txq->tx_rs_thresh: RS bit is set every rs_thresh descriptor
> txq->tx_next_dd: next desc to scan for DD bit
> txq->tx_next_rs: next desc to set RS bit
> txq->last_desc_cleaned: last descriptor that have been cleaned
> txq->nb_tx_free: number of free descriptors
> 
> Example:
> 
> |----------------------------------------------------------------|
> |               D       R       R       R
> |
> |        ............xxxxxxxxxxxxxxxxxxxxxxxxx                   |
> |        <descs sent><- descs not sent yet  ->                   |
> |        ............xxxxxxxxxxxxxxxxxxxxxxxxx                   |
> |----------------------------------------------------------------|
>         ^last_desc_cleaned=8                    ^next_rs=47
>                 ^next_dd=15                   ^sw_tail=45
>                      ^hw_head=20
> 
>                      <----  nb_used  --------->
> 
> The hardware is currently processing the descriptor 20 'R' means the
> descriptor has the RS bit 'D' means the descriptor has the DD + RS bits 'x' are
> packets in txq (not sent) '.' are packet already sent but not freed by sw
> 
> In this example, we have rs_thres=8. On next call to ixgbe_tx_free_bufs(),
> some buffers will be freed.
> 
> ===
> 
> Let's call ixgbe_dev_tx_descriptor_status(10):
> 
> 
> - original version:
> 
> desc = 45 + 10 = 55
> desc = ((55 + 8 - 1) / 8) * 8 = (62 / 8) * 8 = 56
> 
> wrong because it goes in the wrong direction, and because
> 56 does not have the RS bit
> 
> - after your patch:
> 
> desc = 45 + 10 = 55
> desc = (((55 / 8) + 1) * 8) - 1 = (7 * 8) - 1 = 55
> 
> wrong because it goes in the wrong direction
> 
> - after my patch
> 
> desc = 45 - 10 - 1 = 34
> desc = (((34 / 8) + 1) * 8) - 1 = (5 * 8) - 1 = 39
> 
> looks correct
> 
> 
> 
> Regards,
> Olivier
  

Patch

diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 3e13d26ae..384587cc6 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -2606,10 +2606,15 @@  ixgbe_dev_tx_queue_setup(struct rte_eth_dev *dev,
 	    hw->mac.type == ixgbe_mac_X540_vf ||
 	    hw->mac.type == ixgbe_mac_X550_vf ||
 	    hw->mac.type == ixgbe_mac_X550EM_x_vf ||
-	    hw->mac.type == ixgbe_mac_X550EM_a_vf)
+	    hw->mac.type == ixgbe_mac_X550EM_a_vf) {
 		txq->tdt_reg_addr = IXGBE_PCI_REG_ADDR(hw, IXGBE_VFTDT(queue_idx));
-	else
+		txq->tdh_reg_addr = IXGBE_PCI_REG_ADDR(hw,
+				IXGBE_VFTDH(queue_idx));
+	} else {
 		txq->tdt_reg_addr = IXGBE_PCI_REG_ADDR(hw, IXGBE_TDT(txq->reg_idx));
+		txq->tdh_reg_addr = IXGBE_PCI_REG_ADDR(hw,
+				IXGBE_TDH(txq->reg_idx));
+	}
 
 	txq->tx_ring_phys_addr = tz->iova;
 	txq->tx_ring = (union ixgbe_adv_tx_desc *) tz->addr;
@@ -3140,22 +3145,38 @@  ixgbe_dev_tx_descriptor_status(void *tx_queue, uint16_t offset)
 {
 	struct ixgbe_tx_queue *txq = tx_queue;
 	volatile uint32_t *status;
-	uint32_t desc;
+	int32_t desc, dd;
 
 	if (unlikely(offset >= txq->nb_tx_desc))
 		return -EINVAL;
+	if (offset >= txq->nb_tx_desc - txq->nb_tx_free)
+		return RTE_ETH_TX_DESC_DONE;
+
+	desc = txq->tx_tail - offset - 1;
+	if (desc < 0)
+		desc += txq->nb_tx_desc;
 
-	desc = txq->tx_tail + offset;
-	/* go to next desc that has the RS bit */
-	desc = ((desc + txq->tx_rs_thresh - 1) / txq->tx_rs_thresh) *
-		txq->tx_rs_thresh;
-	if (desc >= txq->nb_tx_desc) {
-		desc -= txq->nb_tx_desc;
-		if (desc >= txq->nb_tx_desc)
-			desc -= txq->nb_tx_desc;
+	/* offset is too small, no other way than reading PCI reg */
+	if (unlikely(offset < txq->tx_rs_thresh)) {
+		int16_t tx_head, queue_size;
+		tx_head = ixgbe_read_addr(txq->tdh_reg_addr);
+		queue_size = txq->tx_tail - tx_head;
+		if (queue_size < 0)
+			queue_size += txq->nb_tx_desc;
+		return queue_size > offset ? RTE_ETH_TX_DESC_FULL :
+			RTE_ETH_TX_DESC_DONE;
 	}
 
-	status = &txq->tx_ring[desc].wb.status;
+	/* index of the dd bit to look at */
+	dd = (desc / txq->tx_rs_thresh + 1) * txq->tx_rs_thresh - 1;
+
+	/* In full featured mode, RS bit is only set in the last descriptor */
+	/* of a multisegments packet */
+	if (!((txq->offloads == 0) &&
+	      (txq->tx_rs_thresh >= RTE_PMD_IXGBE_TX_MAX_BURST)))
+		dd = txq->sw_ring[dd].last_id;
+
+	status = &txq->tx_ring[dd].wb.status;
 	if (*status & rte_cpu_to_le_32(IXGBE_ADVTXD_STAT_DD))
 		return RTE_ETH_TX_DESC_DONE;
 
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.h b/drivers/net/ixgbe/ixgbe_rxtx.h
index 39378f754..384f6324d 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.h
+++ b/drivers/net/ixgbe/ixgbe_rxtx.h
@@ -201,6 +201,7 @@  struct ixgbe_tx_queue {
 		struct ixgbe_tx_entry_v *sw_ring_v; /**< address of SW ring for vector PMD */
 	};
 	volatile uint32_t   *tdt_reg_addr; /**< Address of TDT register. */
+	volatile uint32_t   *tdh_reg_addr; /**< Address of TDH register. */
 	uint16_t            nb_tx_desc;    /**< number of TX descriptors. */
 	uint16_t            tx_tail;       /**< current value of TDT reg. */
 	/**< Start freeing TX buffers if there are less free descriptors than