[dpdk-dev] [PATCH v3 3/6] power: add simple power management API and callback

Liang, Ma liang.j.ma at intel.com
Mon Sep 14 23:01:40 CEST 2020


On 04 Sep 11:33, Ananyev, Konstantin wrote:

<snip>
> > +struct rte_eth_dev *dev = &rte_eth_devices[port_id];
> > +
> > +if (unlikely(nb_rx == 0)) {
> > +dev->empty_poll_stats[qidx].num++;
>
Hi Konstantin,
   Agree, v4 will relocate the meta data to seperate structure. 
   and without touch the rte_ethdev structure. 
> I believe there are two fundamental issues with that approach:
> 1. You put metadata specific lib (power) callbacks into rte_eth_dev struct.
> 2. These callbacks do access rte_eth_devices[] directly.
> That doesn't look right to me - rte_eth_dev structure supposed to be treated
> as internal one librt_ether and underlying drivers and should be accessed directly
> by outer code.
> If these callbacks need some extra metadata, then it is responsibility
> of power library to allocate/manage these metadata.
> You can pass pointer to this metadata via last parameter for rte_eth_add_rx_callback().
> 
> > +if (unlikely(dev->empty_poll_stats[qidx].num >
> > +     ETH_EMPTYPOLL_MAX)) {
> > +volatile void *target_addr;
> > +uint64_t expected, mask;
> > +uint16_t ret;
> > +
> > +/*
> > + * get address of next descriptor in the RX
> > + * ring for this queue, as well as expected
> > + * value and a mask.
> > + */
> > +ret = (*dev->dev_ops->next_rx_desc)
> > +(dev->data->rx_queues[qidx],
> > + &target_addr, &expected, &mask);
> > +if (ret == 0)
> > +/* -1ULL is maximum value for TSC */
> > +rte_power_monitor(target_addr,
> > +  expected, mask,
> > +  0, -1ULL);
> > +}
> > +} else
> > +dev->empty_poll_stats[qidx].num = 0;
> > +
> > +return nb_rx;
> > +}
> > +
> > +static uint16_t
> > +rte_power_mgmt_pause(uint16_t port_id, uint16_t qidx,
> > +struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> > +uint16_t max_pkts __rte_unused, void *_  __rte_unused)
> > +{
> > +struct rte_eth_dev *dev = &rte_eth_devices[port_id];
> > +
> > +int i;
> > +
> > +if (unlikely(nb_rx == 0)) {
> > +
> > +dev->empty_poll_stats[qidx].num++;
> > +
> > +if (unlikely(dev->empty_poll_stats[qidx].num >
> > +     ETH_EMPTYPOLL_MAX)) {
> > +
> > +for (i = 0; i < RTE_ETH_PAUSE_NUM; i++)
> > +rte_pause();
> > +
> > +}
> > +} else
> > +dev->empty_poll_stats[qidx].num = 0;
> > +
> > +return nb_rx;
> > +}
> > +
> > +static uint16_t
> > +rte_power_mgmt_scalefreq(uint16_t port_id, uint16_t qidx,
> > +struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> > +uint16_t max_pkts __rte_unused, void *_  __rte_unused)
> > +{
> > +struct rte_eth_dev *dev = &rte_eth_devices[port_id];
> > +
> > +if (unlikely(nb_rx == 0)) {
> > +dev->empty_poll_stats[qidx].num++;
> > +if (unlikely(dev->empty_poll_stats[qidx].num >
> > +     ETH_EMPTYPOLL_MAX)) {
> > +
> > +/*scale down freq */
> > +rte_power_freq_min(rte_lcore_id());
> > +
> > +}
> > +} else {
> > +dev->empty_poll_stats[qidx].num = 0;
> > +/* scal up freq */
> > +rte_power_freq_max(rte_lcore_id());
> > +}
> > +
> > +return nb_rx;
> > +}
> > +
> > +int
> > +rte_power_pmd_mgmt_enable(unsigned int lcore_id,
> > +uint16_t port_id,
> > +enum rte_eth_dev_power_mgmt_cb_mode mode)
> > +{
> > +struct rte_eth_dev *dev;
> > +
> > +RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> > +dev = &rte_eth_devices[port_id];
> > +
> > +if (dev->pwr_mgmt_state == RTE_ETH_DEV_POWER_MGMT_ENABLED)
> > +return -EINVAL;
> > +/* allocate memory for empty poll stats */
> > +dev->empty_poll_stats = rte_malloc_socket(NULL,
> > +  sizeof(struct rte_eth_ep_stat)
> > +  * RTE_MAX_QUEUES_PER_PORT,
> > +  0, dev->data->numa_node);
> > +if (dev->empty_poll_stats == NULL)
> > +return -ENOMEM;
> > +
> > +switch (mode) {
> > +case RTE_ETH_DEV_POWER_MGMT_CB_WAIT:
> > +if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_WAITPKG))
> > +return -ENOTSUP;
> 
> Here and in other places: in case of error return you don't' free your empty_poll_stats.
> 
> > +dev->cur_pwr_cb = rte_eth_add_rx_callback(port_id, 0,
> 
> Why zero for queue number, why not to pass queue_id as a parameter for that function?
v4 will move to use queue_id instead of 0. v3 still assume only queue 0 is used.
> 
> > +rte_power_mgmt_umwait, NULL);
> 
> As I said above, instead of NULL - could be pointer to metadata struct.
v4 will address this. 
> 
> > +break;
> > +case RTE_ETH_DEV_POWER_MGMT_CB_SCALE:
> > +/* init scale freq */
> > +if (rte_power_init(lcore_id))
> > +return -EINVAL;
> > +dev->cur_pwr_cb = rte_eth_add_rx_callback(port_id, 0,
> > +rte_power_mgmt_scalefreq, NULL);
> > +break;
> > +case RTE_ETH_DEV_POWER_MGMT_CB_PAUSE:
> > +dev->cur_pwr_cb = rte_eth_add_rx_callback(port_id, 0,
> > +rte_power_mgmt_pause, NULL);
> > +break;
> > +}
> > +
> > +dev->cb_mode = mode;
> > +dev->pwr_mgmt_state = RTE_ETH_DEV_POWER_MGMT_ENABLED;
> > +return 0;
> > +}
> > +
> > +int
> > +rte_power_pmd_mgmt_disable(unsigned int lcore_id,
> > +uint16_t port_id)
> > +{
> > +struct rte_eth_dev *dev;
> > +
> > +RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> > +dev = &rte_eth_devices[port_id];
> > +
> > +/*add flag check */
> > +
> > +if (dev->pwr_mgmt_state == RTE_ETH_DEV_POWER_MGMT_DISABLED)
> > +return -EINVAL;
> > +
> > +/* rte_free ignores NULL so safe to call without checks */
> > +rte_free(dev->empty_poll_stats);
> 
> You can't free callback metadata before removing the callback itself.
> In fact, with current rx callback code it is not safe to free it
> even after (we discussed it offline).
agree. 
> 
> > +
> > +switch (dev->cb_mode) {
> > +case RTE_ETH_DEV_POWER_MGMT_CB_WAIT:
> > +case RTE_ETH_DEV_POWER_MGMT_CB_PAUSE:
> > +rte_eth_remove_rx_callback(port_id, 0,
> > +   dev->cur_pwr_cb);
> > +break;
> > +case RTE_ETH_DEV_POWER_MGMT_CB_SCALE:
> > +rte_power_freq_max(lcore_id);
> 
> Stupid q: what makes you think that lcore frequency was max,
> *before* you setup the callback?
that is because the rte_power_init() has figured out the system max.
the init code invocate rte_power_init() already. 
> 
> > +rte_eth_remove_rx_callback(port_id, 0,
> > +   dev->cur_pwr_cb);
> > +if (rte_power_exit(lcore_id))
> > +return -EINVAL;
> > +break;
> > +}
> > +
> > +dev->pwr_mgmt_state = RTE_ETH_DEV_POWER_MGMT_DISABLED;
> > +dev->cur_pwr_cb = NULL;
> > +dev->cb_mode = 0;
> > +
> > +return 0;
> > +}
> > diff --git a/lib/librte_power/rte_power_version.map b/lib/librte_power/rte_power_version.map
> > index 00ee5753e2..ade83cfd4f 100644
> > --- a/lib/librte_power/rte_power_version.map
> > +++ b/lib/librte_power/rte_power_version.map
> > @@ -34,4 +34,8 @@ EXPERIMENTAL {
> >  rte_power_guest_channel_receive_msg;
> >  rte_power_poll_stat_fetch;
> >  rte_power_poll_stat_update;
> > +# added in 20.08
> > +rte_power_pmd_mgmt_disable;
> > +rte_power_pmd_mgmt_enable;
> > +
> >  };
> > --
> > 2.17.1
> 


More information about the dev mailing list