int
MAIN(int argc, char **argv)	/* MAIN is a macro from the example's main.h; it expands to main on the Linux target */
{
    struct lcore_queue_conf *qconf;
    struct rte_eth_dev_info dev_info;
    int ret;
    uint8_t nb_ports;
    uint8_t nb_ports_available;
    uint8_t portid, last_port;
    unsigned lcore_id, rx_lcore_id;
    unsigned nb_ports_in_mask = 0;

    /* init EAL */
    ret = rte_eal_init(argc, argv);
    if (ret < 0)
        rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
    argc -= ret;
    argv += ret;

    /* parse application arguments (after the EAL ones) */
    ret = l2fwd_parse_args(argc, argv);
    if (ret < 0)
        rte_exit(EXIT_FAILURE, "Invalid L2FWD arguments\n");

    /* create the mbuf pool */
    l2fwd_pktmbuf_pool =
        rte_mempool_create("mbuf_pool", NB_MBUF,
                           MBUF_SIZE, 32,
                           sizeof(struct rte_pktmbuf_pool_private),
                           rte_pktmbuf_pool_init, NULL,
                           rte_pktmbuf_init, NULL,
                           rte_socket_id(), 0);
    if (l2fwd_pktmbuf_pool == NULL)
        rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n");

    /* init driver(s) */
    if (rte_pmd_init_all() < 0)
        rte_exit(EXIT_FAILURE, "Cannot init pmd\n");

    if (rte_eal_pci_probe() < 0)
        rte_exit(EXIT_FAILURE, "Cannot probe PCI\n");

    nb_ports = rte_eth_dev_count();
    if (nb_ports == 0)
        rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n");

    if (nb_ports > RTE_MAX_ETHPORTS)
        nb_ports = RTE_MAX_ETHPORTS;

    /* reset l2fwd_dst_ports */
    for (portid = 0; portid < RTE_MAX_ETHPORTS; portid++)
        l2fwd_dst_ports[portid] = 0;
    last_port = 0;

    /* port0 sends to port1 and port1 to port0: enabled ports are grouped
     * into pairs that forward to each other. */
    /*
     * Each logical core is assigned a dedicated TX queue on each port.
     */
    for (portid = 0; portid < nb_ports; portid++) {
        /* skip ports that are not enabled */
        if ((l2fwd_enabled_port_mask & (1 << portid)) == 0)
            continue;

        if (nb_ports_in_mask % 2) {
            l2fwd_dst_ports[portid] = last_port;
            l2fwd_dst_ports[last_port] = portid;
        }
        else
            last_port = portid;

        nb_ports_in_mask++;

        rte_eth_dev_info_get(portid, &dev_info);
    }
    if (nb_ports_in_mask % 2) {
        printf("Notice: odd number of ports in portmask.\n");
        l2fwd_dst_ports[last_port] = last_port;
    }

    rx_lcore_id = 0;
    qconf = NULL;

    /* Each lcore polls up to l2fwd_rx_queue_per_lcore ports, and each port
     * (strictly speaking each queue, but every port here has a single
     * queue) is polled by exactly one lcore. */
    /* Initialize the port/queue configuration of each logical core */
    for (portid = 0; portid < nb_ports; portid++) {
        /* skip ports that are not enabled */
        if ((l2fwd_enabled_port_mask & (1 << portid)) == 0)
            continue;

        /* get the lcore_id for this port */
        while (rte_lcore_is_enabled(rx_lcore_id) == 0 ||
               lcore_queue_conf[rx_lcore_id].n_rx_port ==
               l2fwd_rx_queue_per_lcore) {
            rx_lcore_id++;
            if (rx_lcore_id >= RTE_MAX_LCORE)
                rte_exit(EXIT_FAILURE, "Not enough cores\n");
        }

        if (qconf != &lcore_queue_conf[rx_lcore_id])
            /* Assigned a new logical core in the loop above. */
            qconf = &lcore_queue_conf[rx_lcore_id];

        qconf->rx_port_list[qconf->n_rx_port] = portid;
        qconf->n_rx_port++;
        printf("Lcore %u: RX port %u\n", rx_lcore_id, (unsigned) portid);
    }

    nb_ports_available = nb_ports;

    /* Set up each port's RX and TX queues. */
    /* Initialise each port */
    for (portid = 0; portid < nb_ports; portid++) {
        /* skip ports that are not enabled */
        if ((l2fwd_enabled_port_mask & (1 << portid)) == 0) {
            printf("Skipping disabled port %u\n", (unsigned) portid);
            nb_ports_available--;
            continue;
        }
        /* init port */
        printf("Initializing port %u... ", (unsigned) portid);
        fflush(stdout);
        ret = rte_eth_dev_configure(portid, 1, 1, &port_conf);
        if (ret < 0)
            rte_exit(EXIT_FAILURE, "Cannot configure device: err=%d, port=%u\n",
                     ret, (unsigned) portid);

        rte_eth_macaddr_get(portid, &l2fwd_ports_eth_addr[portid]);

        /* init one RX queue */
        fflush(stdout);
        ret = rte_eth_rx_queue_setup(portid, 0, nb_rxd,
                                     rte_eth_dev_socket_id(portid), &rx_conf,
                                     l2fwd_pktmbuf_pool);
        if (ret < 0)
            rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup:err=%d, port=%u\n",
                     ret, (unsigned) portid);

        /* init one TX queue on each port */
        fflush(stdout);
        ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
                                     rte_eth_dev_socket_id(portid), &tx_conf);
        if (ret < 0)
            rte_exit(EXIT_FAILURE, "rte_eth_tx_queue_setup:err=%d, port=%u\n",
                     ret, (unsigned) portid);

        /* Start device */
        ret = rte_eth_dev_start(portid);
        if (ret < 0)
            rte_exit(EXIT_FAILURE, "rte_eth_dev_start:err=%d, port=%u\n",
                     ret, (unsigned) portid);

        printf("done: \n");

        rte_eth_promiscuous_enable(portid);

        printf("Port %u, MAC address: %02X:%02X:%02X:%02X:%02X:%02X\n\n",
               (unsigned) portid,
               l2fwd_ports_eth_addr[portid].addr_bytes[0],
               l2fwd_ports_eth_addr[portid].addr_bytes[1],
               l2fwd_ports_eth_addr[portid].addr_bytes[2],
               l2fwd_ports_eth_addr[portid].addr_bytes[3],
               l2fwd_ports_eth_addr[portid].addr_bytes[4],
               l2fwd_ports_eth_addr[portid].addr_bytes[5]);

        /* initialize port stats */
        memset(&port_statistics, 0, sizeof(port_statistics));
    }

    if (!nb_ports_available) {
        rte_exit(EXIT_FAILURE,
                 "All available ports are disabled. Please set portmask.\n");
    }

    check_all_ports_link_status(nb_ports, l2fwd_enabled_port_mask);

    /* Launch the l2fwd forwarding loop on every lcore. */
    /* launch per-lcore init on every lcore */
    rte_eal_mp_remote_launch(l2fwd_launch_one_lcore, NULL, CALL_MASTER);
    RTE_LCORE_FOREACH_SLAVE(lcore_id) {
        if (rte_eal_wait_lcore(lcore_id) < 0)
            return -1;
    }

    return 0;
}
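
The pairing loop above is terse, so here is a tiny standalone sketch of just that logic (a hypothetical harness: the 0x0f mask and the 8-port bound are made up for illustration, while the loop body mirrors the example):

#include <stdio.h>

/* Standalone sketch of the port-pairing loop in MAIN(): with portmask
 * 0x0f it pairs (0,1) and (2,3); a leftover odd port pairs with itself. */
int main(void)
{
    unsigned enabled_port_mask = 0x0f;  /* assumed example mask */
    unsigned dst_ports[8] = {0};
    unsigned portid, last_port = 0, nb_ports_in_mask = 0;

    for (portid = 0; portid < 8; portid++) {
        if ((enabled_port_mask & (1u << portid)) == 0)
            continue;
        if (nb_ports_in_mask % 2) {     /* second port of a pair */
            dst_ports[portid] = last_port;
            dst_ports[last_port] = portid;
        } else                          /* first port of a pair */
            last_port = portid;
        nb_ports_in_mask++;
    }
    if (nb_ports_in_mask % 2)           /* odd number of enabled ports */
        dst_ports[last_port] = last_port;

    for (portid = 0; portid < 8; portid++)
        if (enabled_port_mask & (1u << portid))
            printf("port %u -> port %u\n", portid, dst_ports[portid]);
    return 0;
}

With portmask 0x0f this prints 0->1, 1->0, 2->3, 3->2; with 0x07 the unpaired port 2 forwards back to itself.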

The port initialization process is analyzed in detail below. For each port, rte_eth_dev_configure is called first to configure the number of RX/TX queues and to initialize the RX/TX queue control blocks:

int
rte_eth_dev_configure(uint8_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q,
                      const struct rte_eth_conf *dev_conf)
{
    struct rte_eth_dev *dev;
    struct rte_eth_dev_info dev_info;
    int diag;

    /* Only the primary process may configure a port. */
    /* This function is only safe when called from the primary process
     * in a multi-process setup */
    PROC_PRIMARY_OR_ERR_RET(-E_RTE_SECONDARY);

    if (port_id >= nb_ports || port_id >= RTE_MAX_ETHPORTS) {
        PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
        return (-EINVAL);
    }
    dev = &rte_eth_devices[port_id];

    /* During PMD driver init, the e1000 ops were registered as eth_em_ops. */
    FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP);
    FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -ENOTSUP);

    /* rte_eth_dev_start sets this flag to 1. */
    if (dev->data->dev_started) {
        PMD_DEBUG_TRACE(
            "port %d must be stopped to allow configuration\n", port_id);
        return (-EBUSY);
    }

    /* eth_em_infos_get reports the TX/RX queue limits; in this example
     * max_rx_queues = 1 and max_tx_queues = 1. */
    /*
     * Check that the numbers of RX and TX queues are not greater
     * than the maximum number of RX and TX queues supported by the
     * configured device.
     */
    (*dev->dev_ops->dev_infos_get)(dev, &dev_info);
    if (nb_rx_q > dev_info.max_rx_queues) {
        PMD_DEBUG_TRACE("ethdev port_id=%d nb_rx_queues=%d > %d\n",
                        port_id, nb_rx_q, dev_info.max_rx_queues);
        return (-EINVAL);
    }
    if (nb_rx_q == 0) {
        PMD_DEBUG_TRACE("ethdev port_id=%d nb_rx_q == 0\n", port_id);
        return (-EINVAL);
    }

    if (nb_tx_q > dev_info.max_tx_queues) {
        PMD_DEBUG_TRACE("ethdev port_id=%d nb_tx_queues=%d > %d\n",
                        port_id, nb_tx_q, dev_info.max_tx_queues);
        return (-EINVAL);
    }
    if (nb_tx_q == 0) {
        PMD_DEBUG_TRACE("ethdev port_id=%d nb_tx_q == 0\n", port_id);
        return (-EINVAL);
    }

    /* dev_conf holds the RX/TX mode configuration. */
    /* Copy the dev_conf parameter into the dev structure */
    memcpy(&dev->data->dev_conf, dev_conf, sizeof(dev->data->dev_conf));

    /* Jumbo-frame reception; usually not needed. */
    /*
     * If jumbo frames are enabled, check that the maximum RX packet
     * length is supported by the configured device.
     */
    if (dev_conf->rxmode.jumbo_frame == 1) {
        if (dev_conf->rxmode.max_rx_pkt_len >
            dev_info.max_rx_pktlen) {
            PMD_DEBUG_TRACE("ethdev port_id=%d max_rx_pkt_len %u"
                            " > max valid value %u\n",
                            port_id,
                            (unsigned)dev_conf->rxmode.max_rx_pkt_len,
                            (unsigned)dev_info.max_rx_pktlen);
            return (-EINVAL);
        }
        else if (dev_conf->rxmode.max_rx_pkt_len < ETHER_MIN_LEN) {
            PMD_DEBUG_TRACE("ethdev port_id=%d max_rx_pkt_len %u"
                            " < min valid value %u\n",
                            port_id,
                            (unsigned)dev_conf->rxmode.max_rx_pkt_len,
                            (unsigned)ETHER_MIN_LEN);
            return (-EINVAL);
        }
    } else
        /* Use default value */
        dev->data->dev_conf.rxmode.max_rx_pkt_len = ETHER_MAX_LEN;

    /* Multi-queue mode check: modes such as DCB and RSS select how the
     * NIC spreads traffic across multiple queues. */
    /* multiple queue mode checking */
    diag = rte_eth_dev_check_mq_mode(port_id, nb_rx_q, nb_tx_q, dev_conf);
    if (diag != 0) {
        PMD_DEBUG_TRACE("port%d rte_eth_dev_check_mq_mode = %d\n",
                        port_id, diag);
        return diag;
    }

    /*
     * Setup new number of RX/TX queues and reconfigure device.
     */
    /* Allocate the RX queue control blocks. */
    diag = rte_eth_dev_rx_queue_config(dev, nb_rx_q);
    if (diag != 0) {
        PMD_DEBUG_TRACE("port%d rte_eth_dev_rx_queue_config = %d\n",
                        port_id, diag);
        return diag;
    }

    /* Allocate the TX queue control blocks. */
    diag = rte_eth_dev_tx_queue_config(dev, nb_tx_q);
    if (diag != 0) {
        PMD_DEBUG_TRACE("port%d rte_eth_dev_tx_queue_config = %d\n",
                        port_id, diag);
        rte_eth_dev_rx_queue_config(dev, 0);
        return diag;
    }

    /* eth_em_configure: sets intr->flags |= E1000_FLAG_NEED_LINK_UPDATE. */
    diag = (*dev->dev_ops->dev_configure)(dev);
    if (diag != 0) {
        PMD_DEBUG_TRACE("port%d dev_configure = %d\n",
                        port_id, diag);
        rte_eth_dev_rx_queue_config(dev, 0);
        rte_eth_dev_tx_queue_config(dev, 0);
        return diag;
    }

    return 0;
}
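
For orientation, rte_eth_dev_rx_queue_config (and its TX twin) mainly (re)allocates the array of per-queue pointers that the PMD's queue setup later fills in. A minimal sketch of the idea, assuming the library's zmalloc-based allocation; the real function also handles reallocation and frees queues when shrinking:

/* Simplified sketch of the RX queue control-block allocation. */
static int
rx_queue_config_sketch(struct rte_eth_dev *dev, uint16_t nb_queues)
{
    /* One pointer per queue; eth_em_rx_queue_setup fills each slot in. */
    dev->data->rx_queues = rte_zmalloc("ethdev->rx_queues",
                                       sizeof(void *) * nb_queues,
                                       CACHE_LINE_SIZE);
    if (dev->data->rx_queues == NULL)
        return -ENOMEM;
    dev->data->nb_rx_queues = nb_queues;
    return 0;
}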

RX queue setup

int
rte_eth_rx_queue_setup(uint8_t port_id, uint16_t rx_queue_id,
                       uint16_t nb_rx_desc, unsigned int socket_id,
                       const struct rte_eth_rxconf *rx_conf,
                       struct rte_mempool *mp)
{
    struct rte_eth_dev *dev;
    struct rte_pktmbuf_pool_private *mbp_priv;
    struct rte_eth_dev_info dev_info;

    /* This function is only safe when called from the primary process
     * in a multi-process setup */
    PROC_PRIMARY_OR_ERR_RET(-E_RTE_SECONDARY);

    if (port_id >= nb_ports) {
        PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
        return (-EINVAL);
    }
    dev = &rte_eth_devices[port_id];
    if (rx_queue_id >= dev->data->nb_rx_queues) {
        PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", rx_queue_id);
        return (-EINVAL);
    }

    if (dev->data->dev_started) {
        PMD_DEBUG_TRACE(
            "port %d must be stopped to allow configuration\n", port_id);
        return -EBUSY;
    }

    FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP);
    FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_setup, -ENOTSUP);

    /*
     * Check the size of the mbuf data buffer.
     * This value must be provided in the private data of the memory pool.
     * First check that the memory pool has a valid private data.
     */
    (*dev->dev_ops->dev_infos_get)(dev, &dev_info);
    if (mp->private_data_size < sizeof(struct rte_pktmbuf_pool_private)) {
        PMD_DEBUG_TRACE("%s private_data_size %d < %d\n",
                        mp->name, (int) mp->private_data_size,
                        (int) sizeof(struct rte_pktmbuf_pool_private));
        return (-ENOSPC);
    }

    /* The mbuf data area (2048 bytes here) must exceed the device minimum
     * (256 bytes). */
    mbp_priv = rte_mempool_get_priv(mp);
    if ((uint32_t) (mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM) <
        dev_info.min_rx_bufsize) {
        PMD_DEBUG_TRACE("%s mbuf_data_room_size %d < %d "
                        "(RTE_PKTMBUF_HEADROOM=%d + min_rx_bufsize(dev)"
                        "=%d)\n",
                        mp->name,
                        (int)mbp_priv->mbuf_data_room_size,
                        (int)(RTE_PKTMBUF_HEADROOM +
                              dev_info.min_rx_bufsize),
                        (int)RTE_PKTMBUF_HEADROOM,
                        (int)dev_info.min_rx_bufsize);
        return (-EINVAL);
    }

    /* eth_em_rx_queue_setup: initializes the receive descriptors. */
    return (*dev->dev_ops->rx_queue_setup)(dev, rx_queue_id, nb_rx_desc,
                                           socket_id, rx_conf, mp);
}
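
To make the data-room check concrete: l2fwd creates its pool with MBUF_SIZE = 2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM, and rte_pktmbuf_pool_init records mbuf_data_room_size = MBUF_SIZE - sizeof(struct rte_mbuf), so the comparison above boils down to 2048 >= 256, as the inline comment notes. A standalone sketch of that arithmetic, using the usual default constants (assumed here, not taken from this build):

#include <assert.h>

/* Hypothetical illustration of the check above. */
#define SKETCH_HEADROOM        128                      /* RTE_PKTMBUF_HEADROOM default */
#define SKETCH_MIN_RX_BUFSIZE  256                      /* e1000's dev_info.min_rx_bufsize */
#define SKETCH_DATA_ROOM       (SKETCH_HEADROOM + 2048) /* mbuf_data_room_size */

int main(void)
{
    /* Same comparison the library makes: 2048 >= 256, so setup proceeds. */
    assert(SKETCH_DATA_ROOM - SKETCH_HEADROOM >= SKETCH_MIN_RX_BUFSIZE);
    return 0;
}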

TX queue setup

int
rte_eth_tx_queue_setup(uint8_t port_id, uint16_t tx_queue_id,
                       uint16_t nb_tx_desc, unsigned int socket_id,
                       const struct rte_eth_txconf *tx_conf)
{
    struct rte_eth_dev *dev;

    /* This function is only safe when called from the primary process
     * in a multi-process setup */
    PROC_PRIMARY_OR_ERR_RET(-E_RTE_SECONDARY);

    if (port_id >= RTE_MAX_ETHPORTS || port_id >= nb_ports) {
        PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
        return (-EINVAL);
    }
    dev = &rte_eth_devices[port_id];
    if (tx_queue_id >= dev->data->nb_tx_queues) {
        PMD_DEBUG_TRACE("Invalid TX queue_id=%d\n", tx_queue_id);
        return (-EINVAL);
    }

    /* Queues must be set up before the device is started. */
    if (dev->data->dev_started) {
        PMD_DEBUG_TRACE(
            "port %d must be stopped to allow configuration\n", port_id);
        return -EBUSY;
    }

    /* Dispatch to the PMD driver's tx_queue_setup. */
    FUNC_PTR_OR_ERR_RET(*dev->dev_ops->tx_queue_setup, -ENOTSUP);
    return (*dev->dev_ops->tx_queue_setup)(dev, tx_queue_id, nb_tx_desc,
                                           socket_id, tx_conf);
}

int
eth_em_tx_queue_setup(struct rte_eth_dev *dev,
                      uint16_t queue_idx,
                      uint16_t nb_desc,
                      unsigned int socket_id,
                      const struct rte_eth_txconf *tx_conf)
{
    const struct rte_memzone *tz;
    struct em_tx_queue *txq;
    struct e1000_hw *hw;
    uint32_t tsize;
    uint16_t tx_rs_thresh, tx_free_thresh;

    hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);

    /* The TX descriptor ring must be cache-line aligned. */
    /*
     * Validate number of transmit descriptors.
     * It must not exceed hardware maximum, and must be multiple
     * of EM_ALIGN.
     */
    if (((nb_desc * sizeof(*txq->tx_ring)) % EM_ALIGN) != 0 ||
        (nb_desc > EM_MAX_RING_DESC) ||
        (nb_desc < EM_MIN_RING_DESC)) {
        return -(EINVAL);
    }

    /* Threshold configuration. */
    tx_free_thresh = tx_conf->tx_free_thresh;
    if (tx_free_thresh == 0)
        tx_free_thresh = (uint16_t)RTE_MIN(nb_desc / 4,
                                           DEFAULT_TX_FREE_THRESH);

    tx_rs_thresh = tx_conf->tx_rs_thresh;
    if (tx_rs_thresh == 0)
        tx_rs_thresh = (uint16_t)RTE_MIN(tx_free_thresh,
                                         DEFAULT_TX_RS_THRESH);

    if (tx_free_thresh >= (nb_desc - 3)) {
        RTE_LOG(ERR, PMD, "tx_free_thresh must be less than the "
                "number of TX descriptors minus 3. (tx_free_thresh=%u "
                "port=%d queue=%d)\n", (unsigned int)tx_free_thresh,
                (int)dev->data->port_id, (int)queue_idx);
        return -(EINVAL);
    }
    if (tx_rs_thresh > tx_free_thresh) {
        RTE_LOG(ERR, PMD, "tx_rs_thresh must be less than or equal to "
                "tx_free_thresh. (tx_free_thresh=%u tx_rs_thresh=%u "
                "port=%d queue=%d)\n", (unsigned int)tx_free_thresh,
                (unsigned int)tx_rs_thresh, (int)dev->data->port_id,
                (int)queue_idx);
        return -(EINVAL);
    }

    /*
     * If rs_bit_thresh is greater than 1, then TX WTHRESH should be
     * set to 0. If WTHRESH is greater than zero, the RS bit is ignored
     * by the NIC and all descriptors are written back after the NIC
     * accumulates WTHRESH descriptors.
     */
    if (tx_conf->tx_thresh.wthresh != 0 && tx_rs_thresh != 1) {
        RTE_LOG(ERR, PMD, "TX WTHRESH must be set to 0 if "
                "tx_rs_thresh is greater than 1. (tx_rs_thresh=%u "
                "port=%d queue=%d)\n", (unsigned int)tx_rs_thresh,
                (int)dev->data->port_id, (int)queue_idx);
        return -(EINVAL);
    }

    /* If a txq already exists, free the mbufs in the old queue and the
     * txq itself. */
    /* Free memory prior to re-allocation if needed... */
    if (dev->data->tx_queues[queue_idx] != NULL) {
        em_tx_queue_release(dev->data->tx_queues[queue_idx]);
        dev->data->tx_queues[queue_idx] = NULL;
    }

    /* Reserve a memzone named rte_em_pmd_tx_ring_p_q, sized for
     * EM_MAX_RING_DESC TX descriptors. */
    /*
     * Allocate TX ring hardware descriptors. A memzone large enough to
     * handle the maximum ring size is allocated in order to allow for
     * resizing in later calls to the queue setup function.
     */
    tsize = sizeof(txq->tx_ring[0]) * EM_MAX_RING_DESC;
    if ((tz = ring_dma_zone_reserve(dev, "tx_ring", queue_idx, tsize,
                                    socket_id)) == NULL)
        return (-ENOMEM);

    /* Allocate the txq itself. */
    /* Allocate the tx queue data structure. */
    if ((txq = rte_zmalloc("ethdev TX queue", sizeof(*txq),
                           CACHE_LINE_SIZE)) == NULL)
        return (-ENOMEM);

    /* Allocate the txq sw_ring. */
    /* Allocate software ring */
    if ((txq->sw_ring = rte_zmalloc("txq->sw_ring",
                                    sizeof(txq->sw_ring[0]) * nb_desc,
                                    CACHE_LINE_SIZE)) == NULL) {
        em_tx_queue_release(txq);
        return (-ENOMEM);
    }

    txq->nb_tx_desc = nb_desc;
    txq->tx_free_thresh = tx_free_thresh;
    txq->tx_rs_thresh = tx_rs_thresh;
    txq->pthresh = tx_conf->tx_thresh.pthresh;
    txq->hthresh = tx_conf->tx_thresh.hthresh;
    txq->wthresh = tx_conf->tx_thresh.wthresh;
    txq->queue_id = queue_idx;
    txq->port_id = dev->data->port_id;

    txq->tdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_TDT(queue_idx));

    /* Physical address of the tx_ring. */
#ifndef RTE_LIBRTE_XEN_DOM0
    txq->tx_ring_phys_addr = (uint64_t) tz->phys_addr;
#else
    txq->tx_ring_phys_addr = rte_mem_phy2mch(tz->memseg_id, tz->phys_addr);
#endif
    /* Virtual address of the tx_ring. */
    txq->tx_ring = (struct e1000_data_desc *) tz->addr;

    PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64"\n",
                 txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);

    /* Initialize the circular ring: each entry's next points to the
     * following entry, and the last one wraps back to the first. */
    em_reset_tx_queue(txq);

    dev->data->tx_queues[queue_idx] = txq;
    return (0);
}
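
The final comment mentions the ring linking done by em_reset_tx_queue. A simplified sketch of that step, assuming the em_tx_queue/em_tx_entry layout used above (the real driver function also seeds each hardware descriptor's DD status bit):

/* Simplified sketch: link sw_ring into a circle. Entry prev's next_id is
 * set to i, so the last entry ends up pointing back at entry 0. */
static void
reset_tx_queue_sketch(struct em_tx_queue *txq)
{
    struct em_tx_entry *txe = txq->sw_ring;
    uint16_t i, prev;

    prev = (uint16_t)(txq->nb_tx_desc - 1);
    for (i = 0; i < txq->nb_tx_desc; i++) {
        txe[i].mbuf = NULL;
        txe[i].last_id = i;
        txe[prev].next_id = i;
        prev = i;
    }

    /* Start transmitting from slot 0 with an almost-full free count. */
    txq->tx_tail = 0;
    txq->nb_tx_free = (uint16_t)(txq->nb_tx_desc - 1);
}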

The last step of port initialization is enabling the port's RX/TX datapath, which mainly consists of telling the e1000 hardware the addresses of the TX and RX rings; the finer register details are not traced further here.

void
eth_em_tx_init(struct rte_eth_dev *dev)
{
    struct e1000_hw *hw;
    struct em_tx_queue *txq;
    uint32_t tctl;
    uint32_t txdctl;
    uint16_t i;

    hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);

    /* Advertise each queue's TX ring physical address to the hardware. */
    /* Setup the Base and Length of the Tx Descriptor Rings. */
    for (i = 0; i < dev->data->nb_tx_queues; i++) {
        uint64_t bus_addr;

        txq = dev->data->tx_queues[i];
        bus_addr = txq->tx_ring_phys_addr;
        E1000_WRITE_REG(hw, E1000_TDLEN(i),
                        txq->nb_tx_desc *
                        sizeof(*txq->tx_ring));
        E1000_WRITE_REG(hw, E1000_TDBAH(i),
                        (uint32_t)(bus_addr >> 32));
        E1000_WRITE_REG(hw, E1000_TDBAL(i), (uint32_t)bus_addr);

        /* Setup the HW Tx Head and Tail descriptor pointers. */
        E1000_WRITE_REG(hw, E1000_TDT(i), 0);
        E1000_WRITE_REG(hw, E1000_TDH(i), 0);

        /* Setup Transmit threshold registers. */
        txdctl = E1000_READ_REG(hw, E1000_TXDCTL(i));
        /*
         * bit 22 is reserved, on some models should always be 0,
         * on others - always 1.
         */
        txdctl &= E1000_TXDCTL_COUNT_DESC;
        txdctl |= txq->pthresh & 0x3F;
        txdctl |= (txq->hthresh & 0x3F) << 8;
        txdctl |= (txq->wthresh & 0x3F) << 16;
        txdctl |= E1000_TXDCTL_GRAN;
        E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
    }

    /* Program the Transmit Control Register. */
    tctl = E1000_READ_REG(hw, E1000_TCTL);
    tctl &= ~E1000_TCTL_CT;
    tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN |
             (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT));

    /* This write will effectively turn on the transmit unit. */
    E1000_WRITE_REG(hw, E1000_TCTL, tctl);
}

int
eth_em_rx_init(struct rte_eth_dev *dev)
{
    struct e1000_hw *hw;
    struct em_rx_queue *rxq;
    uint32_t rctl;
    uint32_t rfctl;
    uint32_t rxcsum;
    uint32_t rctl_bsize;
    uint16_t i;
    int ret;

    hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);

    /*
     * Make sure receives are disabled while setting
     * up the descriptor ring.
     */
    rctl = E1000_READ_REG(hw, E1000_RCTL);
    E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);

    rfctl = E1000_READ_REG(hw, E1000_RFCTL);

    /* Disable extended descriptor type. */
    rfctl &= ~E1000_RFCTL_EXTEN;
    /* Disable accelerated acknowledge */
    if (hw->mac.type == e1000_82574)
        rfctl |= E1000_RFCTL_ACK_DIS;

    E1000_WRITE_REG(hw, E1000_RFCTL, rfctl);

    /*
     * XXX TEMPORARY WORKAROUND: on some systems with 82573
     * long latencies are observed, like Lenovo X60. This
     * change eliminates the problem, but since having positive
     * values in RDTR is a known source of problems on other
     * platforms another solution is being sought.
     */
    if (hw->mac.type == e1000_82573)
        E1000_WRITE_REG(hw, E1000_RDTR, 0x20);

    dev->rx_pkt_burst = (eth_rx_burst_t)eth_em_recv_pkts;

    /* Work out the packet buffer size. */
    /* Determine RX bufsize. */
    rctl_bsize = EM_MAX_BUF_SIZE;
    for (i = 0; i < dev->data->nb_rx_queues; i++) {
        struct rte_pktmbuf_pool_private *mbp_priv;
        uint32_t buf_size;

        rxq = dev->data->rx_queues[i];
        mbp_priv = rte_mempool_get_priv(rxq->mb_pool);
        buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;
        rctl_bsize = RTE_MIN(rctl_bsize, buf_size);
    }

    rctl |= em_rctl_bsize(hw->mac.type, &rctl_bsize);

    /* Configure and enable each RX queue. */
    for (i = 0; i < dev->data->nb_rx_queues; i++) {
        uint64_t bus_addr;
        uint32_t rxdctl;

        rxq = dev->data->rx_queues[i];

        /* Allocate mbufs from the pool, fill them into rxq->sw_ring, and
         * record each packet buffer's physical address in rxq->rx_ring. */
        /* Allocate buffers for descriptor rings and setup queue */
        ret = em_alloc_rx_queue_mbufs(rxq);
        if (ret)
            return ret;

        /* Advertise the RX ring physical address to the hardware. */

        /*
         * Reset crc_len in case it was changed after queue setup by a
         * call to configure
         */
        rxq->crc_len =
            (uint8_t)(dev->data->dev_conf.rxmode.hw_strip_crc ?
                      0 : ETHER_CRC_LEN);

        bus_addr = rxq->rx_ring_phys_addr;
        E1000_WRITE_REG(hw, E1000_RDLEN(i),
                        rxq->nb_rx_desc *
                        sizeof(*rxq->rx_ring));
        E1000_WRITE_REG(hw, E1000_RDBAH(i),
                        (uint32_t)(bus_addr >> 32));
        E1000_WRITE_REG(hw, E1000_RDBAL(i), (uint32_t)bus_addr);

        E1000_WRITE_REG(hw, E1000_RDH(i), 0);
        E1000_WRITE_REG(hw, E1000_RDT(i), rxq->nb_rx_desc - 1);

        rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(0));
        rxdctl &= 0xFE000000;
        rxdctl |= rxq->pthresh & 0x3F;
        rxdctl |= (rxq->hthresh & 0x3F) << 8;
        rxdctl |= (rxq->wthresh & 0x3F) << 16;
        rxdctl |= E1000_RXDCTL_GRAN;
        E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);

        /* Receive function used for large (scattered) packets. */
        /*
         * Due to EM devices not having any sort of hardware
         * limit for packet length, jumbo frame of any size
         * can be accepted, thus we have to enable scattered
         * rx if jumbo frames are enabled (or if buffer size
         * is too small to accommodate non-jumbo packets)
         * to avoid splitting packets that don't fit into
         * one buffer.
         */
        if (dev->data->dev_conf.rxmode.jumbo_frame ||
            rctl_bsize < ETHER_MAX_LEN) {
            dev->rx_pkt_burst =
                (eth_rx_burst_t)eth_em_recv_scattered_pkts;
            dev->data->scattered_rx = 1;
        }
    }

    /* The remainder is omitted. */
    ...

    return 0;
}

At this point port initialization is complete and the ports are started. Back in main(), the polling receive loop is launched on every lcore:

/* launch per-lcore init on every lcore */
rte_eal_mp_remote_launch(l2fwd_launch_one_lcore, NULL, CALL_MASTER);
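
The launched entry point itself is a thin wrapper. A sketch of it, matching the shape of the 1.x-era example (it simply runs the loop shown below and always reports success):

static int
l2fwd_launch_one_lcore(__attribute__((unused)) void *dummy)
{
    /* Run the forwarding loop; it only returns for idle lcores. */
    l2fwd_main_loop();
    return 0;
}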

Each lcore's main loop works as follows:

/* main processing loop */
static void
l2fwd_main_loop(void)
{
    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    struct rte_mbuf *m;
    unsigned lcore_id;
    uint64_t prev_tsc, diff_tsc, cur_tsc, timer_tsc;
    unsigned i, j, portid, nb_rx;
    struct lcore_queue_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;

    prev_tsc = 0;
    timer_tsc = 0;

    lcore_id = rte_lcore_id();
    qconf = &lcore_queue_conf[lcore_id];

    if (qconf->n_rx_port == 0) {
        RTE_LOG(INFO, L2FWD, "lcore %u has nothing to do\n", lcore_id);
        return;
    }

    RTE_LOG(INFO, L2FWD, "entering main loop on lcore %u\n", lcore_id);

    /* Log which ports (queues) this lcore handles. */
    for (i = 0; i < qconf->n_rx_port; i++) {

        portid = qconf->rx_port_list[i];
        RTE_LOG(INFO, L2FWD, " -- lcoreid=%u portid=%u\n", lcore_id,
                portid);
    }

    while (1) {

        cur_tsc = rte_rdtsc();

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;

        /* Only every drain_tsc cycles: flush all buffered TX packets and
         * print the statistics. */
        if (unlikely(diff_tsc > drain_tsc)) {

            for (portid = 0; portid < RTE_MAX_ETHPORTS; portid++) {
                /* Nothing buffered for this port. */
                if (qconf->tx_mbufs[portid].len == 0)
                    continue;

                /* Call the device TX function and count the
                 * transmitted packets. */
                l2fwd_send_burst(&lcore_queue_conf[lcore_id],
                                 qconf->tx_mbufs[portid].len,
                                 (uint8_t) portid);

                /* Everything buffered for this port has now been
                 * sent, so reset len to 0. */
                qconf->tx_mbufs[portid].len = 0;
            }

            /* if timer is enabled */
            if (timer_period > 0) {

                /* advance the timer */
                timer_tsc += diff_tsc;

                /* if timer has reached its timeout */
                if (unlikely(timer_tsc >= (uint64_t) timer_period)) {

                    /* do this only on master core */
                    if (lcore_id == rte_get_master_lcore()) {
                        print_stats();
                        /* reset the timer */
                        timer_tsc = 0;
                    }
                }
            }

            prev_tsc = cur_tsc;
        }

        /* Poll the queues this lcore is responsible for. */
        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->n_rx_port; i++) {

            portid = qconf->rx_port_list[i];

            /* Each port only has queue 0. */
            nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
                                     pkts_burst, MAX_PKT_BURST);

            /* Update the RX statistics. */
            port_statistics[portid].rx += nb_rx;

            /* Rewrite the destination MAC of every received packet and
             * append it to the TX buffer of the paired port. */
            for (j = 0; j < nb_rx; j++) {
                m = pkts_burst[j];

                /* Prefetch the packet data into cache; the RX path
                 * appears to have prefetched it already. */
                rte_prefetch0(rte_pktmbuf_mtod(m, void *));

                /* forward */
                l2fwd_simple_forward(m, portid);
            }
        }
    }
}
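
l2fwd_simple_forward is where the actual L2 rewrite happens. A sketch of it, following the 1.x-era example (overwrite the destination MAC with 02:00:00:00:00:<dst_port>, set the source MAC to the egress port's own address, then buffer the packet for the paired port; l2fwd_send_packet is the example's per-port staging helper):

static void
l2fwd_simple_forward(struct rte_mbuf *m, unsigned portid)
{
    struct ether_hdr *eth;
    void *tmp;
    unsigned dst_port;

    /* The egress port is the other half of the pair built in MAIN(). */
    dst_port = l2fwd_dst_ports[portid];
    eth = rte_pktmbuf_mtod(m, struct ether_hdr *);

    /* Destination MAC becomes 02:00:00:00:00:<dst_port>; the 8-byte write
     * spills into s_addr, which is overwritten right after. */
    tmp = &eth->d_addr.addr_bytes[0];
    *((uint64_t *)tmp) = 0x000000000002 + ((uint64_t)dst_port << 40);

    /* Source MAC becomes the egress port's own address. */
    ether_addr_copy(&l2fwd_ports_eth_addr[dst_port], &eth->s_addr);

    /* Queue the packet; l2fwd_send_burst flushes it later. */
    l2fwd_send_packet(m, (uint8_t) dst_port);
}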

First, look at how packets are received: rte_eth_rx_burst dispatches to the device's rx_pkt_burst.

static inline uint16_t
rte_eth_rx_burst(uint8_t port_id, uint16_t queue_id,
                 struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
{
    struct rte_eth_dev *dev;

    dev = &rte_eth_devices[port_id];
    return (*dev->rx_pkt_burst)(dev->data->rx_queues[queue_id], rx_pkts, nb_pkts);
}

The PMD receive function is as follows:

uint16_t
eth_em_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
                 uint16_t nb_pkts)
{
    /* volatile prevents the compiler from caching these in registers:
     * every access must re-read the value from memory. */
    volatile struct e1000_rx_desc *rx_ring;
    volatile struct e1000_rx_desc *rxdp;
    struct em_rx_queue *rxq;
    struct em_rx_entry *sw_ring;
    struct em_rx_entry *rxe;
    struct rte_mbuf *rxm;
    struct rte_mbuf *nmb;
    struct e1000_rx_desc rxd;
    uint64_t dma_addr;
    uint16_t pkt_len;
    uint16_t rx_id;
    uint16_t nb_rx;
    uint16_t nb_hold;
    uint8_t status;

    rxq = rx_queue;

    nb_rx = 0;
    nb_hold = 0;
    rx_id = rxq->rx_tail;   /* current RX position */
    rx_ring = rxq->rx_ring; /* RX descriptors */
    sw_ring = rxq->sw_ring; /* mbufs */

    /* Receive a burst of up to nb_pkts packets (32 in l2fwd). */
    while (nb_rx < nb_pkts) {
        /*
         * The order of operations here is important as the DD status
         * bit must not be read after any other descriptor fields.
         * rx_ring and rxdp are pointing to volatile data so the order
         * of accesses cannot be reordered by the compiler. If they were
         * not volatile, they could be reordered which could lead to
         * using invalid descriptor fields when read from rxd.
         */

        /* Descriptor of the current packet. */
        rxdp = &rx_ring[rx_id];

        /* "Descriptor done" flag; it must be read first. */
        status = rxdp->status;
        if (!(status & E1000_RXD_STAT_DD))
            break;

        /* Take a local copy. */
        rxd = *rxdp;

        /*
         * End of packet.
         *
         * If the E1000_RXD_STAT_EOP flag is not set, the RX packet is
         * likely to be invalid and to be dropped by the various
         * validation checks performed by the network stack.
         *
         * Allocate a new mbuf to replenish the RX ring descriptor.
         * If the allocation fails:
         *    - arrange for that RX descriptor to be the first one
         *      being parsed the next time the receive function is
         *      invoked [on the same queue].
         *
         *    - Stop parsing the RX ring and return immediately.
         *
         * This policy does not drop the packet received in the RX
         * descriptor for which the allocation of a new mbuf failed.
         * Thus, it allows that packet to be later retrieved if
         * mbuf have been freed in the mean time.
         * As a side effect, holding RX descriptors instead of
         * systematically giving them back to the NIC may lead to
         * RX ring exhaustion situations.
         * However, the NIC can gracefully prevent such situations
         * to happen by sending specific "back-pressure" flow control
         * frames to its peer(s).
         */
        PMD_RX_LOG(DEBUG, "\nport_id=%u queue_id=%u rx_id=%u "
                   "status=0x%x pkt_len=%u\n",
                   (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
                   (unsigned) rx_id, (unsigned) status,
                   (unsigned) rte_le_to_cpu_16(rxd.length));

        /* Allocate a new mbuf to give back to the ring. */
        nmb = rte_rxmbuf_alloc(rxq->mb_pool);
        if (nmb == NULL) {
            PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
                       "queue_id=%u\n",
                       (unsigned) rxq->port_id,
                       (unsigned) rxq->queue_id);
            rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
            break;
        }

        /* One more descriptor is now held by software. */
        nb_hold++;

        /* sw_ring entry holding the just-received mbuf. */
        rxe = &sw_ring[rx_id];

        /* Advance the RX position, wrapping around the ring. */
        rx_id++;
        if (rx_id == rxq->nb_rx_desc)
            rx_id = 0;

        /* Prefetch the mbuf for the next iteration. */
        /* Prefetch next mbuf while processing current one. */
        rte_em_prefetch(sw_ring[rx_id].mbuf);

        /* Also prefetch upcoming descriptors and mbuf pointers; one
         * cache line (64 bytes) holds 4 descriptors. */
        /*
         * When next RX descriptor is on a cache-line boundary,
         * prefetch the next 4 RX descriptors and the next 8 pointers
         * to mbufs.
         */
        if ((rx_id & 0x3) == 0) {
            rte_em_prefetch(&rx_ring[rx_id]);
            rte_em_prefetch(&sw_ring[rx_id]);
        }

        /* Rearm RXD: attach new mbuf and reset status to zero. */

        /* Swap the new mbuf into the sw_ring entry. */
        rxm = rxe->mbuf;
        rxe->mbuf = nmb;
        dma_addr =
            rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(nmb));
        rxdp->buffer_addr = dma_addr;

        /* Reset the status of the current descriptor. */
        rxdp->status = 0;

        /*
         * Initialize the returned mbuf.
         * 1) setup generic mbuf fields:
         *    - number of segments,
         *    - next segment,
         *    - packet length,
         *    - RX port identifier.
         * 2) integrate hardware offload data, if any:
         *    - RSS flag & hash,
         *    - IP checksum flag,
         *    - VLAN TCI, if any,
         *    - error flags.
         */
        pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.length) -
                              rxq->crc_len);
        rxm->pkt.data = (char*) rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
        rte_packet_prefetch(rxm->pkt.data);
        rxm->pkt.nb_segs = 1;
        rxm->pkt.next = NULL;
        rxm->pkt.pkt_len = pkt_len;
        rxm->pkt.data_len = pkt_len;
        rxm->pkt.in_port = rxq->port_id;

        rxm->ol_flags = rx_desc_status_to_pkt_flags(status);
        rxm->ol_flags = (uint16_t)(rxm->ol_flags |
                        rx_desc_error_to_pkt_flags(rxd.errors));

        /* Only valid if PKT_RX_VLAN_PKT set in pkt_flags */
        rxm->pkt.vlan_macip.f.vlan_tci = rte_le_to_cpu_16(rxd.special);

        /* Hand the received mbuf back to the caller. */
        /*
         * Store the mbuf address into the next entry of the array
         * of returned packets.
         */
        rx_pkts[nb_rx++] = rxm;
    }

    /* Update the RX tail position. */
    rxq->rx_tail = rx_id;

    /* Account for the descriptors now held by software. */
    /*
     * If the number of free RX descriptors is greater than the RX free
     * threshold of the queue, advance the Receive Descriptor Tail (RDT)
     * register.
     * Update the RDT with the value of the last processed RX descriptor
     * minus 1, to guarantee that the RDT register is never equal to the
     * RDH register, which creates a "full" ring situation from the
     * hardware point of view...
     */
    nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
    if (nb_hold > rxq->rx_free_thresh) {
        PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
                   "nb_hold=%u nb_rx=%u\n",
                   (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
                   (unsigned) rx_id, (unsigned) nb_hold,
                   (unsigned) nb_rx);
        rx_id = (uint16_t) ((rx_id == 0) ?
                            (rxq->nb_rx_desc - 1) : (rx_id - 1));
        E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
        nb_hold = 0;
    }
    rxq->nb_rx_hold = nb_hold;
    return (nb_rx);
}

The transmit path: rte_eth_tx_burst dispatches to the device's tx_pkt_burst.

static inline uint16_t
rte_eth_tx_burst(uint8_t port_id, uint16_t queue_id,
                 struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
    struct rte_eth_dev *dev;

    dev = &rte_eth_devices[port_id];
    return (*dev->tx_pkt_burst)(dev->data->tx_queues[queue_id], tx_pkts, nb_pkts);
}
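
In l2fwd the call does not come directly from the main loop: packets are staged per port in qconf->tx_mbufs and flushed by l2fwd_send_burst, which was invoked in the drain branch above. A sketch of that helper, matching the shape of the 1.x-era example (port_statistics and the tx_mbufs staging buffer as used earlier):

/* Sketch of l2fwd's drain helper: push the per-port staging buffer
 * through rte_eth_tx_burst and free whatever the PMD did not accept. */
static int
l2fwd_send_burst(struct lcore_queue_conf *qconf, unsigned n, uint8_t port)
{
    struct rte_mbuf **m_table;
    unsigned ret;
    unsigned queueid = 0;   /* every port has a single TX queue */

    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    ret = rte_eth_tx_burst(port, (uint16_t) queueid, m_table, (uint16_t) n);
    port_statistics[port].tx += ret;
    if (unlikely(ret < n)) {
        /* The PMD ran out of descriptors: drop the remainder. */
        port_statistics[port].dropped += (n - ret);
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }

    return 0;
}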

This in turn calls the PMD transmit function:

uint16_t
eth_em_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
                 uint16_t nb_pkts)
{
    struct em_tx_queue *txq;
    struct em_tx_entry *sw_ring;
    struct em_tx_entry *txe, *txn;
    volatile struct e1000_data_desc *txr;
    volatile struct e1000_data_desc *txd;
    struct rte_mbuf *tx_pkt;
    struct rte_mbuf *m_seg;
    uint64_t buf_dma_addr;
    uint32_t popts_spec;
    uint32_t cmd_type_len;
    uint16_t slen;
    uint16_t ol_flags;
    uint16_t tx_id;
    uint16_t tx_last;
    uint16_t nb_tx;
    uint16_t nb_used;
    uint16_t tx_ol_req;
    uint32_t ctx;
    uint32_t new_ctx;
    union rte_vlan_macip hdrlen;

    txq = tx_queue;
    sw_ring = txq->sw_ring;
    txr = txq->tx_ring;
    /* Current TX position. */
    tx_id = txq->tx_tail;
    /* Old, already-transmitted mbufs are reclaimed first; the new mbufs
     * to send are then written in. */
    txe = &sw_ring[tx_id];

    /* Clean up first if too few TX descriptors are available. */
    /* Determine if the descriptor ring needs to be cleaned. */
    if ((txq->nb_tx_desc - txq->nb_tx_free) > txq->tx_free_thresh) {
        em_xmit_cleanup(txq);
    }

    /* nb_pkts is the total number of packets to send (32 here). */
    /* TX loop */
    for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
        new_ctx = 0;

        /* Pointer to the mbuf to transmit. */
        tx_pkt = *tx_pkts++;

        /* Prefetch into L1/L2 cache, ready for freeing the mbuf. */
        RTE_MBUF_PREFETCH_TO_FREE(txe->mbuf);

        /*
         * Determine how many (if any) context descriptors
         * are needed for offload functionality.
         */
        ol_flags = tx_pkt->ol_flags;

        /* If hardware offload required */
        tx_ol_req = (uint16_t)(ol_flags & (PKT_TX_IP_CKSUM |
                                           PKT_TX_L4_MASK));
        if (tx_ol_req) {
            hdrlen = tx_pkt->pkt.vlan_macip;
            /* Check whether a new context descriptor is needed. */
            /* If a new context is to be built, or the existing one reused. */
            ctx = what_ctx_update(txq, tx_ol_req, hdrlen);

            /* Only allocate context descriptor if required */
            new_ctx = (ctx == EM_CTX_NUM);
        }

        /* Descriptors needed = the packet's segment count, plus one if
         * a context descriptor is required. */
        /*
         * Keep track of how many descriptors are used this loop
         * This will always be the number of segments + the number of
         * Context descriptors required to transmit the packet
         */
        nb_used = (uint16_t)(tx_pkt->pkt.nb_segs + new_ctx);

        /* Last position used; allocation starts at tx_id itself, hence
         * the -1. */
        /*
         * The number of descriptors that must be allocated for a
         * packet is the number of segments of that packet, plus 1
         * Context Descriptor for the hardware offload, if any.
         * Determine the last TX descriptor to allocate in the TX ring
         * for the packet, starting from the current position (tx_id)
         * in the ring.
         */
        tx_last = (uint16_t) (tx_id + nb_used - 1);

        /* Wrap around. */
        /* Circular ring */
        if (tx_last >= txq->nb_tx_desc)
            tx_last = (uint16_t) (tx_last - txq->nb_tx_desc);

        PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u pktlen=%u"
                   " tx_first=%u tx_last=%u\n",
                   (unsigned) txq->port_id,
                   (unsigned) txq->queue_id,
                   (unsigned) tx_pkt->pkt.pkt_len,
                   (unsigned) tx_id,
                   (unsigned) tx_last);

        /*
         * Make sure there are enough TX descriptors available to
         * transmit the entire packet.
         * nb_used better be less than or equal to txq->tx_rs_thresh
         */
        while (unlikely (nb_used > txq->nb_tx_free)) {
            PMD_TX_FREE_LOG(DEBUG,
                            "Not enough free TX descriptors "
                            "nb_used=%4u nb_free=%4u "
                            "(port=%d queue=%d)",
                            nb_used, txq->nb_tx_free,
                            txq->port_id, txq->queue_id);

            if (em_xmit_cleanup(txq) != 0) {
                /* Could not clean any descriptors */
                if (nb_tx == 0)
                    return (0);
                goto end_of_tx;
            }
        }

        /*
         * By now there are enough free TX descriptors to transmit
         * the packet.
         */

        /*
         * Set common flags of all TX Data Descriptors.
         *
         * The following bits must be set in all Data Descriptors:
         *    - E1000_TXD_DTYP_DATA
         *    - E1000_TXD_DTYP_DEXT
         *
         * The following bits must be set in the first Data Descriptor
         * and are ignored in the other ones:
         *    - E1000_TXD_POPTS_IXSM
         *    - E1000_TXD_POPTS_TXSM
         *
         * The following bits must be set in the last Data Descriptor
         * and are ignored in the other ones:
         *    - E1000_TXD_CMD_VLE
         *    - E1000_TXD_CMD_IFCS
         *
         * The following bits must only be set in the last Data
         * Descriptor:
         *    - E1000_TXD_CMD_EOP
         *
         * The following bits can be set in any Data Descriptor, but
         * are only set in the last Data Descriptor:
         *    - E1000_TXD_CMD_RS
         */
        cmd_type_len = E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D |
                       E1000_TXD_CMD_IFCS;
        popts_spec = 0;

        /* Set VLAN Tag offload fields. */
        if (ol_flags & PKT_TX_VLAN_PKT) {
            cmd_type_len |= E1000_TXD_CMD_VLE;
            popts_spec = tx_pkt->pkt.vlan_macip.f.vlan_tci <<
                         E1000_TXD_VLAN_SHIFT;
        }

        if (tx_ol_req) {
            /*
             * Setup the TX Context Descriptor if required
             */
            if (new_ctx) {
                volatile struct e1000_context_desc *ctx_txd;

                /* A context descriptor is needed: the TX descriptor
                 * at tx_id holds the context. */
                ctx_txd = (volatile struct e1000_context_desc *)
                          &txr[tx_id];

                /* Next sw_ring entry. */
                txn = &sw_ring[txe->next_id];
                RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);

                if (txe->mbuf != NULL) {
                    rte_pktmbuf_free_seg(txe->mbuf);
                    txe->mbuf = NULL;
                }

                /* Store the context values into txq. */
                em_set_xmit_ctx(txq, ctx_txd, tx_ol_req,
                                hdrlen);

                txe->last_id = tx_last;

                /* Advance both tx_id and txe to the next entry. */
                tx_id = txe->next_id;
                txe = txn;
            }

            /*
             * Setup the TX Data Descriptor,
             * This path will go through
             * whatever new/reuse the context descriptor
             */
            popts_spec |= tx_desc_cksum_flags_to_upper(ol_flags);
        }

        m_seg = tx_pkt;
        do {
            txd = &txr[tx_id];
            txn = &sw_ring[txe->next_id];

            /* Reclaim the previously transmitted mbuf: its buffer
             * address was already written into a TX descriptor, so
             * the mbuf itself is no longer needed. */
            if (txe->mbuf != NULL)
                rte_pktmbuf_free_seg(txe->mbuf);

            /* Attach the current segment to txe. */
            txe->mbuf = m_seg;

            /*
             * Set up Transmit Data Descriptor.
             */
            slen = m_seg->pkt.data_len;
            buf_dma_addr = RTE_MBUF_DATA_DMA_ADDR(m_seg);

            txd->buffer_addr = rte_cpu_to_le_64(buf_dma_addr);
            txd->lower.data = rte_cpu_to_le_32(cmd_type_len | slen);
            txd->upper.data = rte_cpu_to_le_32(popts_spec);

            txe->last_id = tx_last;

            /* Advance tx_id. */
            tx_id = txe->next_id;
            txe = txn;
            m_seg = m_seg->pkt.next;
        } while (m_seg != NULL);

        /* Driver-specific flags (VLAN, IP checksum and the like) are
         * skipped here. */
        /*
         * The last packet data descriptor needs End Of Packet (EOP)
         */
        cmd_type_len |= E1000_TXD_CMD_EOP;
        txq->nb_tx_used = (uint16_t)(txq->nb_tx_used + nb_used);
        txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_used);

        /* Set RS bit only on threshold packets' last descriptor */
        if (txq->nb_tx_used >= txq->tx_rs_thresh) {
            PMD_TX_FREE_LOG(DEBUG,
                            "Setting RS bit on TXD id="
                            "%4u (port=%d queue=%d)",
                            tx_last, txq->port_id, txq->queue_id);

            cmd_type_len |= E1000_TXD_CMD_RS;

            /* Update txq RS bit counters */
            txq->nb_tx_used = 0;
        }
        txd->lower.data |= rte_cpu_to_le_32(cmd_type_len);
    }
end_of_tx:
    rte_wmb();

    /* Notify the hardware that packets are ready to be sent. */
    /*
     * Set the Transmit Descriptor Tail (TDT)
     */
    PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
               (unsigned) txq->port_id, (unsigned) txq->queue_id,
               (unsigned) tx_id, (unsigned) nb_tx);
    E1000_PCI_REG_WRITE(txq->tdt_reg_addr, tx_id);

    /* Update the TX queue tail position. */
    txq->tx_tail = tx_id;

    return (nb_tx);
}
