bgpd: Modify bgp to handle packet events in a FIFO

Current behavior of BGP is to have an event per connection.  Given
that, on startup of BGP with a high number of neighbors, you end
up with 2 * (# of peers) events being processed.  Additionally,
once BGP has selected the connection, this still only comes down
to 512 events in my setup.  This number of events swamps the event
system and, in addition, delays any other work from being done in
BGP at all, because the 512 events always take precedence over
everything else.  The other main events are the handling of the
metaQ (1 event), update group events (1 per update group) and the
zebra batching event.  These are being swamped.

Modify the BGP code to have a FIFO of connections.  As new data
comes in to read, place the connection on the end of the FIFO.
Have bgp_process_packet handle up to 100 packets spread
across the individual peers, where each peer/connection is limited
to the original quanta.  During testing I noticed that withdrawal
events at very very large scale were taking up to 40 seconds to
process, so I added a check for yielding to further limit the number
of packets being processed at a time.
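
To make the scheduling concrete, here is a small self-contained model
of the policy (hypothetical names, not FRR code; the real loop in
bgp_process_packet below additionally checks event_should_yield as a
second stop condition):

	#include <stdio.h>

	#define LIMIT  100 /* stand-in for BGP_PACKET_PROCESS_LIMIT */
	#define QUANTA 10  /* stand-in for the per-peer rpkt quanta */

	struct conn {
		int id;
		int pending; /* packets waiting in this connection's ibuf */
		struct conn *next;
	};

	static struct conn *head, *tail;

	static void fifo_push(struct conn *c)
	{
		c->next = NULL;
		if (tail)
			tail->next = c;
		else
			head = c;
		tail = c;
	}

	static struct conn *fifo_pop(void)
	{
		struct conn *c = head;

		if (c) {
			head = c->next;
			if (!head)
				tail = NULL;
		}
		return c;
	}

	/* One event's worth of work, like one bgp_process_packet() run. */
	static void process_event(void)
	{
		struct conn *c;
		int total = 0;

		while (total < LIMIT && (c = fifo_pop()) != NULL) {
			int burst = 0;

			/* Drain this connection up to its quanta... */
			while (c->pending && burst < QUANTA && total < LIMIT) {
				c->pending--; /* "process" one packet */
				burst++;
				total++;
			}
			/* ...and requeue it if it still has work to do. */
			if (c->pending)
				fifo_push(c);
		}
		printf("processed %d packets this event\n", total);
	}

	int main(void)
	{
		struct conn peers[4] = { { 1, 35 }, { 2, 5 }, { 3, 80 }, { 4, 12 } };

		for (int i = 0; i < 4; i++)
			fifo_push(&peers[i]);

		while (head) /* in bgpd, each run is a separate scheduled event */
			process_event();
		return 0;
	}

Because a single invocation can never exceed the global limit, other
events (metaQ, update groups, zebra batching) get a chance to run
between invocations instead of being starved.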

This change also allows BGP to be interactive again on scaled
setups during initial convergence.  Prior to this change, any vtysh
command entered would be delayed by tens of seconds in my setup
while BGP was doing other work.

Signed-off-by: Donald Sharp <sharpd@nvidia.com>
Donald Sharp 2025-03-21 07:48:50 -04:00
parent f3790640d3
commit 12bf042c68
7 changed files with 159 additions and 30 deletions

bgpd/bgp_fsm.c

@@ -184,7 +184,11 @@ static struct peer *peer_xfer_conn(struct peer *from_peer)
 	EVENT_OFF(keeper->t_delayopen);
 	EVENT_OFF(keeper->t_connect_check_r);
 	EVENT_OFF(keeper->t_connect_check_w);
-	EVENT_OFF(keeper->t_process_packet);
+
+	frr_with_mutex (&bm->peer_connection_mtx) {
+		if (peer_connection_fifo_member(&bm->connection_fifo, keeper))
+			peer_connection_fifo_del(&bm->connection_fifo, keeper);
+	}
 
 	/*
 	 * At this point in time, it is possible that there are packets pending
@@ -305,8 +309,13 @@ static struct peer *peer_xfer_conn(struct peer *from_peer)
 	bgp_reads_on(keeper);
 	bgp_writes_on(keeper);
 
-	event_add_event(bm->master, bgp_process_packet, keeper, 0,
-			&keeper->t_process_packet);
+	frr_with_mutex (&bm->peer_connection_mtx) {
+		if (!peer_connection_fifo_member(&bm->connection_fifo, keeper)) {
+			peer_connection_fifo_add_tail(&bm->connection_fifo, keeper);
+		}
+	}
+
+	event_add_event(bm->master, bgp_process_packet, NULL, 0, &bm->e_process_packet);
 
 	return (peer);
 }

bgpd/bgp_io.c

@@ -99,7 +99,11 @@ void bgp_reads_off(struct peer_connection *connection)
 	assert(fpt->running);
 
 	event_cancel_async(fpt->master, &connection->t_read, NULL);
-	EVENT_OFF(connection->t_process_packet);
+
+	frr_with_mutex (&bm->peer_connection_mtx) {
+		if (peer_connection_fifo_member(&bm->connection_fifo, connection))
+			peer_connection_fifo_del(&bm->connection_fifo, connection);
+	}
 
 	UNSET_FLAG(connection->thread_flags, PEER_THREAD_READS_ON);
 }
@@ -292,9 +296,13 @@ done:
 	event_add_read(fpt->master, bgp_process_reads, connection,
 		       connection->fd, &connection->t_read);
 
-	if (added_pkt)
-		event_add_event(bm->master, bgp_process_packet, connection, 0,
-				&connection->t_process_packet);
+	if (added_pkt) {
+		frr_with_mutex (&bm->peer_connection_mtx) {
+			if (!peer_connection_fifo_member(&bm->connection_fifo, connection))
+				peer_connection_fifo_add_tail(&bm->connection_fifo, connection);
+		}
+
+		event_add_event(bm->master, bgp_process_packet, NULL, 0, &bm->e_process_packet);
+	}
 }
 
 /*

bgpd/bgp_io.h

@@ -10,6 +10,7 @@
 #define BGP_WRITE_PACKET_MAX 64U
 #define BGP_READ_PACKET_MAX 10U
+#define BGP_PACKET_PROCESS_LIMIT 100
 
 #include "bgpd/bgpd.h"
 #include "frr_pthread.h"

bgpd/bgp_main.c

@@ -161,6 +161,14 @@ __attribute__((__noreturn__)) void sigint(void)
 	bgp_exit(0);
 
+	/*
+	 * This is being done after bgp_exit because items may be removed
+	 * from the connection_fifo
+	 */
+	peer_connection_fifo_fini(&bm->connection_fifo);
+	EVENT_OFF(bm->e_process_packet);
+	pthread_mutex_destroy(&bm->peer_connection_mtx);
+
 	exit(0);
 }

bgpd/bgp_packet.c

@@ -3967,6 +3967,18 @@ int bgp_capability_receive(struct peer_connection *connection,
  * would not, making event flow difficult to understand. Please think twice
  * before hacking this.
  *
+ * packet_processing is now a FIFO of connections that need to be handled.
+ * This loop has a maximum run of 100 (BGP_PACKET_PROCESS_LIMIT) packets,
+ * but each individual connection can only handle the quanta value as
+ * specified in bgp_vty.c.  If the connection still has work to do, place it
+ * on the back of the queue for more work.  Do note that event_should_yield
+ * is also being called to figure out if processing should stop and work be
+ * picked up after other items can run.  This was added *after* withdrawals
+ * started being processed at scale and this function was taking the cpu for
+ * 40+ seconds.  On my machine we are getting 2-3 packets before a yield
+ * should happen in the update case.  Withdrawal is 1 packet being processed
+ * (note: this is a very fast computer) before other items should be run.
+ *
  * Thread type: EVENT_EVENT
  * @param thread
  * @return 0
@@ -3979,30 +3991,54 @@ void bgp_process_packet(struct event *thread)
 	uint32_t rpkt_quanta_old; // how many packets to read
 	int fsm_update_result;	  // return code of bgp_event_update()
 	int mprc;		  // message processing return code
+	uint32_t processed = 0, curr_connection_processed = 0;
+	bool more_work = false;
+	size_t count;
+	uint32_t total_packets_to_process, total_processed = 0;
 
-	connection = EVENT_ARG(thread);
+	frr_with_mutex (&bm->peer_connection_mtx)
+		connection = peer_connection_fifo_pop(&bm->connection_fifo);
+
+	if (!connection)
+		goto done;
+
+	total_packets_to_process = BGP_PACKET_PROCESS_LIMIT;
 	peer = connection->peer;
 	rpkt_quanta_old = atomic_load_explicit(&peer->bgp->rpkt_quanta,
 					       memory_order_relaxed);
 	fsm_update_result = 0;
 
-	/* Guard against scheduled events that occur after peer deletion. */
-	if (connection->status == Deleted || connection->status == Clearing)
-		return;
-
-	unsigned int processed = 0;
-
-	while (processed < rpkt_quanta_old) {
+	while ((processed < total_packets_to_process) && connection) {
+		total_processed++;
+		/* Guard against scheduled events that occur after peer deletion. */
+		if (connection->status == Deleted || connection->status == Clearing) {
+			frr_with_mutex (&bm->peer_connection_mtx)
+				connection = peer_connection_fifo_pop(&bm->connection_fifo);
+
+			if (connection)
+				peer = connection->peer;
+
+			continue;
+		}
+
 		uint8_t type = 0;
 		bgp_size_t size;
 		char notify_data_length[2];
 
-		frr_with_mutex (&connection->io_mtx) {
+		frr_with_mutex (&connection->io_mtx)
 			peer->curr = stream_fifo_pop(connection->ibuf);
-		}
 
-		if (peer->curr == NULL) // no packets to process, hmm...
-			return;
+		if (peer->curr == NULL) {
+			frr_with_mutex (&bm->peer_connection_mtx)
+				connection = peer_connection_fifo_pop(&bm->connection_fifo);
+
+			if (connection)
+				peer = connection->peer;
+
+			continue;
+		}
 
 		/* skip the marker and copy the packet length */
 		stream_forward_getp(peer->curr, BGP_MARKER_SIZE);
@@ -4106,32 +4142,81 @@ void bgp_process_packet(struct event *thread)
 		stream_free(peer->curr);
 		peer->curr = NULL;
 		processed++;
+		curr_connection_processed++;
 
 		/* Update FSM */
 		if (mprc != BGP_PACKET_NOOP)
 			fsm_update_result = bgp_event_update(connection, mprc);
 		else
 			continue;
 
 		/*
 		 * If peer was deleted, do not process any more packets. This
 		 * is usually due to executing BGP_Stop or a stub deletion.
 		 */
-		if (fsm_update_result == FSM_PEER_TRANSFERRED
-		    || fsm_update_result == FSM_PEER_STOPPED)
-			break;
-	}
+		if (fsm_update_result == FSM_PEER_TRANSFERRED ||
+		    fsm_update_result == FSM_PEER_STOPPED) {
+			frr_with_mutex (&bm->peer_connection_mtx)
+				connection = peer_connection_fifo_pop(&bm->connection_fifo);
+
+			if (connection)
+				peer = connection->peer;
+
+			continue;
+		}
+
+		bool yield = event_should_yield(thread);
+		if (curr_connection_processed >= rpkt_quanta_old || yield) {
+			curr_connection_processed = 0;
+			frr_with_mutex (&bm->peer_connection_mtx) {
+				if (!peer_connection_fifo_member(&bm->connection_fifo, connection))
+					peer_connection_fifo_add_tail(&bm->connection_fifo,
+								      connection);
+				if (!yield)
+					connection = peer_connection_fifo_pop(&bm->connection_fifo);
+				else
+					connection = NULL;
+			}
+
+			if (connection)
+				peer = connection->peer;
+
+			continue;
+		}
 
-	if (fsm_update_result != FSM_PEER_TRANSFERRED
-	    && fsm_update_result != FSM_PEER_STOPPED) {
-		frr_with_mutex (&connection->io_mtx) {
-			// more work to do, come back later
-			if (connection->ibuf->count > 0)
-				event_add_event(bm->master, bgp_process_packet,
-						connection, 0,
-						&connection->t_process_packet);
+		frr_with_mutex (&connection->io_mtx) {
+			if (connection->ibuf->count > 0)
+				more_work = true;
+			else
+				more_work = false;
+		}
+
+		if (!more_work) {
+			frr_with_mutex (&bm->peer_connection_mtx)
+				connection = peer_connection_fifo_pop(&bm->connection_fifo);
+
+			if (connection)
+				peer = connection->peer;
+		}
+	}
+
+	if (connection) {
+		frr_with_mutex (&connection->io_mtx) {
+			if (connection->ibuf->count > 0)
+				more_work = true;
+			else
+				more_work = false;
+		}
+
+		frr_with_mutex (&bm->peer_connection_mtx) {
+			if (more_work &&
+			    !peer_connection_fifo_member(&bm->connection_fifo, connection))
+				peer_connection_fifo_add_tail(&bm->connection_fifo, connection);
 		}
 	}
+
+done:
+	frr_with_mutex (&bm->peer_connection_mtx)
+		count = peer_connection_fifo_count(&bm->connection_fifo);
+
+	if (count)
+		event_add_event(bm->master, bgp_process_packet, NULL, 0, &bm->e_process_packet);
 }
 
 /* Send EOR when routes are processed by selection deferral timer */

bgpd/bgpd.c

@@ -8683,6 +8683,10 @@ void bgp_master_init(struct event_loop *master, const int buffer_size,
 	bm = &bgp_master;
 
+	/* Initialize the peer connection FIFO list */
+	peer_connection_fifo_init(&bm->connection_fifo);
+	pthread_mutex_init(&bm->peer_connection_mtx, NULL);
+
 	zebra_announce_init(&bm->zebra_announce_head);
 	zebra_l2_vni_init(&bm->zebra_l2_vni_head);
 	zebra_l3_vni_init(&bm->zebra_l3_vni_head);

bgpd/bgpd.h

@@ -107,6 +107,9 @@ enum bgp_af_index {
 extern struct frr_pthread *bgp_pth_io;
 extern struct frr_pthread *bgp_pth_ka;
 
+/* FIFO list for peer connections */
+PREDECL_LIST(peer_connection_fifo);
+
 /* BGP master for system wide configurations and variables. */
 struct bgp_master {
 	/* BGP instance list. */
@@ -121,6 +124,11 @@ struct bgp_master {
 	/* BGP port number. */
 	uint16_t port;
 
+	/* FIFO list head for peer connections */
+	struct peer_connection_fifo_head connection_fifo;
+	struct event *e_process_packet;
+	pthread_mutex_t peer_connection_mtx;
+
 	/* Listener addresses */
 	struct list *addresses;
@@ -1378,7 +1386,6 @@ struct peer_connection {
 	struct event *t_pmax_restart;
 	struct event *t_routeadv;
-	struct event *t_process_packet;
 
 	struct event *t_stop_with_notify;
@@ -1394,7 +1401,14 @@ struct peer_connection {
 	union sockunion *su_local;  /* Sockunion of local address. */
 	union sockunion *su_remote; /* Sockunion of remote address. */
 
+	/* For FIFO list */
+	struct peer_connection_fifo_item fifo_item;
 };
 
+/* Declare the FIFO list implementation */
+DECLARE_LIST(peer_connection_fifo, struct peer_connection, fifo_item);
+
 const char *bgp_peer_get_connection_direction(struct peer_connection *connection);
 extern struct peer_connection *bgp_peer_connection_new(struct peer *peer);
 extern void bgp_peer_connection_free(struct peer_connection **connection);
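
Taken together, the PREDECL_LIST/DECLARE_LIST pair generates the typesafe
FIFO API used throughout this commit.  A condensed view of the lifecycle,
collecting the calls as they appear in the hunks above (a sketch for
orientation, not additional code from the diff):

	/* startup (bgp_master_init in bgpd.c) */
	peer_connection_fifo_init(&bm->connection_fifo);
	pthread_mutex_init(&bm->peer_connection_mtx, NULL);

	/* producer (bgp_process_reads in bgp_io.c): enqueue, schedule one event */
	frr_with_mutex (&bm->peer_connection_mtx) {
		if (!peer_connection_fifo_member(&bm->connection_fifo, connection))
			peer_connection_fifo_add_tail(&bm->connection_fifo, connection);
	}
	event_add_event(bm->master, bgp_process_packet, NULL, 0, &bm->e_process_packet);

	/* consumer (bgp_process_packet in bgp_packet.c): drain under the mutex */
	frr_with_mutex (&bm->peer_connection_mtx)
		connection = peer_connection_fifo_pop(&bm->connection_fifo);

	/* teardown (sigint in bgp_main.c) */
	peer_connection_fifo_fini(&bm->connection_fifo);
	pthread_mutex_destroy(&bm->peer_connection_mtx);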