2023-02-08 13:17:09 +01:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2017-05-17 19:17:18 +02:00
|
|
|
/* BGP I/O.
|
2017-06-08 23:47:33 +02:00
|
|
|
* Implements packet I/O in a pthread.
|
2017-05-17 19:17:18 +02:00
|
|
|
* Copyright (C) 2017 Cumulus Networks
|
2017-06-08 23:47:33 +02:00
|
|
|
* Quentin Young
|
2017-04-18 20:11:43 +02:00
|
|
|
*/
|
|
|
|
|
2017-06-08 23:25:23 +02:00
|
|
|
/* clang-format off */
|
2017-06-01 18:26:49 +02:00
|
|
|
#include <zebra.h>
|
2017-06-08 23:25:23 +02:00
|
|
|
#include <pthread.h> // for pthread_mutex_unlock, pthread_mutex_lock
|
2019-10-04 20:52:24 +02:00
|
|
|
#include <sys/uio.h> // for writev
|
2017-04-18 20:11:43 +02:00
|
|
|
|
2018-09-12 21:23:52 +02:00
|
|
|
#include "frr_pthread.h"
|
2017-06-08 23:25:23 +02:00
|
|
|
#include "linklist.h" // for list_delete, list_delete_all_node, lis...
|
|
|
|
#include "log.h" // for zlog_debug, safe_strerror, zlog_err
|
|
|
|
#include "memory.h" // for MTYPE_TMP, XCALLOC, XFREE
|
|
|
|
#include "network.h" // for ERRNO_IO_RETRY
|
|
|
|
#include "stream.h" // for stream_get_endp, stream_getw_from, str...
|
2018-01-02 19:20:00 +01:00
|
|
|
#include "ringbuf.h" // for ringbuf_remain, ringbuf_peek, ringbuf_...
|
2023-03-07 20:22:48 +01:00
|
|
|
#include "frrevent.h" // for EVENT_OFF, EVENT_ARG, thread...
|
2017-04-18 20:11:43 +02:00
|
|
|
|
2017-06-01 18:26:49 +02:00
|
|
|
#include "bgpd/bgp_io.h"
|
2017-06-08 23:25:23 +02:00
|
|
|
#include "bgpd/bgp_debug.h" // for bgp_debug_neighbor_events, bgp_type_str
|
2018-06-15 23:08:53 +02:00
|
|
|
#include "bgpd/bgp_errors.h" // for expanded error reference information
|
2017-06-08 23:25:23 +02:00
|
|
|
#include "bgpd/bgp_fsm.h" // for BGP_EVENT_ADD, bgp_event
|
2022-09-08 22:14:36 +02:00
|
|
|
#include "bgpd/bgp_packet.h" // for bgp_notify_io_invalid...
|
2020-09-29 00:13:27 +02:00
|
|
|
#include "bgpd/bgp_trace.h" // for frrtraces
|
2017-06-08 23:25:23 +02:00
|
|
|
#include "bgpd/bgpd.h" // for peer, BGP_MARKER_SIZE, bgp_master, bm
|
|
|
|
/* clang-format on */
|
2017-04-18 20:11:43 +02:00
|
|
|
|
2017-05-02 02:37:45 +02:00
|
|
|
/*
 * forward declarations
 *
 * bgp_write() and bgp_read() return a uint16_t status that callers in this
 * file test against the BGP_IO_* flags with CHECK_FLAG(); bgp_read()
 * additionally takes an int out-parameter (code_p).
 * bgp_process_writes() and bgp_process_reads() are the handlers handed to
 * event_add_write() / event_add_read() by the *_on() functions below.
 * validate_header() is consulted by read_ibuf_work() before the packet length
 * is read out of the work buffer.
 */
|
2021-04-30 22:39:40 +02:00
|
|
|
static uint16_t bgp_write(struct peer_connection *connection);
|
|
|
|
static uint16_t bgp_read(struct peer_connection *connection, int *code_p);
|
2022-03-01 22:18:12 +01:00
|
|
|
static void bgp_process_writes(struct event *event);
|
|
|
|
static void bgp_process_reads(struct event *event);
|
2021-04-30 22:39:40 +02:00
|
|
|
static bool validate_header(struct peer_connection *connection);
|
2017-04-18 20:11:43 +02:00
|
|
|
|
2017-05-02 02:37:45 +02:00
|
|
|
/*
 * generic i/o status codes
 *
 * Bit flags carried in the uint16_t status produced by bgp_read() and
 * bgp_write(); the event handlers in this file test them with CHECK_FLAG().
 */
|
2022-10-24 23:35:04 +02:00
|
|
|
#define BGP_IO_TRANS_ERR (1 << 0) /* EAGAIN or similar occurred */
|
|
|
|
#define BGP_IO_FATAL_ERR (1 << 1) /* some kind of fatal TCP error */
|
|
|
|
#define BGP_IO_WORK_FULL_ERR (1 << 2) /* No room in work buffer */
|
2017-04-18 20:11:43 +02:00
|
|
|
|
2018-01-24 17:07:27 +01:00
|
|
|
/* Thread external API ----------------------------------------------------- */
|
2017-04-18 20:11:43 +02:00
|
|
|
|
2021-04-30 22:39:40 +02:00
|
|
|
void bgp_writes_on(struct peer_connection *connection)
|
2017-04-18 20:11:43 +02:00
|
|
|
{
|
2018-09-12 21:23:52 +02:00
|
|
|
struct frr_pthread *fpt = bgp_pth_io;
|
2021-04-30 22:39:40 +02:00
|
|
|
struct peer *peer = connection->peer;
|
|
|
|
|
2018-01-24 17:07:27 +01:00
|
|
|
assert(fpt->running);
|
2018-01-09 20:27:44 +01:00
|
|
|
|
2017-05-02 02:37:45 +02:00
|
|
|
assert(peer->status != Deleted);
|
2021-04-30 22:39:40 +02:00
|
|
|
assert(connection->obuf);
|
|
|
|
assert(connection->ibuf);
|
|
|
|
assert(connection->ibuf_work);
|
2017-09-25 04:18:15 +02:00
|
|
|
assert(!peer->t_connect_check_r);
|
|
|
|
assert(!peer->t_connect_check_w);
|
2021-04-30 22:39:40 +02:00
|
|
|
assert(connection->fd);
|
2017-04-18 20:11:43 +02:00
|
|
|
|
2021-04-30 22:39:40 +02:00
|
|
|
event_add_write(fpt->master, bgp_process_writes, connection,
|
2021-05-01 01:02:00 +02:00
|
|
|
connection->fd, &connection->t_write);
|
2021-06-03 21:20:11 +02:00
|
|
|
SET_FLAG(connection->thread_flags, PEER_THREAD_WRITES_ON);
|
2017-05-02 02:37:45 +02:00
|
|
|
}
|
2017-04-18 20:11:43 +02:00
|
|
|
|
2021-04-30 22:39:40 +02:00
|
|
|
void bgp_writes_off(struct peer_connection *connection)
|
2017-05-02 02:37:45 +02:00
|
|
|
{
|
2021-04-30 22:39:40 +02:00
|
|
|
struct peer *peer = connection->peer;
|
2018-09-12 21:23:52 +02:00
|
|
|
struct frr_pthread *fpt = bgp_pth_io;
|
2018-01-24 17:07:27 +01:00
|
|
|
assert(fpt->running);
|
2017-06-07 23:29:48 +02:00
|
|
|
|
2021-05-01 01:02:00 +02:00
|
|
|
event_cancel_async(fpt->master, &connection->t_write, NULL);
|
2022-12-25 16:26:52 +01:00
|
|
|
EVENT_OFF(peer->t_generate_updgrp_packets);
|
2017-04-18 20:11:43 +02:00
|
|
|
|
2021-06-03 21:20:11 +02:00
|
|
|
UNSET_FLAG(peer->connection.thread_flags, PEER_THREAD_WRITES_ON);
|
2017-04-18 20:11:43 +02:00
|
|
|
}
|
|
|
|
|
2021-04-30 22:39:40 +02:00
|
|
|
void bgp_reads_on(struct peer_connection *connection)
|
2017-04-18 20:11:43 +02:00
|
|
|
{
|
2021-04-30 22:39:40 +02:00
|
|
|
struct peer *peer = connection->peer;
|
2018-09-12 21:23:52 +02:00
|
|
|
struct frr_pthread *fpt = bgp_pth_io;
|
2018-01-24 17:07:27 +01:00
|
|
|
assert(fpt->running);
|
2018-01-09 20:27:44 +01:00
|
|
|
|
2017-05-02 02:37:45 +02:00
|
|
|
assert(peer->status != Deleted);
|
2021-04-30 22:39:40 +02:00
|
|
|
assert(connection->ibuf);
|
|
|
|
assert(connection->fd);
|
|
|
|
assert(connection->ibuf_work);
|
|
|
|
assert(connection->obuf);
|
2017-09-25 04:18:15 +02:00
|
|
|
assert(!peer->t_connect_check_r);
|
|
|
|
assert(!peer->t_connect_check_w);
|
2021-04-30 22:39:40 +02:00
|
|
|
assert(connection->fd);
|
2017-05-02 02:37:45 +02:00
|
|
|
|
2021-04-30 22:39:40 +02:00
|
|
|
event_add_read(fpt->master, bgp_process_reads, connection,
|
2021-05-01 01:02:00 +02:00
|
|
|
connection->fd, &connection->t_read);
|
2017-06-08 23:14:18 +02:00
|
|
|
|
2021-06-03 21:20:11 +02:00
|
|
|
SET_FLAG(connection->thread_flags, PEER_THREAD_READS_ON);
|
2017-04-18 20:11:43 +02:00
|
|
|
}
|
|
|
|
|
2021-04-30 22:39:40 +02:00
|
|
|
void bgp_reads_off(struct peer_connection *connection)
|
2017-04-18 20:11:43 +02:00
|
|
|
{
|
2021-04-30 22:39:40 +02:00
|
|
|
struct peer *peer = connection->peer;
|
2018-09-12 21:23:52 +02:00
|
|
|
struct frr_pthread *fpt = bgp_pth_io;
|
2018-01-24 17:07:27 +01:00
|
|
|
assert(fpt->running);
|
2017-06-07 23:29:48 +02:00
|
|
|
|
2021-05-01 01:02:00 +02:00
|
|
|
event_cancel_async(fpt->master, &connection->t_read, NULL);
|
2022-12-25 16:26:52 +01:00
|
|
|
EVENT_OFF(peer->t_process_packet);
|
|
|
|
EVENT_OFF(peer->t_process_packet_error);
|
2017-04-18 20:11:43 +02:00
|
|
|
|
2021-06-03 21:20:11 +02:00
|
|
|
UNSET_FLAG(connection->thread_flags, PEER_THREAD_READS_ON);
|
2017-04-18 20:11:43 +02:00
|
|
|
}
|
|
|
|
|
2018-01-24 17:07:27 +01:00
|
|
|
/* Thread internal functions ----------------------------------------------- */
|
2017-06-08 23:47:33 +02:00
|
|
|
|
2018-01-24 17:07:27 +01:00
|
|
|
/*
|
2017-06-08 23:47:33 +02:00
|
|
|
* Called from I/O pthread when a file descriptor has become ready for writing.
|
2017-05-02 02:37:45 +02:00
|
|
|
*/
|
2022-03-01 22:18:12 +01:00
|
|
|
static void bgp_process_writes(struct event *thread)
|
2017-04-18 20:11:43 +02:00
|
|
|
{
|
2017-05-02 02:37:45 +02:00
|
|
|
static struct peer *peer;
|
2021-04-30 22:39:40 +02:00
|
|
|
struct peer_connection *connection = EVENT_ARG(thread);
|
2017-05-02 02:37:45 +02:00
|
|
|
uint16_t status;
|
2017-06-08 23:14:18 +02:00
|
|
|
bool reschedule;
|
2017-06-12 23:16:40 +02:00
|
|
|
bool fatal = false;
|
2017-05-02 02:37:45 +02:00
|
|
|
|
2021-04-30 22:39:40 +02:00
|
|
|
peer = connection->peer;
|
|
|
|
|
|
|
|
if (connection->fd < 0)
|
2022-02-23 01:04:25 +01:00
|
|
|
return;
|
2017-05-02 02:37:45 +02:00
|
|
|
|
2018-09-12 21:23:52 +02:00
|
|
|
struct frr_pthread *fpt = bgp_pth_io;
|
2017-05-02 02:37:45 +02:00
|
|
|
|
2021-04-30 22:39:40 +02:00
|
|
|
frr_with_mutex (&connection->io_mtx) {
|
|
|
|
status = bgp_write(connection);
|
|
|
|
reschedule = (stream_fifo_head(connection->obuf) != NULL);
|
2017-05-02 02:37:45 +02:00
|
|
|
}
|
2017-04-18 20:11:43 +02:00
|
|
|
|
2018-01-24 17:07:27 +01:00
|
|
|
/* no problem */
|
|
|
|
if (CHECK_FLAG(status, BGP_IO_TRANS_ERR)) {
|
2017-04-18 20:11:43 +02:00
|
|
|
}
|
|
|
|
|
2018-01-24 17:07:27 +01:00
|
|
|
/* problem */
|
2017-06-12 23:16:40 +02:00
|
|
|
if (CHECK_FLAG(status, BGP_IO_FATAL_ERR)) {
|
2018-01-24 17:07:27 +01:00
|
|
|
reschedule = false;
|
2017-06-12 23:16:40 +02:00
|
|
|
fatal = true;
|
|
|
|
}
|
2017-05-02 02:37:45 +02:00
|
|
|
|
2020-11-06 04:25:56 +01:00
|
|
|
/* If suppress fib pending is enabled, route is advertised to peers when
|
|
|
|
* the status is received from the FIB. The delay is added
|
|
|
|
* to update group packet generate which will allow more routes to be
|
|
|
|
* sent in the update message
|
|
|
|
*/
|
2017-05-02 02:37:45 +02:00
|
|
|
if (reschedule) {
|
2021-04-30 22:39:40 +02:00
|
|
|
event_add_write(fpt->master, bgp_process_writes, connection,
|
2021-05-01 01:02:00 +02:00
|
|
|
connection->fd, &connection->t_write);
|
2017-11-10 23:03:58 +01:00
|
|
|
} else if (!fatal) {
|
2020-11-06 04:25:56 +01:00
|
|
|
BGP_UPDATE_GROUP_TIMER_ON(&peer->t_generate_updgrp_packets,
|
|
|
|
bgp_generate_updgrp_packets);
|
2017-05-02 02:37:45 +02:00
|
|
|
}
|
2017-04-18 20:11:43 +02:00
|
|
|
}
|
|
|
|
|
2021-04-30 22:39:40 +02:00
|
|
|
static int read_ibuf_work(struct peer_connection *connection)
|
bgpd,doc: limit InQ buf to allow for back pressure
Add a default limit to the InQ for messages off the bgp peer
socket. Make the limit configurable via cli.
Adding in this limit causes the messages to be retained in the tcp
socket and allow for tcp back pressure and congestion control to kick
in.
Before this change, we allow the InQ to grow indefinitely just taking
messages off the socket and adding them to the fifo queue, never letting
the kernel know we need to slow down. We were seeing under high loads of
messages and large perf-heavy routemaps (regex matching) this queue
would cause a memory spike and BGP would get OOM killed. Modifying this
leaves the messages in the socket and distributes that load where it
should be in the socket buffers on both send/recv while we handle the
mesages.
Also, changes were made to allow the ringbuffer to hold messages and
continue to be filled by the IO pthread while we wait for the Main
pthread to handle the work on the InQ.
Memory spike seen with large numbers of routes flapping and route-maps
with dozens of regex matching:
```
Memory statistics for bgpd:
System allocator statistics:
Total heap allocated: > 2GB
Holding block headers: 516 KiB
Used small blocks: 0 bytes
Used ordinary blocks: 160 MiB
Free small blocks: 3680 bytes
Free ordinary blocks: > 2GB
Ordinary blocks: 121244
Small blocks: 83
Holding blocks: 1
```
With most of it being held by the inQ (seen from the stream datastructure info here):
```
Type : Current# Size Total Max# MaxBytes
...
...
Stream : 115543 variable 26963208 15970740 3571708768
```
With this change that memory is capped and load is left in the sockets:
RECV Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 265350 0 [fe80::4080:30ff:feb0:cee3]%veth1:36950 [fe80::4c14:9cff:fe1d:5bfd]:179 users:(("bgpd",pid=1393334,fd=26))
skmem:(r403688,rb425984,t0,tb425984,f1816,w0,o0,bl0,d61)
```
SEND Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 0 1275012 [fe80::4c14:9cff:fe1d:5bfd]%veth1:179 [fe80::4080:30ff:feb0:cee3]:36950 users:(("bgpd",pid=1393443,fd=27))
skmem:(r0,rb131072,t0,tb1453568,f1916,w1300612,o0,bl0,d0)
```
Signed-off-by: Stephen Worley <sworley@nvidia.com>
2022-10-21 18:45:50 +02:00
|
|
|
{
|
|
|
|
/* static buffer for transferring packets */
|
|
|
|
/* shorter alias to peer's input buffer */
|
2021-04-30 22:39:40 +02:00
|
|
|
struct ringbuf *ibw = connection->ibuf_work;
|
bgpd,doc: limit InQ buf to allow for back pressure
Add a default limit to the InQ for messages off the bgp peer
socket. Make the limit configurable via cli.
Adding in this limit causes the messages to be retained in the tcp
socket and allow for tcp back pressure and congestion control to kick
in.
Before this change, we allow the InQ to grow indefinitely just taking
messages off the socket and adding them to the fifo queue, never letting
the kernel know we need to slow down. We were seeing under high loads of
messages and large perf-heavy routemaps (regex matching) this queue
would cause a memory spike and BGP would get OOM killed. Modifying this
leaves the messages in the socket and distributes that load where it
should be in the socket buffers on both send/recv while we handle the
mesages.
Also, changes were made to allow the ringbuffer to hold messages and
continue to be filled by the IO pthread while we wait for the Main
pthread to handle the work on the InQ.
Memory spike seen with large numbers of routes flapping and route-maps
with dozens of regex matching:
```
Memory statistics for bgpd:
System allocator statistics:
Total heap allocated: > 2GB
Holding block headers: 516 KiB
Used small blocks: 0 bytes
Used ordinary blocks: 160 MiB
Free small blocks: 3680 bytes
Free ordinary blocks: > 2GB
Ordinary blocks: 121244
Small blocks: 83
Holding blocks: 1
```
With most of it being held by the inQ (seen from the stream datastructure info here):
```
Type : Current# Size Total Max# MaxBytes
...
...
Stream : 115543 variable 26963208 15970740 3571708768
```
With this change that memory is capped and load is left in the sockets:
RECV Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 265350 0 [fe80::4080:30ff:feb0:cee3]%veth1:36950 [fe80::4c14:9cff:fe1d:5bfd]:179 users:(("bgpd",pid=1393334,fd=26))
skmem:(r403688,rb425984,t0,tb425984,f1816,w0,o0,bl0,d61)
```
SEND Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 0 1275012 [fe80::4c14:9cff:fe1d:5bfd]%veth1:179 [fe80::4080:30ff:feb0:cee3]:36950 users:(("bgpd",pid=1393443,fd=27))
skmem:(r0,rb131072,t0,tb1453568,f1916,w1300612,o0,bl0,d0)
```
Signed-off-by: Stephen Worley <sworley@nvidia.com>
2022-10-21 18:45:50 +02:00
|
|
|
/* packet size as given by header */
|
|
|
|
uint16_t pktsize = 0;
|
|
|
|
struct stream *pkt;
|
|
|
|
|
|
|
|
/* ============================================== */
|
2021-04-30 22:39:40 +02:00
|
|
|
frr_with_mutex (&connection->io_mtx) {
|
|
|
|
if (connection->ibuf->count >= bm->inq_limit)
|
2023-02-02 20:13:12 +01:00
|
|
|
return -ENOMEM;
|
|
|
|
}
|
bgpd,doc: limit InQ buf to allow for back pressure
Add a default limit to the InQ for messages off the bgp peer
socket. Make the limit configurable via cli.
Adding in this limit causes the messages to be retained in the tcp
socket and allow for tcp back pressure and congestion control to kick
in.
Before this change, we allow the InQ to grow indefinitely just taking
messages off the socket and adding them to the fifo queue, never letting
the kernel know we need to slow down. We were seeing under high loads of
messages and large perf-heavy routemaps (regex matching) this queue
would cause a memory spike and BGP would get OOM killed. Modifying this
leaves the messages in the socket and distributes that load where it
should be in the socket buffers on both send/recv while we handle the
mesages.
Also, changes were made to allow the ringbuffer to hold messages and
continue to be filled by the IO pthread while we wait for the Main
pthread to handle the work on the InQ.
Memory spike seen with large numbers of routes flapping and route-maps
with dozens of regex matching:
```
Memory statistics for bgpd:
System allocator statistics:
Total heap allocated: > 2GB
Holding block headers: 516 KiB
Used small blocks: 0 bytes
Used ordinary blocks: 160 MiB
Free small blocks: 3680 bytes
Free ordinary blocks: > 2GB
Ordinary blocks: 121244
Small blocks: 83
Holding blocks: 1
```
With most of it being held by the inQ (seen from the stream datastructure info here):
```
Type : Current# Size Total Max# MaxBytes
...
...
Stream : 115543 variable 26963208 15970740 3571708768
```
With this change that memory is capped and load is left in the sockets:
RECV Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 265350 0 [fe80::4080:30ff:feb0:cee3]%veth1:36950 [fe80::4c14:9cff:fe1d:5bfd]:179 users:(("bgpd",pid=1393334,fd=26))
skmem:(r403688,rb425984,t0,tb425984,f1816,w0,o0,bl0,d61)
```
SEND Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 0 1275012 [fe80::4c14:9cff:fe1d:5bfd]%veth1:179 [fe80::4080:30ff:feb0:cee3]:36950 users:(("bgpd",pid=1393443,fd=27))
skmem:(r0,rb131072,t0,tb1453568,f1916,w1300612,o0,bl0,d0)
```
Signed-off-by: Stephen Worley <sworley@nvidia.com>
2022-10-21 18:45:50 +02:00
|
|
|
|
|
|
|
/* check that we have enough data for a header */
|
|
|
|
if (ringbuf_remain(ibw) < BGP_HEADER_SIZE)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* check that header is valid */
|
2021-04-30 22:39:40 +02:00
|
|
|
if (!validate_header(connection))
|
bgpd,doc: limit InQ buf to allow for back pressure
Add a default limit to the InQ for messages off the bgp peer
socket. Make the limit configurable via cli.
Adding in this limit causes the messages to be retained in the tcp
socket and allow for tcp back pressure and congestion control to kick
in.
Before this change, we allow the InQ to grow indefinitely just taking
messages off the socket and adding them to the fifo queue, never letting
the kernel know we need to slow down. We were seeing under high loads of
messages and large perf-heavy routemaps (regex matching) this queue
would cause a memory spike and BGP would get OOM killed. Modifying this
leaves the messages in the socket and distributes that load where it
should be in the socket buffers on both send/recv while we handle the
mesages.
Also, changes were made to allow the ringbuffer to hold messages and
continue to be filled by the IO pthread while we wait for the Main
pthread to handle the work on the InQ.
Memory spike seen with large numbers of routes flapping and route-maps
with dozens of regex matching:
```
Memory statistics for bgpd:
System allocator statistics:
Total heap allocated: > 2GB
Holding block headers: 516 KiB
Used small blocks: 0 bytes
Used ordinary blocks: 160 MiB
Free small blocks: 3680 bytes
Free ordinary blocks: > 2GB
Ordinary blocks: 121244
Small blocks: 83
Holding blocks: 1
```
With most of it being held by the inQ (seen from the stream datastructure info here):
```
Type : Current# Size Total Max# MaxBytes
...
...
Stream : 115543 variable 26963208 15970740 3571708768
```
With this change that memory is capped and load is left in the sockets:
RECV Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 265350 0 [fe80::4080:30ff:feb0:cee3]%veth1:36950 [fe80::4c14:9cff:fe1d:5bfd]:179 users:(("bgpd",pid=1393334,fd=26))
skmem:(r403688,rb425984,t0,tb425984,f1816,w0,o0,bl0,d61)
```
SEND Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 0 1275012 [fe80::4c14:9cff:fe1d:5bfd]%veth1:179 [fe80::4080:30ff:feb0:cee3]:36950 users:(("bgpd",pid=1393443,fd=27))
skmem:(r0,rb131072,t0,tb1453568,f1916,w1300612,o0,bl0,d0)
```
Signed-off-by: Stephen Worley <sworley@nvidia.com>
2022-10-21 18:45:50 +02:00
|
|
|
return -EBADMSG;
|
|
|
|
|
|
|
|
/* header is valid; retrieve packet size */
|
|
|
|
ringbuf_peek(ibw, BGP_MARKER_SIZE, &pktsize, sizeof(pktsize));
|
|
|
|
|
|
|
|
pktsize = ntohs(pktsize);
|
|
|
|
|
|
|
|
/* if this fails we are seriously screwed */
|
2021-04-30 22:39:40 +02:00
|
|
|
assert(pktsize <= connection->peer->max_packet_size);
|
bgpd,doc: limit InQ buf to allow for back pressure
Add a default limit to the InQ for messages off the bgp peer
socket. Make the limit configurable via cli.
Adding in this limit causes the messages to be retained in the tcp
socket and allow for tcp back pressure and congestion control to kick
in.
Before this change, we allow the InQ to grow indefinitely just taking
messages off the socket and adding them to the fifo queue, never letting
the kernel know we need to slow down. We were seeing under high loads of
messages and large perf-heavy routemaps (regex matching) this queue
would cause a memory spike and BGP would get OOM killed. Modifying this
leaves the messages in the socket and distributes that load where it
should be in the socket buffers on both send/recv while we handle the
mesages.
Also, changes were made to allow the ringbuffer to hold messages and
continue to be filled by the IO pthread while we wait for the Main
pthread to handle the work on the InQ.
Memory spike seen with large numbers of routes flapping and route-maps
with dozens of regex matching:
```
Memory statistics for bgpd:
System allocator statistics:
Total heap allocated: > 2GB
Holding block headers: 516 KiB
Used small blocks: 0 bytes
Used ordinary blocks: 160 MiB
Free small blocks: 3680 bytes
Free ordinary blocks: > 2GB
Ordinary blocks: 121244
Small blocks: 83
Holding blocks: 1
```
With most of it being held by the inQ (seen from the stream datastructure info here):
```
Type : Current# Size Total Max# MaxBytes
...
...
Stream : 115543 variable 26963208 15970740 3571708768
```
With this change that memory is capped and load is left in the sockets:
RECV Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 265350 0 [fe80::4080:30ff:feb0:cee3]%veth1:36950 [fe80::4c14:9cff:fe1d:5bfd]:179 users:(("bgpd",pid=1393334,fd=26))
skmem:(r403688,rb425984,t0,tb425984,f1816,w0,o0,bl0,d61)
```
SEND Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 0 1275012 [fe80::4c14:9cff:fe1d:5bfd]%veth1:179 [fe80::4080:30ff:feb0:cee3]:36950 users:(("bgpd",pid=1393443,fd=27))
skmem:(r0,rb131072,t0,tb1453568,f1916,w1300612,o0,bl0,d0)
```
Signed-off-by: Stephen Worley <sworley@nvidia.com>
2022-10-21 18:45:50 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we have that much data, chuck it into its own
|
|
|
|
* stream and append to input queue for processing.
|
|
|
|
*
|
|
|
|
* Otherwise, come back later.
|
|
|
|
*/
|
|
|
|
if (ringbuf_remain(ibw) < pktsize)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
pkt = stream_new(pktsize);
|
|
|
|
assert(STREAM_WRITEABLE(pkt) == pktsize);
|
|
|
|
assert(ringbuf_get(ibw, pkt->data, pktsize) == pktsize);
|
|
|
|
stream_set_endp(pkt, pktsize);
|
|
|
|
|
2021-04-30 22:39:40 +02:00
|
|
|
frrtrace(2, frr_bgp, packet_read, connection->peer, pkt);
|
|
|
|
frr_with_mutex (&connection->io_mtx) {
|
|
|
|
stream_fifo_push(connection->ibuf, pkt);
|
2023-02-02 20:13:12 +01:00
|
|
|
}
|
bgpd,doc: limit InQ buf to allow for back pressure
Add a default limit to the InQ for messages off the bgp peer
socket. Make the limit configurable via cli.
Adding in this limit causes the messages to be retained in the tcp
socket and allow for tcp back pressure and congestion control to kick
in.
Before this change, we allow the InQ to grow indefinitely just taking
messages off the socket and adding them to the fifo queue, never letting
the kernel know we need to slow down. We were seeing under high loads of
messages and large perf-heavy routemaps (regex matching) this queue
would cause a memory spike and BGP would get OOM killed. Modifying this
leaves the messages in the socket and distributes that load where it
should be in the socket buffers on both send/recv while we handle the
mesages.
Also, changes were made to allow the ringbuffer to hold messages and
continue to be filled by the IO pthread while we wait for the Main
pthread to handle the work on the InQ.
Memory spike seen with large numbers of routes flapping and route-maps
with dozens of regex matching:
```
Memory statistics for bgpd:
System allocator statistics:
Total heap allocated: > 2GB
Holding block headers: 516 KiB
Used small blocks: 0 bytes
Used ordinary blocks: 160 MiB
Free small blocks: 3680 bytes
Free ordinary blocks: > 2GB
Ordinary blocks: 121244
Small blocks: 83
Holding blocks: 1
```
With most of it being held by the inQ (seen from the stream datastructure info here):
```
Type : Current# Size Total Max# MaxBytes
...
...
Stream : 115543 variable 26963208 15970740 3571708768
```
With this change that memory is capped and load is left in the sockets:
RECV Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 265350 0 [fe80::4080:30ff:feb0:cee3]%veth1:36950 [fe80::4c14:9cff:fe1d:5bfd]:179 users:(("bgpd",pid=1393334,fd=26))
skmem:(r403688,rb425984,t0,tb425984,f1816,w0,o0,bl0,d61)
```
SEND Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 0 1275012 [fe80::4c14:9cff:fe1d:5bfd]%veth1:179 [fe80::4080:30ff:feb0:cee3]:36950 users:(("bgpd",pid=1393443,fd=27))
skmem:(r0,rb131072,t0,tb1453568,f1916,w1300612,o0,bl0,d0)
```
Signed-off-by: Stephen Worley <sworley@nvidia.com>
2022-10-21 18:45:50 +02:00
|
|
|
|
|
|
|
return pktsize;
|
|
|
|
}
|
|
|
|
|
2018-01-24 17:07:27 +01:00
|
|
|
/*
|
2017-06-08 23:47:33 +02:00
|
|
|
* Called from I/O pthread when a file descriptor has become ready for reading,
|
|
|
|
* or has hung up.
|
2017-06-02 03:52:39 +02:00
|
|
|
*
|
|
|
|
* We read as much data as possible, process as many packets as we can and
|
2021-04-30 20:55:40 +02:00
|
|
|
* place them on peer->connection.ibuf for secondary processing by the main
|
|
|
|
* thread.
|
2017-04-18 20:11:43 +02:00
|
|
|
*/
|
2022-03-01 22:18:12 +01:00
|
|
|
static void bgp_process_reads(struct event *thread)
|
2017-04-18 20:11:43 +02:00
|
|
|
{
|
2017-06-16 22:15:31 +02:00
|
|
|
/* clang-format off */
|
2021-04-30 22:39:40 +02:00
|
|
|
struct peer_connection *connection = EVENT_ARG(thread);
|
2022-10-24 23:35:04 +02:00
|
|
|
static struct peer *peer; /* peer to read from */
|
|
|
|
uint16_t status; /* bgp_read status code */
|
|
|
|
bool fatal = false; /* whether fatal error occurred */
|
2021-04-30 20:55:40 +02:00
|
|
|
bool added_pkt = false; /* whether we pushed onto ->connection.ibuf */
|
2022-10-24 23:35:04 +02:00
|
|
|
int code = 0; /* FSM code if error occurred */
|
|
|
|
static bool ibuf_full_logged; /* Have we logged full already */
|
bgpd,doc: limit InQ buf to allow for back pressure
Add a default limit to the InQ for messages off the bgp peer
socket. Make the limit configurable via cli.
Adding in this limit causes the messages to be retained in the tcp
socket and allow for tcp back pressure and congestion control to kick
in.
Before this change, we allow the InQ to grow indefinitely just taking
messages off the socket and adding them to the fifo queue, never letting
the kernel know we need to slow down. We were seeing under high loads of
messages and large perf-heavy routemaps (regex matching) this queue
would cause a memory spike and BGP would get OOM killed. Modifying this
leaves the messages in the socket and distributes that load where it
should be in the socket buffers on both send/recv while we handle the
messages.
Also, changes were made to allow the ringbuffer to hold messages and
continue to be filled by the IO pthread while we wait for the Main
pthread to handle the work on the InQ.
Memory spike seen with large numbers of routes flapping and route-maps
with dozens of regex matching:
```
Memory statistics for bgpd:
System allocator statistics:
Total heap allocated: > 2GB
Holding block headers: 516 KiB
Used small blocks: 0 bytes
Used ordinary blocks: 160 MiB
Free small blocks: 3680 bytes
Free ordinary blocks: > 2GB
Ordinary blocks: 121244
Small blocks: 83
Holding blocks: 1
```
With most of it being held by the inQ (seen from the stream datastructure info here):
```
Type : Current# Size Total Max# MaxBytes
...
...
Stream : 115543 variable 26963208 15970740 3571708768
```
With this change that memory is capped and load is left in the sockets:
RECV Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 265350 0 [fe80::4080:30ff:feb0:cee3]%veth1:36950 [fe80::4c14:9cff:fe1d:5bfd]:179 users:(("bgpd",pid=1393334,fd=26))
skmem:(r403688,rb425984,t0,tb425984,f1816,w0,o0,bl0,d61)
```
SEND Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 0 1275012 [fe80::4c14:9cff:fe1d:5bfd]%veth1:179 [fe80::4080:30ff:feb0:cee3]:36950 users:(("bgpd",pid=1393443,fd=27))
skmem:(r0,rb131072,t0,tb1453568,f1916,w1300612,o0,bl0,d0)
```
Signed-off-by: Stephen Worley <sworley@nvidia.com>
2022-10-21 18:45:50 +02:00
|
|
|
int ret = 1;
|
2017-06-16 22:15:31 +02:00
|
|
|
/* clang-format on */
|
2017-06-02 03:52:39 +02:00
|
|
|
|
2021-04-30 22:39:40 +02:00
|
|
|
peer = connection->peer;
|
2017-05-02 02:37:45 +02:00
|
|
|
|
2021-04-30 22:39:40 +02:00
|
|
|
if (bm->terminating || connection->fd < 0)
|
2022-02-23 01:04:25 +01:00
|
|
|
return;
|
2017-05-02 02:37:45 +02:00
|
|
|
|
2018-09-12 21:23:52 +02:00
|
|
|
struct frr_pthread *fpt = bgp_pth_io;
|
2017-05-02 02:37:45 +02:00
|
|
|
|
2021-04-30 22:39:40 +02:00
|
|
|
frr_with_mutex (&connection->io_mtx) {
|
|
|
|
status = bgp_read(connection, &code);
|
2017-05-02 02:37:45 +02:00
|
|
|
}
|
|
|
|
|
2017-06-02 03:52:39 +02:00
|
|
|
/* error checking phase */
|
|
|
|
if (CHECK_FLAG(status, BGP_IO_TRANS_ERR)) {
|
|
|
|
/* no problem; just don't process packets */
|
bgpd,doc: limit InQ buf to allow for back pressure
Add a default limit to the InQ for messages off the bgp peer
socket. Make the limit configurable via cli.
Adding in this limit causes the messages to be retained in the tcp
socket and allow for tcp back pressure and congestion control to kick
in.
Before this change, we allow the InQ to grow indefinitely just taking
messages off the socket and adding them to the fifo queue, never letting
the kernel know we need to slow down. We were seeing under high loads of
messages and large perf-heavy routemaps (regex matching) this queue
would cause a memory spike and BGP would get OOM killed. Modifying this
leaves the messages in the socket and distributes that load where it
should be in the socket buffers on both send/recv while we handle the
messages.
Also, changes were made to allow the ringbuffer to hold messages and
continue to be filled by the IO pthread while we wait for the Main
pthread to handle the work on the InQ.
Memory spike seen with large numbers of routes flapping and route-maps
with dozens of regex matching:
```
Memory statistics for bgpd:
System allocator statistics:
Total heap allocated: > 2GB
Holding block headers: 516 KiB
Used small blocks: 0 bytes
Used ordinary blocks: 160 MiB
Free small blocks: 3680 bytes
Free ordinary blocks: > 2GB
Ordinary blocks: 121244
Small blocks: 83
Holding blocks: 1
```
With most of it being held by the inQ (seen from the stream datastructure info here):
```
Type : Current# Size Total Max# MaxBytes
...
...
Stream : 115543 variable 26963208 15970740 3571708768
```
With this change that memory is capped and load is left in the sockets:
RECV Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 265350 0 [fe80::4080:30ff:feb0:cee3]%veth1:36950 [fe80::4c14:9cff:fe1d:5bfd]:179 users:(("bgpd",pid=1393334,fd=26))
skmem:(r403688,rb425984,t0,tb425984,f1816,w0,o0,bl0,d61)
```
SEND Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 0 1275012 [fe80::4c14:9cff:fe1d:5bfd]%veth1:179 [fe80::4080:30ff:feb0:cee3]:36950 users:(("bgpd",pid=1393443,fd=27))
skmem:(r0,rb131072,t0,tb1453568,f1916,w1300612,o0,bl0,d0)
```
Signed-off-by: Stephen Worley <sworley@nvidia.com>
2022-10-21 18:45:50 +02:00
|
|
|
goto done;
|
2017-06-02 03:52:39 +02:00
|
|
|
}
|
2017-05-02 02:37:45 +02:00
|
|
|
|
2017-06-02 03:52:39 +02:00
|
|
|
if (CHECK_FLAG(status, BGP_IO_FATAL_ERR)) {
|
|
|
|
/* problem; tear down session */
|
|
|
|
fatal = true;
|
2021-03-09 17:13:41 +01:00
|
|
|
|
|
|
|
/* Handle the error in the main pthread, include the
|
|
|
|
* specific state change from 'bgp_read'.
|
|
|
|
*/
|
2021-04-30 22:39:40 +02:00
|
|
|
event_add_event(bm->master, bgp_packet_process_error,
|
|
|
|
connection, code, &peer->t_process_packet_error);
|
bgpd,doc: limit InQ buf to allow for back pressure
Add a default limit to the InQ for messages off the bgp peer
socket. Make the limit configurable via cli.
Adding in this limit causes the messages to be retained in the tcp
socket and allow for tcp back pressure and congestion control to kick
in.
Before this change, we allow the InQ to grow indefinitely just taking
messages off the socket and adding them to the fifo queue, never letting
the kernel know we need to slow down. We were seeing under high loads of
messages and large perf-heavy routemaps (regex matching) this queue
would cause a memory spike and BGP would get OOM killed. Modifying this
leaves the messages in the socket and distributes that load where it
should be in the socket buffers on both send/recv while we handle the
messages.
Also, changes were made to allow the ringbuffer to hold messages and
continue to be filled by the IO pthread while we wait for the Main
pthread to handle the work on the InQ.
Memory spike seen with large numbers of routes flapping and route-maps
with dozens of regex matching:
```
Memory statistics for bgpd:
System allocator statistics:
Total heap allocated: > 2GB
Holding block headers: 516 KiB
Used small blocks: 0 bytes
Used ordinary blocks: 160 MiB
Free small blocks: 3680 bytes
Free ordinary blocks: > 2GB
Ordinary blocks: 121244
Small blocks: 83
Holding blocks: 1
```
With most of it being held by the inQ (seen from the stream datastructure info here):
```
Type : Current# Size Total Max# MaxBytes
...
...
Stream : 115543 variable 26963208 15970740 3571708768
```
With this change that memory is capped and load is left in the sockets:
RECV Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 265350 0 [fe80::4080:30ff:feb0:cee3]%veth1:36950 [fe80::4c14:9cff:fe1d:5bfd]:179 users:(("bgpd",pid=1393334,fd=26))
skmem:(r403688,rb425984,t0,tb425984,f1816,w0,o0,bl0,d61)
```
SEND Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 0 1275012 [fe80::4c14:9cff:fe1d:5bfd]%veth1:179 [fe80::4080:30ff:feb0:cee3]:36950 users:(("bgpd",pid=1393443,fd=27))
skmem:(r0,rb131072,t0,tb1453568,f1916,w1300612,o0,bl0,d0)
```
Signed-off-by: Stephen Worley <sworley@nvidia.com>
2022-10-21 18:45:50 +02:00
|
|
|
goto done;
|
2017-04-18 20:11:43 +02:00
|
|
|
}
|
|
|
|
|
bgpd,doc: limit InQ buf to allow for back pressure
Add a default limit to the InQ for messages off the bgp peer
socket. Make the limit configurable via cli.
Adding in this limit causes the messages to be retained in the tcp
socket and allow for tcp back pressure and congestion control to kick
in.
Before this change, we allow the InQ to grow indefinitely just taking
messages off the socket and adding them to the fifo queue, never letting
the kernel know we need to slow down. We were seeing under high loads of
messages and large perf-heavy routemaps (regex matching) this queue
would cause a memory spike and BGP would get OOM killed. Modifying this
leaves the messages in the socket and distributes that load where it
should be in the socket buffers on both send/recv while we handle the
messages.
Also, changes were made to allow the ringbuffer to hold messages and
continue to be filled by the IO pthread while we wait for the Main
pthread to handle the work on the InQ.
Memory spike seen with large numbers of routes flapping and route-maps
with dozens of regex matching:
```
Memory statistics for bgpd:
System allocator statistics:
Total heap allocated: > 2GB
Holding block headers: 516 KiB
Used small blocks: 0 bytes
Used ordinary blocks: 160 MiB
Free small blocks: 3680 bytes
Free ordinary blocks: > 2GB
Ordinary blocks: 121244
Small blocks: 83
Holding blocks: 1
```
With most of it being held by the inQ (seen from the stream datastructure info here):
```
Type : Current# Size Total Max# MaxBytes
...
...
Stream : 115543 variable 26963208 15970740 3571708768
```
With this change that memory is capped and load is left in the sockets:
RECV Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 265350 0 [fe80::4080:30ff:feb0:cee3]%veth1:36950 [fe80::4c14:9cff:fe1d:5bfd]:179 users:(("bgpd",pid=1393334,fd=26))
skmem:(r403688,rb425984,t0,tb425984,f1816,w0,o0,bl0,d61)
```
SEND Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 0 1275012 [fe80::4c14:9cff:fe1d:5bfd]%veth1:179 [fe80::4080:30ff:feb0:cee3]:36950 users:(("bgpd",pid=1393443,fd=27))
skmem:(r0,rb131072,t0,tb1453568,f1916,w1300612,o0,bl0,d0)
```
Signed-off-by: Stephen Worley <sworley@nvidia.com>
2022-10-21 18:45:50 +02:00
|
|
|
while (true) {
|
2021-04-30 22:39:40 +02:00
|
|
|
ret = read_ibuf_work(connection);
|
bgpd,doc: limit InQ buf to allow for back pressure
Add a default limit to the InQ for messages off the bgp peer
socket. Make the limit configurable via cli.
Adding in this limit causes the messages to be retained in the tcp
socket and allow for tcp back pressure and congestion control to kick
in.
Before this change, we allow the InQ to grow indefinitely just taking
messages off the socket and adding them to the fifo queue, never letting
the kernel know we need to slow down. We were seeing under high loads of
messages and large perf-heavy routemaps (regex matching) this queue
would cause a memory spike and BGP would get OOM killed. Modifying this
leaves the messages in the socket and distributes that load where it
should be in the socket buffers on both send/recv while we handle the
messages.
Also, changes were made to allow the ringbuffer to hold messages and
continue to be filled by the IO pthread while we wait for the Main
pthread to handle the work on the InQ.
Memory spike seen with large numbers of routes flapping and route-maps
with dozens of regex matching:
```
Memory statistics for bgpd:
System allocator statistics:
Total heap allocated: > 2GB
Holding block headers: 516 KiB
Used small blocks: 0 bytes
Used ordinary blocks: 160 MiB
Free small blocks: 3680 bytes
Free ordinary blocks: > 2GB
Ordinary blocks: 121244
Small blocks: 83
Holding blocks: 1
```
With most of it being held by the inQ (seen from the stream datastructure info here):
```
Type : Current# Size Total Max# MaxBytes
...
...
Stream : 115543 variable 26963208 15970740 3571708768
```
With this change that memory is capped and load is left in the sockets:
RECV Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 265350 0 [fe80::4080:30ff:feb0:cee3]%veth1:36950 [fe80::4c14:9cff:fe1d:5bfd]:179 users:(("bgpd",pid=1393334,fd=26))
skmem:(r403688,rb425984,t0,tb425984,f1816,w0,o0,bl0,d61)
```
SEND Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 0 1275012 [fe80::4c14:9cff:fe1d:5bfd]%veth1:179 [fe80::4080:30ff:feb0:cee3]:36950 users:(("bgpd",pid=1393443,fd=27))
skmem:(r0,rb131072,t0,tb1453568,f1916,w1300612,o0,bl0,d0)
```
Signed-off-by: Stephen Worley <sworley@nvidia.com>
2022-10-21 18:45:50 +02:00
|
|
|
if (ret <= 0)
|
2017-06-02 03:52:39 +02:00
|
|
|
break;
|
2017-05-02 02:37:45 +02:00
|
|
|
|
bgpd,doc: limit InQ buf to allow for back pressure
Add a default limit to the InQ for messages off the bgp peer
socket. Make the limit configurable via cli.
Adding in this limit causes the messages to be retained in the tcp
socket and allow for tcp back pressure and congestion control to kick
in.
Before this change, we allow the InQ to grow indefinitely just taking
messages off the socket and adding them to the fifo queue, never letting
the kernel know we need to slow down. We were seeing under high loads of
messages and large perf-heavy routemaps (regex matching) this queue
would cause a memory spike and BGP would get OOM killed. Modifying this
leaves the messages in the socket and distributes that load where it
should be in the socket buffers on both send/recv while we handle the
messages.
Also, changes were made to allow the ringbuffer to hold messages and
continue to be filled by the IO pthread while we wait for the Main
pthread to handle the work on the InQ.
Memory spike seen with large numbers of routes flapping and route-maps
with dozens of regex matching:
```
Memory statistics for bgpd:
System allocator statistics:
Total heap allocated: > 2GB
Holding block headers: 516 KiB
Used small blocks: 0 bytes
Used ordinary blocks: 160 MiB
Free small blocks: 3680 bytes
Free ordinary blocks: > 2GB
Ordinary blocks: 121244
Small blocks: 83
Holding blocks: 1
```
With most of it being held by the inQ (seen from the stream datastructure info here):
```
Type : Current# Size Total Max# MaxBytes
...
...
Stream : 115543 variable 26963208 15970740 3571708768
```
With this change that memory is capped and load is left in the sockets:
RECV Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 265350 0 [fe80::4080:30ff:feb0:cee3]%veth1:36950 [fe80::4c14:9cff:fe1d:5bfd]:179 users:(("bgpd",pid=1393334,fd=26))
skmem:(r403688,rb425984,t0,tb425984,f1816,w0,o0,bl0,d61)
```
SEND Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 0 1275012 [fe80::4c14:9cff:fe1d:5bfd]%veth1:179 [fe80::4080:30ff:feb0:cee3]:36950 users:(("bgpd",pid=1393443,fd=27))
skmem:(r0,rb131072,t0,tb1453568,f1916,w1300612,o0,bl0,d0)
```
Signed-off-by: Stephen Worley <sworley@nvidia.com>
2022-10-21 18:45:50 +02:00
|
|
|
added_pkt = true;
|
|
|
|
}
|
2017-06-02 03:52:39 +02:00
|
|
|
|
bgpd,doc: limit InQ buf to allow for back pressure
Add a default limit to the InQ for messages off the bgp peer
socket. Make the limit configurable via cli.
Adding in this limit causes the messages to be retained in the tcp
socket and allow for tcp back pressure and congestion control to kick
in.
Before this change, we allow the InQ to grow indefinitely just taking
messages off the socket and adding them to the fifo queue, never letting
the kernel know we need to slow down. We were seeing under high loads of
messages and large perf-heavy routemaps (regex matching) this queue
would cause a memory spike and BGP would get OOM killed. Modifying this
leaves the messages in the socket and distributes that load where it
should be in the socket buffers on both send/recv while we handle the
messages.
Also, changes were made to allow the ringbuffer to hold messages and
continue to be filled by the IO pthread while we wait for the Main
pthread to handle the work on the InQ.
Memory spike seen with large numbers of routes flapping and route-maps
with dozens of regex matching:
```
Memory statistics for bgpd:
System allocator statistics:
Total heap allocated: > 2GB
Holding block headers: 516 KiB
Used small blocks: 0 bytes
Used ordinary blocks: 160 MiB
Free small blocks: 3680 bytes
Free ordinary blocks: > 2GB
Ordinary blocks: 121244
Small blocks: 83
Holding blocks: 1
```
With most of it being held by the inQ (seen from the stream datastructure info here):
```
Type : Current# Size Total Max# MaxBytes
...
...
Stream : 115543 variable 26963208 15970740 3571708768
```
With this change that memory is capped and load is left in the sockets:
RECV Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 265350 0 [fe80::4080:30ff:feb0:cee3]%veth1:36950 [fe80::4c14:9cff:fe1d:5bfd]:179 users:(("bgpd",pid=1393334,fd=26))
skmem:(r403688,rb425984,t0,tb425984,f1816,w0,o0,bl0,d61)
```
SEND Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 0 1275012 [fe80::4c14:9cff:fe1d:5bfd]%veth1:179 [fe80::4080:30ff:feb0:cee3]:36950 users:(("bgpd",pid=1393443,fd=27))
skmem:(r0,rb131072,t0,tb1453568,f1916,w1300612,o0,bl0,d0)
```
Signed-off-by: Stephen Worley <sworley@nvidia.com>
2022-10-21 18:45:50 +02:00
|
|
|
switch (ret) {
|
|
|
|
case -EBADMSG:
|
|
|
|
fatal = true;
|
|
|
|
break;
|
|
|
|
case -ENOMEM:
|
|
|
|
if (!ibuf_full_logged) {
|
2022-11-15 21:28:09 +01:00
|
|
|
if (bgp_debug_neighbor_events(peer))
|
|
|
|
zlog_debug(
|
|
|
|
"%s [Event] Peer Input-Queue is full: limit (%u)",
|
|
|
|
peer->host, bm->inq_limit);
|
|
|
|
|
bgpd,doc: limit InQ buf to allow for back pressure
Add a default limit to the InQ for messages off the bgp peer
socket. Make the limit configurable via cli.
Adding in this limit causes the messages to be retained in the tcp
socket and allow for tcp back pressure and congestion control to kick
in.
Before this change, we allow the InQ to grow indefinitely just taking
messages off the socket and adding them to the fifo queue, never letting
the kernel know we need to slow down. We were seeing under high loads of
messages and large perf-heavy routemaps (regex matching) this queue
would cause a memory spike and BGP would get OOM killed. Modifying this
leaves the messages in the socket and distributes that load where it
should be in the socket buffers on both send/recv while we handle the
messages.
Also, changes were made to allow the ringbuffer to hold messages and
continue to be filled by the IO pthread while we wait for the Main
pthread to handle the work on the InQ.
Memory spike seen with large numbers of routes flapping and route-maps
with dozens of regex matching:
```
Memory statistics for bgpd:
System allocator statistics:
Total heap allocated: > 2GB
Holding block headers: 516 KiB
Used small blocks: 0 bytes
Used ordinary blocks: 160 MiB
Free small blocks: 3680 bytes
Free ordinary blocks: > 2GB
Ordinary blocks: 121244
Small blocks: 83
Holding blocks: 1
```
With most of it being held by the inQ (seen from the stream datastructure info here):
```
Type : Current# Size Total Max# MaxBytes
...
...
Stream : 115543 variable 26963208 15970740 3571708768
```
With this change that memory is capped and load is left in the sockets:
RECV Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 265350 0 [fe80::4080:30ff:feb0:cee3]%veth1:36950 [fe80::4c14:9cff:fe1d:5bfd]:179 users:(("bgpd",pid=1393334,fd=26))
skmem:(r403688,rb425984,t0,tb425984,f1816,w0,o0,bl0,d61)
```
SEND Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 0 1275012 [fe80::4c14:9cff:fe1d:5bfd]%veth1:179 [fe80::4080:30ff:feb0:cee3]:36950 users:(("bgpd",pid=1393443,fd=27))
skmem:(r0,rb131072,t0,tb1453568,f1916,w1300612,o0,bl0,d0)
```
Signed-off-by: Stephen Worley <sworley@nvidia.com>
2022-10-21 18:45:50 +02:00
|
|
|
ibuf_full_logged = true;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
ibuf_full_logged = false;
|
|
|
|
break;
|
2017-06-02 03:52:39 +02:00
|
|
|
}
|
|
|
|
|
bgpd,doc: limit InQ buf to allow for back pressure
Add a default limit to the InQ for messages off the bgp peer
socket. Make the limit configurable via cli.
Adding in this limit causes the messages to be retained in the tcp
socket and allow for tcp back pressure and congestion control to kick
in.
Before this change, we allow the InQ to grow indefinitely just taking
messages off the socket and adding them to the fifo queue, never letting
the kernel know we need to slow down. We were seeing under high loads of
messages and large perf-heavy routemaps (regex matching) this queue
would cause a memory spike and BGP would get OOM killed. Modifying this
leaves the messages in the socket and distributes that load where it
should be in the socket buffers on both send/recv while we handle the
messages.
Also, changes were made to allow the ringbuffer to hold messages and
continue to be filled by the IO pthread while we wait for the Main
pthread to handle the work on the InQ.
Memory spike seen with large numbers of routes flapping and route-maps
with dozens of regex matching:
```
Memory statistics for bgpd:
System allocator statistics:
Total heap allocated: > 2GB
Holding block headers: 516 KiB
Used small blocks: 0 bytes
Used ordinary blocks: 160 MiB
Free small blocks: 3680 bytes
Free ordinary blocks: > 2GB
Ordinary blocks: 121244
Small blocks: 83
Holding blocks: 1
```
With most of it being held by the inQ (seen from the stream datastructure info here):
```
Type : Current# Size Total Max# MaxBytes
...
...
Stream : 115543 variable 26963208 15970740 3571708768
```
With this change that memory is capped and load is left in the sockets:
RECV Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 265350 0 [fe80::4080:30ff:feb0:cee3]%veth1:36950 [fe80::4c14:9cff:fe1d:5bfd]:179 users:(("bgpd",pid=1393334,fd=26))
skmem:(r403688,rb425984,t0,tb425984,f1816,w0,o0,bl0,d61)
```
SEND Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 0 1275012 [fe80::4c14:9cff:fe1d:5bfd]%veth1:179 [fe80::4080:30ff:feb0:cee3]:36950 users:(("bgpd",pid=1393443,fd=27))
skmem:(r0,rb131072,t0,tb1453568,f1916,w1300612,o0,bl0,d0)
```
Signed-off-by: Stephen Worley <sworley@nvidia.com>
2022-10-21 18:45:50 +02:00
|
|
|
done:
|
2017-06-02 03:52:39 +02:00
|
|
|
/* handle invalid header */
|
|
|
|
if (fatal) {
|
|
|
|
/* wipe buffer just in case someone screwed up */
|
2021-04-30 22:39:40 +02:00
|
|
|
ringbuf_wipe(connection->ibuf_work);
|
bgpd,doc: limit InQ buf to allow for back pressure
Add a default limit to the InQ for messages off the bgp peer
socket. Make the limit configurable via cli.
Adding in this limit causes the messages to be retained in the tcp
socket and allow for tcp back pressure and congestion control to kick
in.
Before this change, we allow the InQ to grow indefinitely just taking
messages off the socket and adding them to the fifo queue, never letting
the kernel know we need to slow down. We were seeing under high loads of
messages and large perf-heavy routemaps (regex matching) this queue
would cause a memory spike and BGP would get OOM killed. Modifying this
leaves the messages in the socket and distributes that load where it
should be in the socket buffers on both send/recv while we handle the
messages.
Also, changes were made to allow the ringbuffer to hold messages and
continue to be filled by the IO pthread while we wait for the Main
pthread to handle the work on the InQ.
Memory spike seen with large numbers of routes flapping and route-maps
with dozens of regex matching:
```
Memory statistics for bgpd:
System allocator statistics:
Total heap allocated: > 2GB
Holding block headers: 516 KiB
Used small blocks: 0 bytes
Used ordinary blocks: 160 MiB
Free small blocks: 3680 bytes
Free ordinary blocks: > 2GB
Ordinary blocks: 121244
Small blocks: 83
Holding blocks: 1
```
With most of it being held by the inQ (seen from the stream datastructure info here):
```
Type : Current# Size Total Max# MaxBytes
...
...
Stream : 115543 variable 26963208 15970740 3571708768
```
With this change that memory is capped and load is left in the sockets:
RECV Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 265350 0 [fe80::4080:30ff:feb0:cee3]%veth1:36950 [fe80::4c14:9cff:fe1d:5bfd]:179 users:(("bgpd",pid=1393334,fd=26))
skmem:(r403688,rb425984,t0,tb425984,f1816,w0,o0,bl0,d61)
```
SEND Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 0 1275012 [fe80::4c14:9cff:fe1d:5bfd]%veth1:179 [fe80::4080:30ff:feb0:cee3]:36950 users:(("bgpd",pid=1393443,fd=27))
skmem:(r0,rb131072,t0,tb1453568,f1916,w1300612,o0,bl0,d0)
```
Signed-off-by: Stephen Worley <sworley@nvidia.com>
2022-10-21 18:45:50 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2021-04-30 22:39:40 +02:00
|
|
|
event_add_read(fpt->master, bgp_process_reads, peer, connection->fd,
|
2021-05-01 01:02:00 +02:00
|
|
|
&connection->t_read);
|
bgpd,doc: limit InQ buf to allow for back pressure
Add a default limit to the InQ for messages off the bgp peer
socket. Make the limit configurable via cli.
Adding in this limit causes the messages to be retained in the tcp
socket and allow for tcp back pressure and congestion control to kick
in.
Before this change, we allow the InQ to grow indefinitely just taking
messages off the socket and adding them to the fifo queue, never letting
the kernel know we need to slow down. We were seeing under high loads of
messages and large perf-heavy routemaps (regex matching) this queue
would cause a memory spike and BGP would get OOM killed. Modifying this
leaves the messages in the socket and distributes that load where it
should be in the socket buffers on both send/recv while we handle the
messages.
Also, changes were made to allow the ringbuffer to hold messages and
continue to be filled by the IO pthread while we wait for the Main
pthread to handle the work on the InQ.
Memory spike seen with large numbers of routes flapping and route-maps
with dozens of regex matching:
```
Memory statistics for bgpd:
System allocator statistics:
Total heap allocated: > 2GB
Holding block headers: 516 KiB
Used small blocks: 0 bytes
Used ordinary blocks: 160 MiB
Free small blocks: 3680 bytes
Free ordinary blocks: > 2GB
Ordinary blocks: 121244
Small blocks: 83
Holding blocks: 1
```
With most of it being held by the inQ (seen from the stream datastructure info here):
```
Type : Current# Size Total Max# MaxBytes
...
...
Stream : 115543 variable 26963208 15970740 3571708768
```
With this change that memory is capped and load is left in the sockets:
RECV Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 265350 0 [fe80::4080:30ff:feb0:cee3]%veth1:36950 [fe80::4c14:9cff:fe1d:5bfd]:179 users:(("bgpd",pid=1393334,fd=26))
skmem:(r403688,rb425984,t0,tb425984,f1816,w0,o0,bl0,d61)
```
SEND Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 0 1275012 [fe80::4c14:9cff:fe1d:5bfd]%veth1:179 [fe80::4080:30ff:feb0:cee3]:36950 users:(("bgpd",pid=1393443,fd=27))
skmem:(r0,rb131072,t0,tb1453568,f1916,w1300612,o0,bl0,d0)
```
Signed-off-by: Stephen Worley <sworley@nvidia.com>
2022-10-21 18:45:50 +02:00
|
|
|
if (added_pkt)
|
2021-04-30 22:39:40 +02:00
|
|
|
event_add_event(bm->master, bgp_process_packet, connection, 0,
|
2022-05-20 20:19:08 +02:00
|
|
|
&peer->t_process_packet);
|
2017-04-18 20:11:43 +02:00
|
|
|
}
|
|
|
|
|
2018-01-24 17:07:27 +01:00
|
|
|
/*
|
2017-04-18 20:11:43 +02:00
|
|
|
* Flush peer output buffer.
|
|
|
|
*
|
2021-04-30 20:55:40 +02:00
|
|
|
* This function pops packets off of peer->connection.obuf and writes them to
|
|
|
|
* peer->connection.fd. The amount of packets written is equal to the minimum of
|
|
|
|
* peer->wpkt_quanta and the number of packets on the output buffer, unless an
|
|
|
|
* error occurs.
|
2017-04-18 20:11:43 +02:00
|
|
|
*
|
|
|
|
* If write() returns an error, the appropriate FSM event is generated.
|
|
|
|
*
|
|
|
|
* The return value is equal to the number of packets written
|
|
|
|
* (which may be zero).
|
|
|
|
*/
|
2021-04-30 22:39:40 +02:00
|
|
|
static uint16_t bgp_write(struct peer_connection *connection)
|
2017-04-18 20:11:43 +02:00
|
|
|
{
|
2021-04-30 22:39:40 +02:00
|
|
|
struct peer *peer = connection->peer;
|
2018-03-27 21:13:34 +02:00
|
|
|
uint8_t type;
|
2017-04-18 20:11:43 +02:00
|
|
|
struct stream *s;
|
|
|
|
int update_last_write = 0;
|
2019-10-04 20:52:24 +02:00
|
|
|
unsigned int count;
|
2018-01-07 15:41:53 +01:00
|
|
|
uint32_t uo = 0;
|
2017-05-02 02:37:45 +02:00
|
|
|
uint16_t status = 0;
|
2017-06-05 22:14:47 +02:00
|
|
|
uint32_t wpkt_quanta_old;
|
2017-04-18 20:11:43 +02:00
|
|
|
|
2019-10-04 20:52:24 +02:00
|
|
|
int writenum = 0;
|
|
|
|
int num;
|
|
|
|
unsigned int iovsz;
|
|
|
|
unsigned int strmsz;
|
|
|
|
unsigned int total_written;
|
2021-04-22 11:04:52 +02:00
|
|
|
time_t now;
|
2019-10-04 20:52:24 +02:00
|
|
|
|
2017-07-05 17:38:57 +02:00
|
|
|
wpkt_quanta_old = atomic_load_explicit(&peer->bgp->wpkt_quanta,
|
|
|
|
memory_order_relaxed);
|
2019-10-04 20:52:24 +02:00
|
|
|
struct stream *ostreams[wpkt_quanta_old];
|
|
|
|
struct stream **streams = ostreams;
|
|
|
|
struct iovec iov[wpkt_quanta_old];
|
|
|
|
|
2021-04-30 22:39:40 +02:00
|
|
|
s = stream_fifo_head(connection->obuf);
|
2019-10-04 20:52:24 +02:00
|
|
|
|
|
|
|
if (!s)
|
|
|
|
goto done;
|
|
|
|
|
|
|
|
count = iovsz = 0;
|
|
|
|
while (count < wpkt_quanta_old && iovsz < array_size(iov) && s) {
|
|
|
|
ostreams[iovsz] = s;
|
|
|
|
iov[iovsz].iov_base = stream_pnt(s);
|
|
|
|
iov[iovsz].iov_len = STREAM_READABLE(s);
|
|
|
|
writenum += STREAM_READABLE(s);
|
|
|
|
s = s->next;
|
|
|
|
++iovsz;
|
|
|
|
++count;
|
|
|
|
}
|
|
|
|
|
|
|
|
strmsz = iovsz;
|
|
|
|
total_written = 0;
|
|
|
|
|
|
|
|
do {
|
2021-04-30 22:39:40 +02:00
|
|
|
num = writev(connection->fd, iov, iovsz);
|
2019-10-04 20:52:24 +02:00
|
|
|
|
|
|
|
if (num < 0) {
|
|
|
|
if (!ERRNO_IO_RETRY(errno)) {
|
|
|
|
BGP_EVENT_ADD(peer, TCP_fatal_error);
|
|
|
|
SET_FLAG(status, BGP_IO_FATAL_ERR);
|
|
|
|
} else {
|
|
|
|
SET_FLAG(status, BGP_IO_TRANS_ERR);
|
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
} else if (num != writenum) {
|
|
|
|
unsigned int msg_written = 0;
|
|
|
|
unsigned int ic = iovsz;
|
|
|
|
|
|
|
|
for (unsigned int i = 0; i < ic; i++) {
|
|
|
|
size_t ss = iov[i].iov_len;
|
2017-06-05 22:14:47 +02:00
|
|
|
|
2019-10-04 20:52:24 +02:00
|
|
|
if (ss > (unsigned int) num)
|
|
|
|
break;
|
2017-04-18 20:11:43 +02:00
|
|
|
|
2019-10-04 20:52:24 +02:00
|
|
|
msg_written++;
|
|
|
|
iovsz--;
|
|
|
|
writenum -= ss;
|
|
|
|
num -= ss;
|
|
|
|
}
|
2017-04-18 20:11:43 +02:00
|
|
|
|
2019-10-04 20:52:24 +02:00
|
|
|
total_written += msg_written;
|
2017-04-18 20:11:43 +02:00
|
|
|
|
2019-10-15 20:25:02 +02:00
|
|
|
assert(total_written < count);
|
|
|
|
|
2019-10-04 20:52:24 +02:00
|
|
|
memmove(&iov, &iov[msg_written],
|
|
|
|
sizeof(iov[0]) * iovsz);
|
|
|
|
streams = &streams[msg_written];
|
|
|
|
stream_forward_getp(streams[0], num);
|
|
|
|
iov[0].iov_base = stream_pnt(streams[0]);
|
|
|
|
iov[0].iov_len = STREAM_READABLE(streams[0]);
|
|
|
|
|
|
|
|
writenum -= num;
|
|
|
|
num = 0;
|
|
|
|
assert(writenum > 0);
|
|
|
|
} else {
|
|
|
|
total_written = strmsz;
|
|
|
|
}
|
|
|
|
|
|
|
|
} while (num != writenum);
|
|
|
|
|
|
|
|
/* Handle statistics */
|
|
|
|
for (unsigned int i = 0; i < total_written; i++) {
|
2021-04-30 22:39:40 +02:00
|
|
|
s = stream_fifo_pop(connection->obuf);
|
2019-10-04 20:52:24 +02:00
|
|
|
|
|
|
|
assert(s == ostreams[i]);
|
2017-04-18 20:11:43 +02:00
|
|
|
|
|
|
|
/* Retrieve BGP packet type. */
|
|
|
|
stream_set_getp(s, BGP_MARKER_SIZE + 2);
|
|
|
|
type = stream_getc(s);
|
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
case BGP_MSG_OPEN:
|
2017-07-05 17:38:57 +02:00
|
|
|
atomic_fetch_add_explicit(&peer->open_out, 1,
|
|
|
|
memory_order_relaxed);
|
2017-04-18 20:11:43 +02:00
|
|
|
break;
|
|
|
|
case BGP_MSG_UPDATE:
|
2017-07-05 17:38:57 +02:00
|
|
|
atomic_fetch_add_explicit(&peer->update_out, 1,
|
|
|
|
memory_order_relaxed);
|
2018-01-07 15:41:53 +01:00
|
|
|
uo++;
|
2017-04-18 20:11:43 +02:00
|
|
|
break;
|
|
|
|
case BGP_MSG_NOTIFY:
|
2017-07-05 17:38:57 +02:00
|
|
|
atomic_fetch_add_explicit(&peer->notify_out, 1,
|
|
|
|
memory_order_relaxed);
|
2017-04-18 20:11:43 +02:00
|
|
|
/* Double start timer. */
|
|
|
|
peer->v_start *= 2;
|
|
|
|
|
|
|
|
/* Overflow check. */
|
|
|
|
if (peer->v_start >= (60 * 2))
|
|
|
|
peer->v_start = (60 * 2);
|
|
|
|
|
2018-01-24 17:07:27 +01:00
|
|
|
/*
|
|
|
|
* Handle Graceful Restart case where the state changes
|
|
|
|
* to Connect instead of Idle.
|
|
|
|
*/
|
2017-04-18 20:11:43 +02:00
|
|
|
BGP_EVENT_ADD(peer, BGP_Stop);
|
|
|
|
goto done;
|
|
|
|
|
|
|
|
case BGP_MSG_KEEPALIVE:
|
2017-07-05 17:38:57 +02:00
|
|
|
atomic_fetch_add_explicit(&peer->keepalive_out, 1,
|
|
|
|
memory_order_relaxed);
|
2017-04-18 20:11:43 +02:00
|
|
|
break;
|
|
|
|
case BGP_MSG_ROUTE_REFRESH_NEW:
|
|
|
|
case BGP_MSG_ROUTE_REFRESH_OLD:
|
2017-07-05 17:38:57 +02:00
|
|
|
atomic_fetch_add_explicit(&peer->refresh_out, 1,
|
|
|
|
memory_order_relaxed);
|
2017-04-18 20:11:43 +02:00
|
|
|
break;
|
|
|
|
case BGP_MSG_CAPABILITY:
|
2017-07-05 17:38:57 +02:00
|
|
|
atomic_fetch_add_explicit(&peer->dynamic_cap_out, 1,
|
|
|
|
memory_order_relaxed);
|
2017-04-18 20:11:43 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2019-10-04 20:52:24 +02:00
|
|
|
stream_free(s);
|
|
|
|
ostreams[i] = NULL;
|
2017-04-18 20:11:43 +02:00
|
|
|
update_last_write = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
done : {
|
2022-08-18 00:27:54 +02:00
|
|
|
now = monotime(NULL);
|
2018-01-07 15:41:53 +01:00
|
|
|
/*
|
|
|
|
* Update last_update if UPDATEs were written.
|
|
|
|
* Note: that these are only updated at end,
|
|
|
|
* not per message (i.e., per loop)
|
|
|
|
*/
|
|
|
|
if (uo)
|
2021-04-22 11:04:52 +02:00
|
|
|
atomic_store_explicit(&peer->last_update, now,
|
2017-07-05 17:38:57 +02:00
|
|
|
memory_order_relaxed);
|
2017-04-18 20:11:43 +02:00
|
|
|
|
2017-11-06 07:41:27 +01:00
|
|
|
/* If we TXed any flavor of packet */
|
2021-04-22 11:04:52 +02:00
|
|
|
if (update_last_write) {
|
|
|
|
atomic_store_explicit(&peer->last_write, now,
|
2017-07-05 17:38:57 +02:00
|
|
|
memory_order_relaxed);
|
2021-04-22 11:04:52 +02:00
|
|
|
peer->last_sendq_ok = now;
|
|
|
|
}
|
2017-04-18 20:11:43 +02:00
|
|
|
}
|
|
|
|
|
2017-05-02 02:37:45 +02:00
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
2023-07-21 19:10:03 +02:00
|
|
|
/*
 * File-scope scratch buffer of
 * BGP_EXTENDED_MESSAGE_MAX_PACKET_SIZE * BGP_READ_PACKET_MAX bytes.
 * NOTE(review): presumably the staging area bgp_read() pulls socket data into
 * before moving it to the per-connection work buffer -- confirm against the
 * body of bgp_read().
 */
uint8_t ibuf_scratch[BGP_EXTENDED_MESSAGE_MAX_PACKET_SIZE * BGP_READ_PACKET_MAX];
|
2018-01-24 17:07:27 +01:00
|
|
|
/*
|
2021-04-30 20:55:40 +02:00
|
|
|
* Reads a chunk of data from peer->connection.fd into
|
|
|
|
* peer->connection.ibuf_work.
|
2017-05-02 02:37:45 +02:00
|
|
|
*
|
2021-04-16 00:05:10 +02:00
|
|
|
* code_p
|
|
|
|
* Pointer to location to store FSM event code in case of fatal error.
|
|
|
|
*
|
2017-06-08 23:47:33 +02:00
|
|
|
* @return status flag (see top-of-file)
|
2023-07-21 19:10:03 +02:00
|
|
|
*
|
|
|
|
* PLEASE NOTE: If we ever transform the bgp_read to be a pthread
|
|
|
|
* per peer then we need to rethink the global ibuf_scratch
|
|
|
|
* data structure above.
|
2017-05-02 02:37:45 +02:00
|
|
|
*/
|
2021-04-30 22:39:40 +02:00
|
|
|
static uint16_t bgp_read(struct peer_connection *connection, int *code_p)
|
2017-05-02 02:37:45 +02:00
|
|
|
{
|
2022-10-24 23:35:04 +02:00
|
|
|
size_t readsize; /* how many bytes we want to read */
|
|
|
|
ssize_t nbytes; /* how many bytes we actually read */
|
|
|
|
size_t ibuf_work_space; /* space we can read into the work buf */
|
2017-05-02 02:37:45 +02:00
|
|
|
uint16_t status = 0;
|
|
|
|
|
2021-04-30 22:39:40 +02:00
|
|
|
ibuf_work_space = ringbuf_space(connection->ibuf_work);
|
bgpd,doc: limit InQ buf to allow for back pressure
Add a default limit to the InQ for messages off the bgp peer
socket. Make the limit configurable via cli.
Adding in this limit causes the messages to be retained in the tcp
socket and allow for tcp back pressure and congestion control to kick
in.
Before this change, we allow the InQ to grow indefinitely just taking
messages off the socket and adding them to the fifo queue, never letting
the kernel know we need to slow down. We were seeing under high loads of
messages and large perf-heavy routemaps (regex matching) this queue
would cause a memory spike and BGP would get OOM killed. Modifying this
leaves the messages in the socket and distributes that load where it
should be in the socket buffers on both send/recv while we handle the
mesages.
Also, changes were made to allow the ringbuffer to hold messages and
continue to be filled by the IO pthread while we wait for the Main
pthread to handle the work on the InQ.
Memory spike seen with large numbers of routes flapping and route-maps
with dozens of regex matching:
```
Memory statistics for bgpd:
System allocator statistics:
Total heap allocated: > 2GB
Holding block headers: 516 KiB
Used small blocks: 0 bytes
Used ordinary blocks: 160 MiB
Free small blocks: 3680 bytes
Free ordinary blocks: > 2GB
Ordinary blocks: 121244
Small blocks: 83
Holding blocks: 1
```
With most of it being held by the inQ (seen from the stream datastructure info here):
```
Type : Current# Size Total Max# MaxBytes
...
...
Stream : 115543 variable 26963208 15970740 3571708768
```
With this change that memory is capped and load is left in the sockets:
RECV Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 265350 0 [fe80::4080:30ff:feb0:cee3]%veth1:36950 [fe80::4c14:9cff:fe1d:5bfd]:179 users:(("bgpd",pid=1393334,fd=26))
skmem:(r403688,rb425984,t0,tb425984,f1816,w0,o0,bl0,d61)
```
SEND Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 0 1275012 [fe80::4c14:9cff:fe1d:5bfd]%veth1:179 [fe80::4080:30ff:feb0:cee3]:36950 users:(("bgpd",pid=1393443,fd=27))
skmem:(r0,rb131072,t0,tb1453568,f1916,w1300612,o0,bl0,d0)
```
Signed-off-by: Stephen Worley <sworley@nvidia.com>
2022-10-21 18:45:50 +02:00
|
|
|
|
|
|
|
if (ibuf_work_space == 0) {
|
|
|
|
SET_FLAG(status, BGP_IO_WORK_FULL_ERR);
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
2023-07-21 19:10:03 +02:00
|
|
|
readsize = MIN(ibuf_work_space, sizeof(ibuf_scratch));
|
bgpd,doc: limit InQ buf to allow for back pressure
Add a default limit to the InQ for messages off the bgp peer
socket. Make the limit configurable via cli.
Adding in this limit causes the messages to be retained in the tcp
socket and allow for tcp back pressure and congestion control to kick
in.
Before this change, we allow the InQ to grow indefinitely just taking
messages off the socket and adding them to the fifo queue, never letting
the kernel know we need to slow down. We were seeing under high loads of
messages and large perf-heavy routemaps (regex matching) this queue
would cause a memory spike and BGP would get OOM killed. Modifying this
leaves the messages in the socket and distributes that load where it
should be in the socket buffers on both send/recv while we handle the
mesages.
Also, changes were made to allow the ringbuffer to hold messages and
continue to be filled by the IO pthread while we wait for the Main
pthread to handle the work on the InQ.
Memory spike seen with large numbers of routes flapping and route-maps
with dozens of regex matching:
```
Memory statistics for bgpd:
System allocator statistics:
Total heap allocated: > 2GB
Holding block headers: 516 KiB
Used small blocks: 0 bytes
Used ordinary blocks: 160 MiB
Free small blocks: 3680 bytes
Free ordinary blocks: > 2GB
Ordinary blocks: 121244
Small blocks: 83
Holding blocks: 1
```
With most of it being held by the inQ (seen from the stream datastructure info here):
```
Type : Current# Size Total Max# MaxBytes
...
...
Stream : 115543 variable 26963208 15970740 3571708768
```
With this change that memory is capped and load is left in the sockets:
RECV Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 265350 0 [fe80::4080:30ff:feb0:cee3]%veth1:36950 [fe80::4c14:9cff:fe1d:5bfd]:179 users:(("bgpd",pid=1393334,fd=26))
skmem:(r403688,rb425984,t0,tb425984,f1816,w0,o0,bl0,d61)
```
SEND Side:
```
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
ESTAB 0 1275012 [fe80::4c14:9cff:fe1d:5bfd]%veth1:179 [fe80::4080:30ff:feb0:cee3]:36950 users:(("bgpd",pid=1393443,fd=27))
skmem:(r0,rb131072,t0,tb1453568,f1916,w1300612,o0,bl0,d0)
```
Signed-off-by: Stephen Worley <sworley@nvidia.com>
2022-10-21 18:45:50 +02:00
|
|
|
|
2021-04-30 22:39:40 +02:00
|
|
|
nbytes = read(connection->fd, ibuf_scratch, readsize);
|
2017-05-02 02:37:45 +02:00
|
|
|
|
2018-01-02 19:20:00 +01:00
|
|
|
/* EAGAIN or EWOULDBLOCK; come back later */
|
|
|
|
if (nbytes < 0 && ERRNO_IO_RETRY(errno)) {
|
|
|
|
SET_FLAG(status, BGP_IO_TRANS_ERR);
|
|
|
|
} else if (nbytes < 0) {
|
2021-03-09 17:13:41 +01:00
|
|
|
/* Fatal error; tear down session */
|
2018-09-13 20:23:42 +02:00
|
|
|
flog_err(EC_BGP_UPDATE_RCV,
|
2021-04-30 22:39:40 +02:00
|
|
|
"%s [Error] bgp_read_packet error: %s",
|
|
|
|
connection->peer->host, safe_strerror(errno));
|
2017-08-04 20:27:42 +02:00
|
|
|
|
2021-03-09 17:13:41 +01:00
|
|
|
/* Handle the error in the main pthread. */
|
|
|
|
if (code_p)
|
|
|
|
*code_p = TCP_fatal_error;
|
2017-05-02 02:37:45 +02:00
|
|
|
|
2017-08-04 20:27:42 +02:00
|
|
|
SET_FLAG(status, BGP_IO_FATAL_ERR);
|
2021-03-09 17:13:41 +01:00
|
|
|
|
2018-01-02 19:20:00 +01:00
|
|
|
} else if (nbytes == 0) {
|
2021-03-09 17:13:41 +01:00
|
|
|
/* Received EOF / TCP session closed */
|
2021-04-30 22:39:40 +02:00
|
|
|
if (bgp_debug_neighbor_events(connection->peer))
|
2017-08-04 20:27:42 +02:00
|
|
|
zlog_debug("%s [Event] BGP connection closed fd %d",
|
2021-04-30 22:39:40 +02:00
|
|
|
connection->peer->host, connection->fd);
|
2017-08-04 20:27:42 +02:00
|
|
|
|
2021-03-09 17:13:41 +01:00
|
|
|
/* Handle the error in the main pthread. */
|
|
|
|
if (code_p)
|
|
|
|
*code_p = TCP_connection_closed;
|
2017-05-02 02:37:45 +02:00
|
|
|
|
2017-08-04 20:27:42 +02:00
|
|
|
SET_FLAG(status, BGP_IO_FATAL_ERR);
|
2021-04-27 00:42:12 +02:00
|
|
|
} else {
|
2021-04-30 22:39:40 +02:00
|
|
|
assert(ringbuf_put(connection->ibuf_work, ibuf_scratch,
|
2021-04-30 20:55:40 +02:00
|
|
|
nbytes) == (size_t)nbytes);
|
2017-05-02 02:37:45 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called after we have read a BGP packet header. Validates marker, message
|
|
|
|
* type and packet length. If any of these aren't correct, sends a notify.
|
2018-01-02 19:20:00 +01:00
|
|
|
*
|
|
|
|
* Assumes that there are at least BGP_HEADER_SIZE readable bytes in the input
|
|
|
|
* buffer.
|
2017-05-02 02:37:45 +02:00
|
|
|
*/
|
2021-04-30 22:39:40 +02:00
|
|
|
static bool validate_header(struct peer_connection *connection)
|
2017-05-02 02:37:45 +02:00
|
|
|
{
|
2021-04-30 22:39:40 +02:00
|
|
|
struct peer *peer = connection->peer;
|
2017-09-20 17:11:30 +02:00
|
|
|
uint16_t size;
|
|
|
|
uint8_t type;
|
2021-04-30 22:39:40 +02:00
|
|
|
struct ringbuf *pkt = connection->ibuf_work;
|
2017-05-02 02:37:45 +02:00
|
|
|
|
2019-11-20 17:26:59 +01:00
|
|
|
static const uint8_t m_correct[BGP_MARKER_SIZE] = {
|
2018-01-02 19:20:00 +01:00
|
|
|
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
|
|
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
|
|
|
|
uint8_t m_rx[BGP_MARKER_SIZE] = {0x00};
|
2017-06-01 18:20:58 +02:00
|
|
|
|
2018-01-02 19:20:00 +01:00
|
|
|
if (ringbuf_peek(pkt, 0, m_rx, BGP_MARKER_SIZE) != BGP_MARKER_SIZE)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (memcmp(m_correct, m_rx, BGP_MARKER_SIZE) != 0) {
|
2022-09-08 22:14:36 +02:00
|
|
|
bgp_notify_io_invalid(peer, BGP_NOTIFY_HEADER_ERR,
|
|
|
|
BGP_NOTIFY_HEADER_NOT_SYNC, NULL, 0);
|
2017-06-01 18:20:58 +02:00
|
|
|
return false;
|
|
|
|
}
|
2017-05-02 02:37:45 +02:00
|
|
|
|
2018-01-02 19:20:00 +01:00
|
|
|
/* Get size and type in network byte order. */
|
|
|
|
ringbuf_peek(pkt, BGP_MARKER_SIZE, &size, sizeof(size));
|
|
|
|
ringbuf_peek(pkt, BGP_MARKER_SIZE + 2, &type, sizeof(type));
|
|
|
|
|
|
|
|
size = ntohs(size);
|
2017-05-02 02:37:45 +02:00
|
|
|
|
|
|
|
/* BGP type check. */
|
|
|
|
if (type != BGP_MSG_OPEN && type != BGP_MSG_UPDATE
|
|
|
|
&& type != BGP_MSG_NOTIFY && type != BGP_MSG_KEEPALIVE
|
|
|
|
&& type != BGP_MSG_ROUTE_REFRESH_NEW
|
|
|
|
&& type != BGP_MSG_ROUTE_REFRESH_OLD
|
|
|
|
&& type != BGP_MSG_CAPABILITY) {
|
2017-09-20 17:11:30 +02:00
|
|
|
if (bgp_debug_neighbor_events(peer))
|
2017-05-02 02:37:45 +02:00
|
|
|
zlog_debug("%s unknown message type 0x%02x", peer->host,
|
|
|
|
type);
|
|
|
|
|
2022-09-08 22:14:36 +02:00
|
|
|
bgp_notify_io_invalid(peer, BGP_NOTIFY_HEADER_ERR,
|
|
|
|
BGP_NOTIFY_HEADER_BAD_MESTYPE, &type, 1);
|
2017-05-02 02:37:45 +02:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2017-09-20 17:11:30 +02:00
|
|
|
/* Minimum packet length check. */
|
2021-02-25 18:46:49 +01:00
|
|
|
if ((size < BGP_HEADER_SIZE) || (size > peer->max_packet_size)
|
2017-05-02 02:37:45 +02:00
|
|
|
|| (type == BGP_MSG_OPEN && size < BGP_MSG_OPEN_MIN_SIZE)
|
|
|
|
|| (type == BGP_MSG_UPDATE && size < BGP_MSG_UPDATE_MIN_SIZE)
|
|
|
|
|| (type == BGP_MSG_NOTIFY && size < BGP_MSG_NOTIFY_MIN_SIZE)
|
|
|
|
|| (type == BGP_MSG_KEEPALIVE && size != BGP_MSG_KEEPALIVE_MIN_SIZE)
|
|
|
|
|| (type == BGP_MSG_ROUTE_REFRESH_NEW
|
|
|
|
&& size < BGP_MSG_ROUTE_REFRESH_MIN_SIZE)
|
|
|
|
|| (type == BGP_MSG_ROUTE_REFRESH_OLD
|
|
|
|
&& size < BGP_MSG_ROUTE_REFRESH_MIN_SIZE)
|
|
|
|
|| (type == BGP_MSG_CAPABILITY
|
|
|
|
&& size < BGP_MSG_CAPABILITY_MIN_SIZE)) {
|
2017-07-05 17:38:57 +02:00
|
|
|
if (bgp_debug_neighbor_events(peer)) {
|
2017-05-02 02:37:45 +02:00
|
|
|
zlog_debug("%s bad message length - %d for %s",
|
|
|
|
peer->host, size,
|
|
|
|
type == 128 ? "ROUTE-REFRESH"
|
2017-07-05 17:38:57 +02:00
|
|
|
: bgp_type_str[(int)type]);
|
|
|
|
}
|
2017-05-02 02:37:45 +02:00
|
|
|
|
2017-09-20 17:11:30 +02:00
|
|
|
uint16_t nsize = htons(size);
|
|
|
|
|
2022-09-08 22:14:36 +02:00
|
|
|
bgp_notify_io_invalid(peer, BGP_NOTIFY_HEADER_ERR,
|
|
|
|
BGP_NOTIFY_HEADER_BAD_MESLEN,
|
|
|
|
(unsigned char *)&nsize, 2);
|
2017-05-02 02:37:45 +02:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
2017-04-18 20:11:43 +02:00
|
|
|
}
|