frr/bgpd/bgp_io.c

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

589 lines
16 KiB
C
Raw Normal View History

/* BGP I/O.
* Implements packet I/O in a pthread.
* Copyright (C) 2017 Cumulus Networks
* Quentin Young
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; see the file COPYING; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
* MA 02110-1301 USA
*/
/* clang-format off */
#include <zebra.h>
#include <pthread.h> // for pthread_mutex_unlock, pthread_mutex_lock
#include <sys/uio.h> // for writev
#include "frr_pthread.h"
#include "linklist.h" // for list_delete, list_delete_all_node, lis...
#include "log.h" // for zlog_debug, safe_strerror, zlog_err
#include "memory.h" // for MTYPE_TMP, XCALLOC, XFREE
#include "network.h" // for ERRNO_IO_RETRY
#include "stream.h" // for stream_get_endp, stream_getw_from, str...
#include "ringbuf.h" // for ringbuf_remain, ringbuf_peek, ringbuf_...
#include "thread.h" // for THREAD_OFF, THREAD_ARG, thread...
#include "zassert.h" // for assert
#include "bgpd/bgp_io.h"
#include "bgpd/bgp_debug.h" // for bgp_debug_neighbor_events, bgp_type_str
#include "bgpd/bgp_errors.h" // for expanded error reference information
#include "bgpd/bgp_fsm.h" // for BGP_EVENT_ADD, bgp_event
#include "bgpd/bgp_packet.h" // for bgp_notify_send_with_data, bgp_notify...
#include "bgpd/bgp_trace.h" // for frrtraces
#include "bgpd/bgpd.h" // for peer, BGP_MARKER_SIZE, bgp_master, bm
/* clang-format on */
/*
 * Forward declarations of the file-local helpers defined later in this file.
 */
static uint16_t bgp_write(struct peer *);
static uint16_t bgp_read(struct peer *);
static int bgp_process_writes(struct thread *);
static int bgp_process_reads(struct thread *);
static bool validate_header(struct peer *);
/*
 * Generic I/O status codes.
 *
 * bgp_write() and bgp_read() return a bitmask built from these flags; a
 * value of 0 means that neither condition was hit.
 */
#define BGP_IO_TRANS_ERR (1 << 0) // EAGAIN or similar occurred
#define BGP_IO_FATAL_ERR (1 << 1) // some kind of fatal TCP error
/* Thread external API ----------------------------------------------------- */
/*
 * Turn on packet writes for a peer.
 *
 * Registers bgp_process_writes() as the write handler for peer->fd on the
 * I/O pthread's thread master and marks the peer with
 * PEER_THREAD_WRITES_ON.
 *
 * The I/O pthread must be running, the peer must not be in the Deleted
 * state, its buffers must exist and no connect-check task may be pending.
 */
void bgp_writes_on(struct peer *peer)
{
	struct frr_pthread *io_pth = bgp_pth_io;

	/* sanity checks on the pthread and on the peer's I/O state */
	assert(io_pth->running);
	assert(peer->status != Deleted);
	assert(peer->obuf);
	assert(peer->ibuf);
	assert(peer->ibuf_work);
	assert(!peer->t_connect_check_r);
	assert(!peer->t_connect_check_w);
	/*
	 * NOTE(review): this rejects fd 0 and accepts negative descriptors;
	 * confirm whether "fd >= 0" was the intended check.
	 */
	assert(peer->fd);

	thread_add_write(io_pth->master, bgp_process_writes, peer, peer->fd,
			 &peer->t_write);

	SET_FLAG(peer->thread_flags, PEER_THREAD_WRITES_ON);
}
/*
 * Turn off packet writes for a peer.
 *
 * Cancels the write task registered on the I/O pthread, switches off the
 * update-group packet generation task and clears PEER_THREAD_WRITES_ON.
 */
void bgp_writes_off(struct peer *peer)
{
	struct frr_pthread *io_pth = bgp_pth_io;

	assert(io_pth->running);

	/* the write task belongs to the I/O pthread's thread master */
	thread_cancel_async(io_pth->master, &peer->t_write, NULL);

	/* stop generating further update-group packets for this peer */
	THREAD_OFF(peer->t_generate_updgrp_packets);

	UNSET_FLAG(peer->thread_flags, PEER_THREAD_WRITES_ON);
}
/*
 * Turn on packet reads for a peer.
 *
 * Registers bgp_process_reads() as the read handler for peer->fd on the
 * I/O pthread's thread master and marks the peer with
 * PEER_THREAD_READS_ON.
 *
 * The I/O pthread must be running, the peer must not be in the Deleted
 * state, its buffers must exist and no connect-check task may be pending.
 */
void bgp_reads_on(struct peer *peer)
{
	struct frr_pthread *fpt = bgp_pth_io;

	/*
	 * Same preconditions, in the same order, as bgp_writes_on(); the
	 * descriptor is asserted once only.
	 */
	assert(fpt->running);
	assert(peer->status != Deleted);
	assert(peer->obuf);
	assert(peer->ibuf);
	assert(peer->ibuf_work);
	assert(!peer->t_connect_check_r);
	assert(!peer->t_connect_check_w);
	assert(peer->fd);

	thread_add_read(fpt->master, bgp_process_reads, peer, peer->fd,
			&peer->t_read);

	SET_FLAG(peer->thread_flags, PEER_THREAD_READS_ON);
}
/*
 * Turn off packet reads for a peer.
 *
 * Cancels the read task registered on the I/O pthread, switches off the
 * pending packet processing task and clears PEER_THREAD_READS_ON.
 */
void bgp_reads_off(struct peer *peer)
{
	struct frr_pthread *io_pth = bgp_pth_io;

	assert(io_pth->running);

	/* the read task belongs to the I/O pthread's thread master */
	thread_cancel_async(io_pth->master, &peer->t_read, NULL);

	/* drop any packet processing job that is still scheduled */
	THREAD_OFF(peer->t_process_packet);

	UNSET_FLAG(peer->thread_flags, PEER_THREAD_READS_ON);
}
/* Thread internal functions ----------------------------------------------- */
/*
 * Called from I/O pthread when a file descriptor has become ready for writing.
 *
 * Flushes the peer's output buffer with bgp_write() while holding
 * peer->io_mtx, then decides what to schedule next:
 *  - packets are still queued and no fatal error occurred: re-arm this
 *    write handler;
 *  - the queue is empty and no fatal error occurred: arm the update-group
 *    packet generation timer;
 *  - a fatal error occurred: schedule nothing.
 *
 * @return -1 if the peer has no valid file descriptor, 0 otherwise
 */
static int bgp_process_writes(struct thread *thread)
{
	/* per-invocation state; nothing needs to outlive this call */
	struct peer *peer = THREAD_ARG(thread);
	struct frr_pthread *fpt = bgp_pth_io;
	uint16_t status;
	bool reschedule;
	bool fatal = false;

	if (peer->fd < 0)
		return -1;

	frr_with_mutex(&peer->io_mtx) {
		status = bgp_write(peer);
		reschedule = (stream_fifo_head(peer->obuf) != NULL);
	}

	/*
	 * BGP_IO_TRANS_ERR needs no handling here: the unsent packets are
	 * still on peer->obuf, so "reschedule" already covers the retry.
	 */

	/* problem; do not schedule any further work for this peer */
	if (CHECK_FLAG(status, BGP_IO_FATAL_ERR)) {
		reschedule = false;
		fatal = true;
	}

	/* If suppress fib pending is enabled, route is advertised to peers when
	 * the status is received from the FIB. The delay is added
	 * to update group packet generate which will allow more routes to be
	 * sent in the update message
	 */
	if (reschedule) {
		thread_add_write(fpt->master, bgp_process_writes, peer,
				 peer->fd, &peer->t_write);
	} else if (!fatal) {
		BGP_UPDATE_GROUP_TIMER_ON(&peer->t_generate_updgrp_packets,
					  bgp_generate_updgrp_packets);
	}

	return 0;
}
/*
 * Called from I/O pthread when a file descriptor has become ready for reading,
 * or has hung up.
 *
 * We read as much data as possible, process as many packets as we can and
 * place them on peer->ibuf for secondary processing by the main thread.
 *
 * On a fatal read error or an invalid header the work buffer is wiped and
 * nothing is rescheduled. Otherwise the read handler is re-armed and, if at
 * least one packet was queued, a packet processing job is scheduled on
 * bm->master.
 *
 * @return -1 if the peer has no valid file descriptor or bgpd is
 *         terminating, 0 otherwise
 */
static int bgp_process_reads(struct thread *thread)
{
	/* clang-format off */
	struct peer *peer;		// peer to read from
	struct frr_pthread *fpt;	// I/O pthread
	uint16_t status;		// bgp_read status code
	bool more = true;		// whether we got more data
	bool fatal = false;		// whether fatal error occurred
	bool added_pkt = false;		// whether we pushed onto ->ibuf
	/* clang-format on */

	peer = THREAD_ARG(thread);

	if (peer->fd < 0 || bm->terminating)
		return -1;

	fpt = bgp_pth_io;

	frr_with_mutex(&peer->io_mtx) {
		status = bgp_read(peer);
	}

	/* error checking phase */
	if (CHECK_FLAG(status, BGP_IO_TRANS_ERR)) {
		/* no problem; just don't process packets */
		more = false;
	}

	if (CHECK_FLAG(status, BGP_IO_FATAL_ERR)) {
		/* problem; tear down session */
		more = false;
		fatal = true;
	}

	while (more) {
		/* shorter alias to peer's input buffer */
		struct ringbuf *ibw = peer->ibuf_work;
		/* packet size as given by header */
		uint16_t pktsize = 0;
		/* number of bytes moved out of the work buffer */
		size_t copied;

		/* check that we have enough data for a header */
		if (ringbuf_remain(ibw) < BGP_HEADER_SIZE)
			break;

		/* check that header is valid */
		if (!validate_header(peer)) {
			fatal = true;
			break;
		}

		/* header is valid; retrieve packet size */
		ringbuf_peek(ibw, BGP_MARKER_SIZE, &pktsize, sizeof(pktsize));

		pktsize = ntohs(pktsize);

		/* if this fails we are seriously screwed */
		assert(pktsize <= peer->max_packet_size);

		/* wait for more data if the packet is not complete yet */
		if (ringbuf_remain(ibw) < pktsize)
			break;

		/*
		 * We have that much data, so chuck it into its own
		 * stream and append to input queue for processing.
		 */
		struct stream *pkt = stream_new(pktsize);

		assert(STREAM_WRITEABLE(pkt) == pktsize);

		/*
		 * Do the copy outside of assert() so that it still happens
		 * if assertions are ever compiled out.
		 */
		copied = ringbuf_get(ibw, pkt->data, pktsize);
		assert(copied == pktsize);
		(void)copied;

		stream_set_endp(pkt, pktsize);

		frrtrace(2, frr_bgp, packet_read, peer, pkt);
		frr_with_mutex(&peer->io_mtx) {
			stream_fifo_push(peer->ibuf, pkt);
		}

		added_pkt = true;
	}

	/* handle invalid header */
	if (fatal) {
		/* wipe buffer just in case someone screwed up */
		ringbuf_wipe(peer->ibuf_work);
	} else {
		/* there must be room for at least one more full packet */
		assert(ringbuf_space(peer->ibuf_work) >= peer->max_packet_size);

		thread_add_read(fpt->master, bgp_process_reads, peer, peer->fd,
				&peer->t_read);
		/* hand queued packets over to the main thread */
		if (added_pkt)
			thread_add_timer_msec(bm->master, bgp_process_packet,
					      peer, 0, &peer->t_process_packet);
	}

	return 0;
}
/*
* Flush peer output buffer.
*
* This function pops packets off of peer->obuf and writes them to peer->fd.
* The amount of packets written is equal to the minimum of peer->wpkt_quanta
* and the number of packets on the output buffer, unless an error occurs.
*
* If write() returns an error, the appropriate FSM event is generated.
*
* The return value is equal to the number of packets written
* (which may be zero).
*/
static uint16_t bgp_write(struct peer *peer)
{
uint8_t type;
struct stream *s;
int update_last_write = 0;
unsigned int count;
uint32_t uo = 0;
uint16_t status = 0;
uint32_t wpkt_quanta_old;
int writenum = 0;
int num;
unsigned int iovsz;
unsigned int strmsz;
unsigned int total_written;
wpkt_quanta_old = atomic_load_explicit(&peer->bgp->wpkt_quanta,
memory_order_relaxed);
struct stream *ostreams[wpkt_quanta_old];
struct stream **streams = ostreams;
struct iovec iov[wpkt_quanta_old];
s = stream_fifo_head(peer->obuf);
if (!s)
goto done;
count = iovsz = 0;
while (count < wpkt_quanta_old && iovsz < array_size(iov) && s) {
ostreams[iovsz] = s;
iov[iovsz].iov_base = stream_pnt(s);
iov[iovsz].iov_len = STREAM_READABLE(s);
writenum += STREAM_READABLE(s);
s = s->next;
++iovsz;
++count;
}
strmsz = iovsz;
total_written = 0;
do {
num = writev(peer->fd, iov, iovsz);
if (num < 0) {
if (!ERRNO_IO_RETRY(errno)) {
BGP_EVENT_ADD(peer, TCP_fatal_error);
SET_FLAG(status, BGP_IO_FATAL_ERR);
} else {
SET_FLAG(status, BGP_IO_TRANS_ERR);
}
break;
} else if (num != writenum) {
unsigned int msg_written = 0;
unsigned int ic = iovsz;
for (unsigned int i = 0; i < ic; i++) {
size_t ss = iov[i].iov_len;
if (ss > (unsigned int) num)
break;
msg_written++;
iovsz--;
writenum -= ss;
num -= ss;
}
total_written += msg_written;
assert(total_written < count);
memmove(&iov, &iov[msg_written],
sizeof(iov[0]) * iovsz);
streams = &streams[msg_written];
stream_forward_getp(streams[0], num);
iov[0].iov_base = stream_pnt(streams[0]);
iov[0].iov_len = STREAM_READABLE(streams[0]);
writenum -= num;
num = 0;
assert(writenum > 0);
} else {
total_written = strmsz;
}
} while (num != writenum);
/* Handle statistics */
for (unsigned int i = 0; i < total_written; i++) {
s = stream_fifo_pop(peer->obuf);
assert(s == ostreams[i]);
/* Retrieve BGP packet type. */
stream_set_getp(s, BGP_MARKER_SIZE + 2);
type = stream_getc(s);
switch (type) {
case BGP_MSG_OPEN:
atomic_fetch_add_explicit(&peer->open_out, 1,
memory_order_relaxed);
break;
case BGP_MSG_UPDATE:
atomic_fetch_add_explicit(&peer->update_out, 1,
memory_order_relaxed);
uo++;
break;
case BGP_MSG_NOTIFY:
atomic_fetch_add_explicit(&peer->notify_out, 1,
memory_order_relaxed);
/* Double start timer. */
peer->v_start *= 2;
/* Overflow check. */
if (peer->v_start >= (60 * 2))
peer->v_start = (60 * 2);
/*
* Handle Graceful Restart case where the state changes
* to Connect instead of Idle.
*/
BGP_EVENT_ADD(peer, BGP_Stop);
goto done;
case BGP_MSG_KEEPALIVE:
atomic_fetch_add_explicit(&peer->keepalive_out, 1,
memory_order_relaxed);
break;
case BGP_MSG_ROUTE_REFRESH_NEW:
case BGP_MSG_ROUTE_REFRESH_OLD:
atomic_fetch_add_explicit(&peer->refresh_out, 1,
memory_order_relaxed);
break;
case BGP_MSG_CAPABILITY:
atomic_fetch_add_explicit(&peer->dynamic_cap_out, 1,
memory_order_relaxed);
break;
}
stream_free(s);
ostreams[i] = NULL;
update_last_write = 1;
}
done : {
/*
* Update last_update if UPDATEs were written.
* Note: that these are only updated at end,
* not per message (i.e., per loop)
*/
if (uo)
atomic_store_explicit(&peer->last_update, bgp_clock(),
memory_order_relaxed);
/* If we TXed any flavor of packet */
if (update_last_write)
atomic_store_explicit(&peer->last_write, bgp_clock(),
memory_order_relaxed);
}
return status;
}
/*
* Reads a chunk of data from peer->fd into peer->ibuf_work.
*
* @return status flag (see top-of-file)
*/
static uint16_t bgp_read(struct peer *peer)
{
size_t readsize; // how many bytes we want to read
ssize_t nbytes; // how many bytes we actually read
uint16_t status = 0;
bgpd: Add BGP Extended message support Implement https://www.rfc-editor.org/rfc/rfc8654.txt ``` > | jq '."192.168.10.25".neighborCapabilities.extendedMessage' "advertisedAndReceived" ``` Another side is Bird: ``` BIRD 2.0.7 ready. Name Proto Table State Since Info v4 BGP --- up 19:39:15.689 Established BGP state: Established Neighbor address: 192.168.10.123 Neighbor AS: 65534 Local AS: 65025 Neighbor ID: 192.168.100.1 Local capabilities Multiprotocol AF announced: ipv4 Route refresh Extended message Graceful restart 4-octet AS numbers Enhanced refresh Long-lived graceful restart Neighbor capabilities Multiprotocol AF announced: ipv4 Route refresh Extended message Graceful restart 4-octet AS numbers ADD-PATH RX: ipv4 TX: Enhanced refresh Session: external AS4 Source address: 192.168.10.25 Hold timer: 140.139/180 Keepalive timer: 9.484/60 Channel ipv4 State: UP Table: master4 Preference: 100 Input filter: ACCEPT Output filter: ACCEPT Routes: 9 imported, 3 exported, 8 preferred Route change stats: received rejected filtered ignored accepted Import updates: 9 0 0 0 9 Import withdraws: 2 0 --- 2 0 Export updates: 11 8 0 --- 3 Export withdraws: 0 --- --- --- 0 BGP Next hop: 192.168.10.25 ``` Tested at least as well with to make sure it works with backward compat.: ExaBGP 4.0.2-1c737d99. 
Arista vEOS 4.21.14M Testing by injecint 10k routes with: ``` sharp install routes 172.16.0.1 nexthop 192.168.10.123 10000 ``` Before extended message support: ``` 2021/03/01 07:18:51 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:51 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:51 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:51 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:51 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:51 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:52 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:52 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:52 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:52 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:52 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:52 BGP: u1:s1 send UPDATE len 2186 (max message len: 4096) numpfx 427 2021/03/01 07:18:53 BGP: u1:s1 send UPDATE len 3421 (max message len: 4096) numpfx 674 ``` After extended message support: ``` 2021/03/01 07:20:11 BGP: u1:s1 send UPDATE len 50051 (max message len: 65535) numpfx 10000 ``` Signed-off-by: Donatas Abraitis <donatas.abraitis@gmail.com>
2021-02-25 18:46:49 +01:00
uint8_t ibw[peer->max_packet_size * BGP_READ_PACKET_MAX];
readsize = MIN(ringbuf_space(peer->ibuf_work), sizeof(ibw));
nbytes = read(peer->fd, ibw, readsize);
/* EAGAIN or EWOULDBLOCK; come back later */
if (nbytes < 0 && ERRNO_IO_RETRY(errno)) {
SET_FLAG(status, BGP_IO_TRANS_ERR);
/* Fatal error; tear down session */
} else if (nbytes < 0) {
flog_err(EC_BGP_UPDATE_RCV,
"%s [Error] bgp_read_packet error: %s", peer->host,
safe_strerror(errno));
if (peer->status == Established) {
if ((CHECK_FLAG(peer->flags, PEER_FLAG_GRACEFUL_RESTART)
|| CHECK_FLAG(peer->flags,
PEER_FLAG_GRACEFUL_RESTART_HELPER))
&& CHECK_FLAG(peer->sflags, PEER_STATUS_NSF_MODE)) {
peer->last_reset = PEER_DOWN_NSF_CLOSE_SESSION;
SET_FLAG(peer->sflags, PEER_STATUS_NSF_WAIT);
} else
peer->last_reset = PEER_DOWN_CLOSE_SESSION;
}
BGP_EVENT_ADD(peer, TCP_fatal_error);
SET_FLAG(status, BGP_IO_FATAL_ERR);
/* Received EOF / TCP session closed */
} else if (nbytes == 0) {
if (bgp_debug_neighbor_events(peer))
zlog_debug("%s [Event] BGP connection closed fd %d",
peer->host, peer->fd);
if (peer->status == Established) {
if ((CHECK_FLAG(peer->flags, PEER_FLAG_GRACEFUL_RESTART)
|| CHECK_FLAG(peer->flags,
PEER_FLAG_GRACEFUL_RESTART_HELPER))
&& CHECK_FLAG(peer->sflags, PEER_STATUS_NSF_MODE)) {
peer->last_reset = PEER_DOWN_NSF_CLOSE_SESSION;
SET_FLAG(peer->sflags, PEER_STATUS_NSF_WAIT);
} else
peer->last_reset = PEER_DOWN_CLOSE_SESSION;
}
BGP_EVENT_ADD(peer, TCP_connection_closed);
SET_FLAG(status, BGP_IO_FATAL_ERR);
} else {
assert(ringbuf_put(peer->ibuf_work, ibw, nbytes)
== (size_t)nbytes);
}
return status;
}
/*
* Called after we have read a BGP packet header. Validates marker, message
* type and packet length. If any of these aren't correct, sends a notify.
*
* Assumes that there are at least BGP_HEADER_SIZE readable bytes in the input
* buffer.
*/
static bool validate_header(struct peer *peer)
{
uint16_t size;
uint8_t type;
struct ringbuf *pkt = peer->ibuf_work;
static const uint8_t m_correct[BGP_MARKER_SIZE] = {
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
uint8_t m_rx[BGP_MARKER_SIZE] = {0x00};
if (ringbuf_peek(pkt, 0, m_rx, BGP_MARKER_SIZE) != BGP_MARKER_SIZE)
return false;
if (memcmp(m_correct, m_rx, BGP_MARKER_SIZE) != 0) {
bgp_notify_send(peer, BGP_NOTIFY_HEADER_ERR,
BGP_NOTIFY_HEADER_NOT_SYNC);
return false;
}
/* Get size and type in network byte order. */
ringbuf_peek(pkt, BGP_MARKER_SIZE, &size, sizeof(size));
ringbuf_peek(pkt, BGP_MARKER_SIZE + 2, &type, sizeof(type));
size = ntohs(size);
/* BGP type check. */
if (type != BGP_MSG_OPEN && type != BGP_MSG_UPDATE
&& type != BGP_MSG_NOTIFY && type != BGP_MSG_KEEPALIVE
&& type != BGP_MSG_ROUTE_REFRESH_NEW
&& type != BGP_MSG_ROUTE_REFRESH_OLD
&& type != BGP_MSG_CAPABILITY) {
if (bgp_debug_neighbor_events(peer))
zlog_debug("%s unknown message type 0x%02x", peer->host,
type);
bgp_notify_send_with_data(peer, BGP_NOTIFY_HEADER_ERR,
BGP_NOTIFY_HEADER_BAD_MESTYPE, &type,
1);
return false;
}
/* Minimum packet length check. */
bgpd: Add BGP Extended message support Implement https://www.rfc-editor.org/rfc/rfc8654.txt ``` > | jq '."192.168.10.25".neighborCapabilities.extendedMessage' "advertisedAndReceived" ``` Another side is Bird: ``` BIRD 2.0.7 ready. Name Proto Table State Since Info v4 BGP --- up 19:39:15.689 Established BGP state: Established Neighbor address: 192.168.10.123 Neighbor AS: 65534 Local AS: 65025 Neighbor ID: 192.168.100.1 Local capabilities Multiprotocol AF announced: ipv4 Route refresh Extended message Graceful restart 4-octet AS numbers Enhanced refresh Long-lived graceful restart Neighbor capabilities Multiprotocol AF announced: ipv4 Route refresh Extended message Graceful restart 4-octet AS numbers ADD-PATH RX: ipv4 TX: Enhanced refresh Session: external AS4 Source address: 192.168.10.25 Hold timer: 140.139/180 Keepalive timer: 9.484/60 Channel ipv4 State: UP Table: master4 Preference: 100 Input filter: ACCEPT Output filter: ACCEPT Routes: 9 imported, 3 exported, 8 preferred Route change stats: received rejected filtered ignored accepted Import updates: 9 0 0 0 9 Import withdraws: 2 0 --- 2 0 Export updates: 11 8 0 --- 3 Export withdraws: 0 --- --- --- 0 BGP Next hop: 192.168.10.25 ``` Tested at least as well with to make sure it works with backward compat.: ExaBGP 4.0.2-1c737d99. 
Arista vEOS 4.21.14M Testing by injecint 10k routes with: ``` sharp install routes 172.16.0.1 nexthop 192.168.10.123 10000 ``` Before extended message support: ``` 2021/03/01 07:18:51 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:51 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:51 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:51 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:51 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:51 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:52 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:52 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:52 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:52 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:52 BGP: u1:s1 send UPDATE len 4096 (max message len: 4096) numpfx 809 2021/03/01 07:18:52 BGP: u1:s1 send UPDATE len 2186 (max message len: 4096) numpfx 427 2021/03/01 07:18:53 BGP: u1:s1 send UPDATE len 3421 (max message len: 4096) numpfx 674 ``` After extended message support: ``` 2021/03/01 07:20:11 BGP: u1:s1 send UPDATE len 50051 (max message len: 65535) numpfx 10000 ``` Signed-off-by: Donatas Abraitis <donatas.abraitis@gmail.com>
2021-02-25 18:46:49 +01:00
if ((size < BGP_HEADER_SIZE) || (size > peer->max_packet_size)
|| (type == BGP_MSG_OPEN && size < BGP_MSG_OPEN_MIN_SIZE)
|| (type == BGP_MSG_UPDATE && size < BGP_MSG_UPDATE_MIN_SIZE)
|| (type == BGP_MSG_NOTIFY && size < BGP_MSG_NOTIFY_MIN_SIZE)
|| (type == BGP_MSG_KEEPALIVE && size != BGP_MSG_KEEPALIVE_MIN_SIZE)
|| (type == BGP_MSG_ROUTE_REFRESH_NEW
&& size < BGP_MSG_ROUTE_REFRESH_MIN_SIZE)
|| (type == BGP_MSG_ROUTE_REFRESH_OLD
&& size < BGP_MSG_ROUTE_REFRESH_MIN_SIZE)
|| (type == BGP_MSG_CAPABILITY
&& size < BGP_MSG_CAPABILITY_MIN_SIZE)) {
if (bgp_debug_neighbor_events(peer)) {
zlog_debug("%s bad message length - %d for %s",
peer->host, size,
type == 128 ? "ROUTE-REFRESH"
: bgp_type_str[(int)type]);
}
uint16_t nsize = htons(size);
bgp_notify_send_with_data(peer, BGP_NOTIFY_HEADER_ERR,
BGP_NOTIFY_HEADER_BAD_MESLEN,
(unsigned char *)&nsize, 2);
return false;
}
return true;
}