/*
 * frr/bfdd/bfd_packet.c
 *
 * (Extracted from a web blame view: 2572 lines, 66 KiB, C.)
 */

// SPDX-License-Identifier: GPL-2.0-or-later
/*********************************************************************
* Copyright 2017 Cumulus Networks, Inc. All rights reserved.
*
* bfd_packet.c: implements the BFD protocol packet handling.
*
* Authors
* -------
* Shrijeet Mukherjee [shm@cumulusnetworks.com]
* Kanna Rajagopal [kanna@cumulusnetworks.com]
* Radhika Mahankali [Radhika@cumulusnetworks.com]
*/
#include <zebra.h>
#include <sys/ioctl.h>
#ifdef GNU_LINUX
#include <linux/filter.h>
#include <linux/seg6.h>
#include <linux/ipv6.h>
#endif
#ifdef BFD_LINUX
#include <linux/if_packet.h>
#include <linux/seg6.h>
#include <linux/ipv6.h>
#endif /* BFD_LINUX */
#include <netinet/if_ether.h>
#include <netinet/udp.h>
#include <netinet/ip6.h>
#include <ifaddrs.h>
#include "lib/sockopt.h"
#include "lib/checksum.h"
#include "lib/network.h"
#include "bfd.h"
#define BUF_SIZ 1024
#define SOCK_OPT_PRIO_HIGH 6
/*
 * Prototypes
 *
 * Forward declarations so the functions can be referenced before the
 * point in this file where they are defined.
 */
/* Echo packet receive handler, invoked from bfd_recv_cb(). */
static int ptm_bfd_process_echo_pkt(struct bfd_vrf_global *bvrf, int s);
/* Control packet transmit helper; `port` may be NULL to use the default. */
int _ptm_bfd_send(struct bfd_session *bs, uint16_t *port, const void *data,
size_t datalen);
/* Re-arms the read event of the socket that just fired. */
static void bfd_sd_reschedule(struct bfd_vrf_global *bvrf, int sd);
/* recvmsg() wrappers that also extract TTL, interface and addresses. */
ssize_t bfd_recv_ipv4(int sd, uint8_t *msgbuf, size_t msgbuflen, uint8_t *ttl,
ifindex_t *ifindex, struct sockaddr_any *local,
struct sockaddr_any *peer);
ssize_t bfd_recv_ipv6(int sd, uint8_t *msgbuf, size_t msgbuflen, uint8_t *ttl,
ifindex_t *ifindex, struct sockaddr_any *local,
struct sockaddr_any *peer);
int bp_udp_send(int sd, uint8_t ttl, uint8_t *data, size_t datalen,
struct sockaddr *to, socklen_t tolen);
int bp_bfd_echo_in(struct bfd_vrf_global *bvrf, int sd, uint8_t *ttl,
uint32_t *my_discr, uint64_t *my_rtt);
static int ptm_bfd_reflector_process_init_packet(struct bfd_vrf_global *bvrf, int s);
int _ptm_sbfd_init_send(struct bfd_session *bs, const void *data, size_t datalen);
/* The following helpers are only declared when BFD_LINUX is defined. */
#ifdef BFD_LINUX
static int bp_raw_sbfd_red_send(int sd, uint8_t *data, size_t datalen, uint16_t family,
struct in6_addr *out_sip, struct in6_addr *sip,
struct in6_addr *dip, uint16_t src_port, uint16_t dst_port,
uint8_t seg_num, struct in6_addr *segment_list);
static ssize_t bfd_recv_ipv4_fp(int sd, uint8_t *msgbuf, size_t msgbuflen, uint8_t *ttl,
ifindex_t *ifindex, struct sockaddr_any *local,
struct sockaddr_any *peer);
static void bfd_peer_mac_set(int sd, struct bfd_session *bfd, struct sockaddr_any *peer,
struct interface *ifp);
int bp_udp_send_fp(int sd, uint8_t *data, size_t datalen,
struct bfd_session *bfd);
ssize_t bfd_recv_fp_echo(int sd, uint8_t *msgbuf, size_t msgbuflen,
uint8_t *ttl, ifindex_t *ifindex,
struct sockaddr_any *local, struct sockaddr_any *peer);
#endif
/* socket related prototypes */
static void bp_set_ipopts(int sd);
static void bp_bind_ip(int sd, uint16_t port);
static void bp_set_ipv6opts(int sd);
static void bp_bind_ipv6(int sd, uint16_t port);
/*
* Functions
*/
/*
 * Send a BFD control packet to the peer of a session.
 *
 * bs
 *   Session providing the socket (`bs->sock`), the peer address
 *   (`bs->key.peer`) and the flags that select IPv4/IPv6 and
 *   single-hop/multi-hop.
 *
 * port
 *   Optional destination UDP port, already in network byte order (it is
 *   stored into the socket address untouched). When NULL, the multi-hop
 *   or single-hop default destination port is used.
 *
 * data, datalen
 *   Buffer handed to sendto().
 *
 * Returns:
 *   0 when sendto() transmitted at least one byte (a short write is only
 *   logged), -1 when sendto() returned zero or an error.
 */
int _ptm_bfd_send(struct bfd_session *bs, uint16_t *port, const void *data,
		  size_t datalen)
{
	struct sockaddr_in6 sin6;
	struct sockaddr_in sin;
	struct sockaddr *sa;
	socklen_t slen;
	uint16_t dport;
	ssize_t rv;

	/* Destination port: caller override, else the default for the mode. */
	if (port)
		dport = *port;
	else if (CHECK_FLAG(bs->flags, BFD_SESS_FLAG_MH))
		dport = htons(BFD_DEF_MHOP_DEST_PORT);
	else
		dport = htons(BFD_DEFDESTPORT);

	if (CHECK_FLAG(bs->flags, BFD_SESS_FLAG_IPV6)) {
		memset(&sin6, 0, sizeof(sin6));
		sin6.sin6_family = AF_INET6;
		memcpy(&sin6.sin6_addr, &bs->key.peer, sizeof(sin6.sin6_addr));
		/* Link-local destinations need the outgoing interface. */
		if (bs->ifp && IN6_IS_ADDR_LINKLOCAL(&sin6.sin6_addr))
			sin6.sin6_scope_id = bs->ifp->ifindex;
		sin6.sin6_port = dport;

		sa = (struct sockaddr *)&sin6;
		slen = sizeof(sin6);
	} else {
		memset(&sin, 0, sizeof(sin));
		sin.sin_family = AF_INET;
		memcpy(&sin.sin_addr, &bs->key.peer, sizeof(sin.sin_addr));
		sin.sin_port = dport;

		sa = (struct sockaddr *)&sin;
		slen = sizeof(sin);
	}

#ifdef HAVE_STRUCT_SOCKADDR_SA_LEN
	sa->sa_len = slen;
#endif /* HAVE_STRUCT_SOCKADDR_SA_LEN */

	rv = sendto(bs->sock, data, datalen, 0, sa, slen);
	if (rv <= 0) {
		if (bglobal.debug_network)
			zlog_debug("packet-send: send failure: %s",
				   strerror(errno));
		return -1;
	}

	/*
	 * A short write is not an error, so errno carries no information
	 * here: report the byte counts instead.
	 */
	if (rv < (ssize_t)datalen) {
		if (bglobal.debug_network)
			zlog_debug("packet-send: send partial: sent %zd of %zu bytes",
				   rv, datalen);
	}

	return 0;
}
#ifdef BFD_LINUX
/*
* Compute the UDP checksum.
*
* Checksum is not set in the packet, just computed.
*
* pkt
* Packet, fully filled out except for checksum field.
*
* pktsize
* sizeof(*pkt)
*
* ip
* IP address that pkt will be transmitted from and to.
*
* Returns:
* Checksum in network byte order.
*/
static uint16_t bfd_pkt_checksum(struct udphdr *pkt, size_t pktsize,
struct in6_addr *ip, sa_family_t family)
{
uint16_t chksum;
pkt->check = 0;
if (family == AF_INET6) {
struct ipv6_ph ph = {};
memcpy(&ph.src, ip, sizeof(ph.src));
memcpy(&ph.dst, ip, sizeof(ph.dst));
ph.ulpl = htons(pktsize);
ph.next_hdr = IPPROTO_UDP;
chksum = in_cksum_with_ph6(&ph, pkt, pktsize);
} else {
struct ipv4_ph ph = {};
memcpy(&ph.src, ip, sizeof(ph.src));
memcpy(&ph.dst, ip, sizeof(ph.dst));
ph.proto = IPPROTO_UDP;
ph.len = htons(pktsize);
chksum = in_cksum_with_ph4(&ph, pkt, pktsize);
}
return chksum;
}
/*
 * Build a complete Ethernet/IPv4/UDP/BFD-echo frame and pass it to
 * bp_udp_send_fp(), so that the echo is looped in the forwarding plane of
 * the peer router instead of going up the peer's stack to BFD. No echo is
 * sent until the peer's MAC address has been learned.
 *
 * Frame contents:
 *   - destination MAC is the peer's MAC, source MAC is our interface's;
 *   - source and destination IP are both the session's local address;
 *   - source and destination UDP port are both the echo port.
 *
 * Only IPv4 is currently supported.
 */
void ptm_bfd_echo_fp_snd(struct bfd_session *bfd)
{
	static char sendbuff[100];
	struct bfd_vrf_global *bvrf = bfd_vrf_look_by_session(bfd);
	/* The headers are laid out back to back inside sendbuff. */
	struct ethhdr *eth = (struct ethhdr *)sendbuff;
	struct iphdr *iph = (struct iphdr *)(sendbuff + sizeof(struct ethhdr));
	struct udphdr *uh =
		(struct udphdr *)(sendbuff + sizeof(struct ethhdr) +
				  sizeof(struct iphdr));
	struct bfd_echo_pkt *beph =
		(struct bfd_echo_pkt *)(sendbuff + sizeof(struct ethhdr) +
					sizeof(struct iphdr) +
					sizeof(struct udphdr));
	const size_t udp_len =
		sizeof(struct udphdr) + sizeof(struct bfd_echo_pkt);
	const size_t ip_len = sizeof(struct iphdr) + udp_len;
	const size_t frame_len = sizeof(struct ethhdr) + ip_len;
	struct timeval now;

	if (bvrf == NULL)
		return;

	/* Without the peer's MAC the frame cannot be addressed. */
	if (!CHECK_FLAG(bfd->flags, BFD_SESS_FLAG_MAC_SET))
		return;

	SET_FLAG(bfd->flags, BFD_SESS_FLAG_ECHO_ACTIVE);

	memset(sendbuff, 0, sizeof(sendbuff));

	/* Ethernet header. */
	memcpy(eth->h_source, bfd->ifp->hw_addr, sizeof(eth->h_source));
	memcpy(eth->h_dest, bfd->peer_hw_addr, sizeof(eth->h_dest));
	eth->h_proto = htons(ETH_P_IP);

	/* IPv4 header: source and destination are both our own address. */
	iph->ihl = sizeof(struct ip) >> 2;
	iph->version = IPVERSION;
	iph->tos = IPTOS_PREC_INTERNETCONTROL;
	iph->id = (uint16_t)frr_weak_random();
	iph->ttl = BFD_TTL_VAL;
	iph->protocol = IPPROTO_UDP;
	memcpy(&iph->saddr, &bfd->local_address.sa_sin.sin_addr,
	       sizeof(bfd->local_address.sa_sin.sin_addr));
	memcpy(&iph->daddr, &bfd->local_address.sa_sin.sin_addr,
	       sizeof(bfd->local_address.sa_sin.sin_addr));

	/* UDP header. */
	uh->source = htons(BFD_DEF_ECHO_PORT);
	uh->dest = htons(BFD_DEF_ECHO_PORT);

	/* BFD echo payload. */
	beph->ver = BFD_ECHO_VERSION;
	beph->len = BFD_ECHO_PKT_LEN;
	beph->my_discr = htonl(bfd->discrs.my_discr);

	/* RTT calculation: the transmit time travels inside the packet. */
	monotime(&now);
	beph->time_sent_sec = htobe64(now.tv_sec);
	beph->time_sent_usec = htobe64(now.tv_usec);

	/* Lengths and checksums, innermost header first. */
	uh->len = htons(udp_len);
	uh->check = bfd_pkt_checksum(uh, udp_len,
				     (struct in6_addr *)&iph->saddr, AF_INET);
	iph->tot_len = htons(ip_len);
	iph->check = in_cksum((const void *)iph, sizeof(struct iphdr));

	if (bp_udp_send_fp(bvrf->bg_echo, (uint8_t *)sendbuff, frame_len,
			   bfd) == -1)
		return;

	bfd->stats.tx_echo_pkt++;
}
#endif
void ptm_bfd_echo_snd(struct bfd_session *bfd)
{
struct sockaddr *sa;
socklen_t salen;
int sd;
struct bfd_echo_pkt bep;
struct sockaddr_in sin;
struct sockaddr_in6 sin6;
struct bfd_vrf_global *bvrf = bfd_vrf_look_by_session(bfd);
if (!bvrf)
return;
if (!CHECK_FLAG(bfd->flags, BFD_SESS_FLAG_ECHO_ACTIVE))
SET_FLAG(bfd->flags, BFD_SESS_FLAG_ECHO_ACTIVE);
memset(&bep, 0, sizeof(bep));
bep.ver = BFD_ECHO_VERSION;
bep.len = BFD_ECHO_PKT_LEN;
bep.my_discr = htonl(bfd->discrs.my_discr);
if (CHECK_FLAG(bfd->flags, BFD_SESS_FLAG_IPV6)) {
if (bvrf->bg_echov6 == -1)
return;
sd = bvrf->bg_echov6;
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_family = AF_INET6;
memcpy(&sin6.sin6_addr, &bfd->key.peer, sizeof(sin6.sin6_addr));
if (bfd->ifp && IN6_IS_ADDR_LINKLOCAL(&sin6.sin6_addr))
sin6.sin6_scope_id = bfd->ifp->ifindex;
sin6.sin6_port = htons(BFD_DEF_ECHO_PORT);
#ifdef HAVE_STRUCT_SOCKADDR_SA_LEN
sin6.sin6_len = sizeof(sin6);
#endif /* HAVE_STRUCT_SOCKADDR_SA_LEN */
sa = (struct sockaddr *)&sin6;
salen = sizeof(sin6);
} else {
sd = bvrf->bg_echo;
memset(&sin, 0, sizeof(sin));
sin.sin_family = AF_INET;
memcpy(&sin.sin_addr, &bfd->key.peer, sizeof(sin.sin_addr));
sin.sin_port = htons(BFD_DEF_ECHO_PORT);
#ifdef HAVE_STRUCT_SOCKADDR_SA_LEN
sin.sin_len = sizeof(sin);
#endif /* HAVE_STRUCT_SOCKADDR_SA_LEN */
sa = (struct sockaddr *)&sin;
salen = sizeof(sin);
}
if (bp_udp_send(sd, BFD_TTL_VAL, (uint8_t *)&bep, sizeof(bep), sa,
salen)
== -1)
return;
bfd->stats.tx_echo_pkt++;
}
/*
 * Handle one packet received on an echo socket.
 *
 * bp_bfd_echo_in() reads and parses the packet and yields the
 * discriminator and round-trip time it carried. The matching session then
 * gets its RTT samples, receive counter and echo detection timeout
 * refreshed; SBFD echo sessions are additionally driven to UP.
 *
 * Returns:
 *   0 when bp_bfd_echo_in() returned -1 or the packet was processed,
 *   -1 when no session matches or echo is not active on the session.
 */
static int ptm_bfd_process_echo_pkt(struct bfd_vrf_global *bvrf, int s)
{
	struct bfd_session *bs;
	uint32_t discr = 0;
	uint64_t rtt = 0;
	uint8_t ttl = 0;

	/* Receive and parse the echo packet. */
	if (bp_bfd_echo_in(bvrf, s, &ttl, &discr, &rtt) == -1)
		return 0;

	/* The discriminator carried by the echo identifies the session. */
	bs = bfd_id_lookup(discr);
	if (bs == NULL) {
		if (bglobal.debug_network)
			zlog_debug("echo-packet: no matching session (id:%u)",
				   discr);
		return -1;
	}

	if (!CHECK_FLAG(bs->flags, BFD_SESS_FLAG_ECHO_ACTIVE)) {
		if (bglobal.debug_network)
			zlog_debug("echo-packet: echo disabled [%s] (id:%u)",
				   bs_to_string(bs), discr);
		return -1;
	}

	/* RTT calculation: store the sample in the circular sample array. */
	if (rtt != 0) {
		bs->rtt[bs->rtt_index] = rtt;
		bs->rtt_index++;
		if (bs->rtt_index >= BFD_RTT_SAMPLE)
			bs->rtt_index = 0;
		if (bs->rtt_valid < BFD_RTT_SAMPLE)
			bs->rtt_valid++;
	}

	bs->stats.rx_echo_pkt++;

	/* Compute detect time. */
	bs->echo_detect_TO = bs->remote_detect_mult * bs->echo_xmt_TO;

	if (bs->bfd_mode != BFD_MODE_TYPE_SBFD_ECHO) {
		/* Update bfd-echo receive timeout. */
		if (bs->echo_detect_TO > 0)
			bfd_echo_recvtimer_update(bs);
		return 0;
	}

	/* Update sbfd-echo session state. */
	sbfd_echo_state_handler(bs, PTM_BFD_UP);
	if (bs->echo_xmt_TO != bs->timers.desired_min_echo_tx) {
		bs->echo_xmt_TO = bs->timers.desired_min_echo_tx;
		/* Reset the transmit timer after going UP. */
		ptm_bfd_start_xmt_timer(bs, true);
	}

	/* SBFD echo uses the local detect multiplier. */
	bs->echo_detect_TO = bs->detect_mult * bs->echo_xmt_TO;

	/* Update sbfd echo receive timeout. */
	if (bs->echo_detect_TO > 0)
		sbfd_echo_recvtimer_update(bs);

	return 0;
}
/*
 * Build a BFD control packet from the session state and send it.
 *
 * bfd
 *   Session whose state, discriminators and timers are encoded.
 *
 * fbit
 *   Value for the Final bit. The Poll bit is only filled in from
 *   `bfd->polling` when fbit is zero.
 *
 * The control packet transmit counter is incremented only when
 * _ptm_bfd_send() returns 0.
 */
void ptm_bfd_snd(struct bfd_session *bfd, int fbit)
{
	struct bfd_pkt cp = {};
	uint32_t min_tx;
	uint32_t min_rx;

	/*
	 * Set fields according to section 6.5.7.
	 * NOTE(review): RFC 5880 describes control packet transmission in
	 * section 6.8.7 - confirm the intended reference.
	 */
	cp.diag = bfd->local_diag;
	BFD_SETVER(cp.diag, BFD_VERSION);

	BFD_SETSTATE(cp.flags, bfd->ses_state);
	if (CHECK_FLAG(bfd->flags, BFD_SESS_FLAG_CBIT))
		BFD_SETCBIT(cp.flags, BFD_CBIT);
	BFD_SETDEMANDBIT(cp.flags, BFD_DEF_DEMAND);

	/*
	 * Polling and Final can't be set at the same time.
	 *
	 * RFC 5880, Section 6.5.
	 */
	BFD_SETFBIT(cp.flags, fbit);
	if (fbit == 0)
		BFD_SETPBIT(cp.flags, bfd->polling);

	cp.detect_mult = bfd->detect_mult;
	cp.len = BFD_PKT_LEN;
	cp.discrs.my_discr = htonl(bfd->discrs.my_discr);
	cp.discrs.remote_discr = htonl(bfd->discrs.remote_discr);

	if (bfd->polling) {
		min_tx = bfd->timers.desired_min_tx;
		min_rx = bfd->timers.required_min_rx;
	} else {
		/*
		 * Newly configured timers may only be announced while
		 * polling; otherwise keep advertising the timers currently
		 * in use so the peer is not confused about our timing. See
		 * `bs_final_handler` for more information.
		 */
		min_tx = bfd->cur_timers.desired_min_tx;
		min_rx = bfd->cur_timers.required_min_rx;
	}
	cp.timers.desired_min_tx = htonl(min_tx);
	cp.timers.required_min_rx = htonl(min_rx);
	cp.timers.required_min_echo = htonl(bfd->timers.required_min_echo_rx);

	if (_ptm_bfd_send(bfd, NULL, &cp, BFD_PKT_LEN) == 0)
		bfd->stats.tx_ctrl_pkt++;
}
#ifdef BFD_LINUX
/*
 * Receive an IPv4 echo packet that was looped back by the peer's
 * forwarding plane.
 *
 * The data read from `sd` is parsed as an Ethernet header followed by a
 * fixed-size IPv4 header and a UDP header. The IP checksum, the TTL and
 * the UDP checksum are verified before the packet is accepted. As a side
 * effect the checksum fields inside `msgbuf` are zeroed.
 *
 * sd
 *   Socket to read from (non-blocking read).
 *
 * msgbuf, msgbuflen
 *   Receive buffer and its size.
 *
 * ttl
 *   Set to the TTL found in the IP header.
 *
 * ifindex
 *   Set to the interface index reported by the socket.
 *
 * local, peer
 *   Filled with the IP source and destination address respectively.
 *
 * Returns:
 *   Number of bytes received, or -1 on receive error or when any of the
 *   validations fails.
 */
static ssize_t bfd_recv_ipv4_fp(int sd, uint8_t *msgbuf, size_t msgbuflen, uint8_t *ttl,
				ifindex_t *ifindex, struct sockaddr_any *local,
				struct sockaddr_any *peer)
{
	const size_t l3_off = sizeof(struct ethhdr);
	const size_t l4_off = l3_off + sizeof(struct iphdr);
	ssize_t mlen;
	struct sockaddr_ll msgaddr;
	struct msghdr msghdr;
	struct iovec iov[1];
	uint16_t recv_checksum;
	uint16_t checksum;
	uint16_t udp_len;
	struct iphdr *ip;
	struct udphdr *uh;

	/* Prepare the recvmsg params. */
	iov[0].iov_base = msgbuf;
	iov[0].iov_len = msgbuflen;

	memset(&msghdr, 0, sizeof(msghdr));
	msghdr.msg_name = &msgaddr;
	msghdr.msg_namelen = sizeof(msgaddr);
	msghdr.msg_iov = iov;
	msghdr.msg_iovlen = 1;

	mlen = recvmsg(sd, &msghdr, MSG_DONTWAIT);
	if (mlen == -1) {
		/*
		 * Transient conditions are expected on a non-blocking read
		 * and must not be logged as errors. All three have to be
		 * excluded, hence `&&`.
		 */
		if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR)
			zlog_err("%s: recv failed: %s", __func__,
				 strerror(errno));

		return -1;
	}

	/* Never parse headers that were not actually received. */
	if ((size_t)mlen < l4_off + sizeof(struct udphdr)) {
		if (bglobal.debug_network)
			zlog_debug("%s: packet too short (%zd bytes)",
				   __func__, mlen);
		return -1;
	}

	ip = (struct iphdr *)(msgbuf + l3_off);

	/* verify ip checksum */
	recv_checksum = ip->check;
	ip->check = 0;
	checksum = in_cksum((const void *)ip, sizeof(struct iphdr));
	if (recv_checksum != checksum) {
		if (bglobal.debug_network)
			zlog_debug(
				"%s: invalid iphdr checksum expected 0x%x rcvd 0x%x",
				__func__, checksum, recv_checksum);
		return -1;
	}

	/*
	 * NOTE(review): 254 is presumably the transmit TTL minus the one
	 * hop through the peer's forwarding plane - confirm.
	 */
	*ttl = ip->ttl;
	if (*ttl != 254) {
		if (bglobal.debug_network)
			zlog_debug("%s: invalid TTL: %u", __func__, *ttl);
		return -1;
	}

	local->sa_sin.sin_family = AF_INET;
	memcpy(&local->sa_sin.sin_addr, &ip->saddr, sizeof(ip->saddr));
	peer->sa_sin.sin_family = AF_INET;
	memcpy(&peer->sa_sin.sin_addr, &ip->daddr, sizeof(ip->daddr));

	*ifindex = msgaddr.sll_ifindex;

	uh = (struct udphdr *)(msgbuf + l4_off);

	/*
	 * The UDP length comes from the wire: make sure the checksum below
	 * only covers bytes that were really received.
	 */
	udp_len = ntohs(uh->len);
	if (udp_len < sizeof(struct udphdr) ||
	    udp_len > (size_t)mlen - l4_off) {
		if (bglobal.debug_network)
			zlog_debug("%s: invalid udp length %u (%zd bytes received)",
				   __func__, udp_len, mlen);
		return -1;
	}

	/* verify udp checksum */
	recv_checksum = uh->check;
	uh->check = 0;
	checksum = bfd_pkt_checksum(uh, udp_len,
				    (struct in6_addr *)&ip->saddr, AF_INET);
	if (recv_checksum != checksum) {
		if (bglobal.debug_network)
			zlog_debug(
				"%s: invalid udphdr checksum expected 0x%x rcvd 0x%x",
				__func__, checksum, recv_checksum);
		return -1;
	}

	return mlen;
}
#endif
/*
 * Receive one IPv4 packet and collect its metadata from the ancillary
 * data returned by recvmsg().
 *
 * sd
 *   Socket to read from (non-blocking read).
 *
 * msgbuf, msgbuflen
 *   Receive buffer and its size.
 *
 * ttl
 *   Set to the received TTL when the kernel reports it.
 *
 * ifindex
 *   Set to the receiving interface. When it is still IFINDEX_INTERNAL
 *   after the ancillary data was walked, getsockopt_ifindex() is asked.
 *
 * local
 *   Filled with the destination address of the packet when reported.
 *
 * peer
 *   Filled with the source address of the packet.
 *
 * Returns:
 *   Number of bytes received, or -1 on receive error or when the reported
 *   TTL is out of range.
 */
ssize_t bfd_recv_ipv4(int sd, uint8_t *msgbuf, size_t msgbuflen, uint8_t *ttl,
		      ifindex_t *ifindex, struct sockaddr_any *local,
		      struct sockaddr_any *peer)
{
	uint8_t ancillary[255];
	struct sockaddr_in from;
	struct msghdr mh;
	struct iovec vec[1];
	struct cmsghdr *hdr;
	ssize_t nread;

	/* Describe the payload, source address and control buffers. */
	vec[0].iov_base = msgbuf;
	vec[0].iov_len = msgbuflen;

	memset(&mh, 0, sizeof(mh));
	mh.msg_name = &from;
	mh.msg_namelen = sizeof(from);
	mh.msg_iov = vec;
	mh.msg_iovlen = 1;
	mh.msg_control = ancillary;
	mh.msg_controllen = sizeof(ancillary);

	nread = recvmsg(sd, &mh, MSG_DONTWAIT);
	if (nread == -1) {
		if (errno != EAGAIN)
			zlog_err("ipv4-recv: recv failed: %s", strerror(errno));

		return -1;
	}

	/* The source address of the packet is our peer. */
	peer->sa_sin = from;

	/* Walk the control messages looking for TTL and destination. */
	for (hdr = CMSG_FIRSTHDR(&mh); hdr != NULL;
	     hdr = CMSG_NXTHDR(&mh, hdr)) {
		if (hdr->cmsg_level != IPPROTO_IP)
			continue;

		switch (hdr->cmsg_type) {
#ifdef BFD_LINUX
		case IP_TTL: {
			uint32_t value;

			memcpy(&value, CMSG_DATA(hdr), sizeof(value));
			if (value > 255) {
				if (bglobal.debug_network)
					zlog_debug("%s: invalid TTL: %u",
						   __func__, value);
				return -1;
			}

			*ttl = value;
			break;
		}

		case IP_PKTINFO: {
			struct in_pktinfo *info =
				(struct in_pktinfo *)CMSG_DATA(hdr);

			if (info == NULL)
				break;

			local->sa_sin.sin_family = AF_INET;
			local->sa_sin.sin_addr = info->ipi_addr;
#ifdef HAVE_STRUCT_SOCKADDR_SA_LEN
			local->sa_sin.sin_len = sizeof(local->sa_sin);
#endif /* HAVE_STRUCT_SOCKADDR_SA_LEN */

			*ifindex = info->ipi_ifindex;
			break;
		}
#endif /* BFD_LINUX */
#ifdef BFD_BSD
		case IP_RECVTTL: {
			memcpy(ttl, CMSG_DATA(hdr), sizeof(*ttl));
			break;
		}

		case IP_RECVDSTADDR: {
			struct in_addr dst;

			memcpy(&dst, CMSG_DATA(hdr), sizeof(dst));
			local->sa_sin.sin_family = AF_INET;
			local->sa_sin.sin_addr = dst;
#ifdef HAVE_STRUCT_SOCKADDR_SA_LEN
			local->sa_sin.sin_len = sizeof(local->sa_sin);
#endif /* HAVE_STRUCT_SOCKADDR_SA_LEN */
			break;
		}
#endif /* BFD_BSD */

		default:
			/*
			 * On *BSDs the IP_RECVIF header ends up here and is
			 * skipped; getsockopt_ifindex() below takes care of
			 * it.
			 */
			break;
		}
	}

	/* OS agnostic way of getting the interface index. */
	if (*ifindex == IFINDEX_INTERNAL)
		*ifindex = getsockopt_ifindex(AF_INET, &mh);

	return nread;
}
/*
 * Receive one IPv6 packet and collect its metadata from the ancillary
 * data returned by recvmsg().
 *
 * sd
 *   Socket to read from (non-blocking read).
 *
 * msgbuf, msgbuflen
 *   Receive buffer and its size.
 *
 * ttl
 *   Set to the received hop limit when the kernel reports it.
 *
 * ifindex
 *   Set to the receiving interface when packet info is reported.
 *
 * local
 *   Filled with the destination address of the packet when reported.
 *
 * peer
 *   Filled with the source address of the packet.
 *
 * Link-local `peer`/`local` addresses get their scope ID set to the
 * receiving interface.
 *
 * Returns:
 *   Number of bytes received, or -1 on receive error or when the reported
 *   hop limit is out of range.
 */
ssize_t bfd_recv_ipv6(int sd, uint8_t *msgbuf, size_t msgbuflen, uint8_t *ttl,
		      ifindex_t *ifindex, struct sockaddr_any *local,
		      struct sockaddr_any *peer)
{
	uint8_t ancillary[255];
	struct sockaddr_in6 from6;
	struct msghdr mh;
	struct iovec vec[1];
	struct cmsghdr *hdr;
	ssize_t nread;

	/* Describe the payload, source address and control buffers. */
	vec[0].iov_base = msgbuf;
	vec[0].iov_len = msgbuflen;

	memset(&mh, 0, sizeof(mh));
	mh.msg_name = &from6;
	mh.msg_namelen = sizeof(from6);
	mh.msg_iov = vec;
	mh.msg_iovlen = 1;
	mh.msg_control = ancillary;
	mh.msg_controllen = sizeof(ancillary);

	nread = recvmsg(sd, &mh, MSG_DONTWAIT);
	if (nread == -1) {
		if (errno != EAGAIN)
			zlog_err("ipv6-recv: recv failed: %s", strerror(errno));

		return -1;
	}

	/* The source address of the packet is our peer. */
	peer->sa_sin6 = from6;

	/* Walk the control messages looking for hop limit and packet info. */
	for (hdr = CMSG_FIRSTHDR(&mh); hdr != NULL;
	     hdr = CMSG_NXTHDR(&mh, hdr)) {
		if (hdr->cmsg_level != IPPROTO_IPV6)
			continue;

		switch (hdr->cmsg_type) {
		case IPV6_HOPLIMIT: {
			uint32_t hops;

			memcpy(&hops, CMSG_DATA(hdr), sizeof(hops));
			if (hops > 255) {
				if (bglobal.debug_network)
					zlog_debug("%s: invalid TTL: %u",
						   __func__, hops);
				return -1;
			}

			*ttl = hops;
			break;
		}

		case IPV6_PKTINFO: {
			struct in6_pktinfo *info =
				(struct in6_pktinfo *)CMSG_DATA(hdr);

			if (info == NULL)
				break;

			local->sa_sin6.sin6_family = AF_INET6;
			local->sa_sin6.sin6_addr = info->ipi6_addr;
#ifdef HAVE_STRUCT_SOCKADDR_SA_LEN
			local->sa_sin6.sin6_len = sizeof(local->sa_sin6);
#endif /* HAVE_STRUCT_SOCKADDR_SA_LEN */

			*ifindex = info->ipi6_ifindex;

			/* Set scope ID for link local addresses. */
			if (IN6_IS_ADDR_LINKLOCAL(&peer->sa_sin6.sin6_addr))
				peer->sa_sin6.sin6_scope_id = *ifindex;
			if (IN6_IS_ADDR_LINKLOCAL(&local->sa_sin6.sin6_addr))
				local->sa_sin6.sin6_scope_id = *ifindex;
			break;
		}

		default:
			break;
		}
	}

	return nread;
}
/*
 * Re-arm the read event of the socket that just fired.
 *
 * `sd` is compared against the sockets of `bvrf` in a fixed order; the
 * first match has its pending event cancelled and a new read event for
 * bfd_recv_cb() installed in the matching `bg_ev` slot:
 *
 *   0: bg_shop    1: bg_mhop    2: bg_shop6    3: bg_mhop6
 *   4: bg_echo    5: bg_echov6  6: bg_initv6
 *
 * Nothing happens when `sd` matches none of them.
 */
static void bfd_sd_reschedule(struct bfd_vrf_global *bvrf, int sd)
{
	if (sd == bvrf->bg_shop) {
		EVENT_OFF(bvrf->bg_ev[0]);
		event_add_read(master, bfd_recv_cb, bvrf, bvrf->bg_shop,
			       &bvrf->bg_ev[0]);
		return;
	}

	if (sd == bvrf->bg_mhop) {
		EVENT_OFF(bvrf->bg_ev[1]);
		event_add_read(master, bfd_recv_cb, bvrf, bvrf->bg_mhop,
			       &bvrf->bg_ev[1]);
		return;
	}

	if (sd == bvrf->bg_shop6) {
		EVENT_OFF(bvrf->bg_ev[2]);
		event_add_read(master, bfd_recv_cb, bvrf, bvrf->bg_shop6,
			       &bvrf->bg_ev[2]);
		return;
	}

	if (sd == bvrf->bg_mhop6) {
		EVENT_OFF(bvrf->bg_ev[3]);
		event_add_read(master, bfd_recv_cb, bvrf, bvrf->bg_mhop6,
			       &bvrf->bg_ev[3]);
		return;
	}

	if (sd == bvrf->bg_echo) {
		EVENT_OFF(bvrf->bg_ev[4]);
		event_add_read(master, bfd_recv_cb, bvrf, bvrf->bg_echo,
			       &bvrf->bg_ev[4]);
		return;
	}

	if (sd == bvrf->bg_echov6) {
		EVENT_OFF(bvrf->bg_ev[5]);
		event_add_read(master, bfd_recv_cb, bvrf, bvrf->bg_echov6,
			       &bvrf->bg_ev[5]);
		return;
	}

	if (sd == bvrf->bg_initv6) {
		EVENT_OFF(bvrf->bg_ev[6]);
		event_add_read(master, bfd_recv_cb, bvrf, bvrf->bg_initv6, &bvrf->bg_ev[6]);
	}
}
/*
 * Emit a "control-packet:" debug line describing why a packet was
 * rejected, decorated with whatever is known about where it came from.
 *
 * mhop
 *   Whether the packet arrived on a multi-hop socket.
 *
 * peer, local
 *   Packet addresses; each is printed only when its family is set.
 *
 * ifindex
 *   Receiving interface; printed unless it is IFINDEX_INTERNAL.
 *
 * vrfid
 *   VRF of the packet; printed unless it is VRF_DEFAULT.
 *
 * fmt, ...
 *   printf-style description of the problem.
 *
 * Nothing is formatted or logged when network debugging is disabled.
 */
PRINTFRR(6, 7)
static void cp_debug(bool mhop, struct sockaddr_any *peer,
		     struct sockaddr_any *local, ifindex_t ifindex,
		     vrf_id_t vrfid, const char *fmt, ...)
{
	char peerstr[128], localstr[128], portstr[64], vrfstr[64];
	char msg[512];
	va_list ap;

	/* Skip all the formatting work when debugging is off. */
	if (!bglobal.debug_network)
		return;

	/* Optional decorations default to empty strings. */
	peerstr[0] = '\0';
	localstr[0] = '\0';
	portstr[0] = '\0';
	vrfstr[0] = '\0';

	if (peer->sa_sin.sin_family)
		snprintf(peerstr, sizeof(peerstr), " peer:%s", satostr(peer));

	if (local->sa_sin.sin_family)
		snprintf(localstr, sizeof(localstr), " local:%s",
			 satostr(local));

	if (ifindex != IFINDEX_INTERNAL)
		snprintf(portstr, sizeof(portstr), " port:%u", ifindex);

	if (vrfid != VRF_DEFAULT)
		snprintf(vrfstr, sizeof(vrfstr), " vrf:%u", vrfid);

	va_start(ap, fmt);
	vsnprintf(msg, sizeof(msg), fmt, ap);
	va_end(ap);

	zlog_debug("control-packet: %s [mhop:%s%s%s%s%s]", msg,
		   mhop ? "yes" : "no", peerstr, localstr, portstr, vrfstr);
}
static bool bfd_check_auth(const struct bfd_session *bfd,
const struct bfd_pkt *cp)
{
if (CHECK_FLAG(cp->flags, BFD_ABIT)) {
/* RFC5880 4.1: Authentication Section is present. */
struct bfd_auth *auth = (struct bfd_auth *)(cp + 1);
uint16_t pkt_auth_type = ntohs(auth->type);
if (cp->len < BFD_PKT_LEN + sizeof(struct bfd_auth))
return false;
if (cp->len < BFD_PKT_LEN + auth->length)
return false;
switch (pkt_auth_type) {
case BFD_AUTH_NULL:
return false;
case BFD_AUTH_SIMPLE:
/* RFC5880 6.7: To be finshed. */
return false;
case BFD_AUTH_CRYPTOGRAPHIC:
/* RFC5880 6.7: To be finshed. */
return false;
default:
/* RFC5880 6.7: To be finshed. */
return false;
}
}
return true;
}
void bfd_recv_cb(struct event *t)
{
int sd = EVENT_FD(t);
struct bfd_session *bfd;
struct bfd_pkt *cp;
bool is_mhop;
ssize_t mlen = 0;
uint8_t ttl = 0;
vrf_id_t vrfid;
ifindex_t ifindex = IFINDEX_INTERNAL;
struct sockaddr_any local, peer;
uint8_t msgbuf[1516];
struct interface *ifp = NULL;
struct bfd_vrf_global *bvrf = EVENT_ARG(t);
/* Schedule next read. */
bfd_sd_reschedule(bvrf, sd);
/* The reflector handle SBFD init packets. */
if (sd == bvrf->bg_initv6) {
ptm_bfd_reflector_process_init_packet(bvrf, sd);
return;
}
/* Handle echo packets. */
if (sd == bvrf->bg_echo || sd == bvrf->bg_echov6) {
ptm_bfd_process_echo_pkt(bvrf, sd);
return;
}
/* Sanitize input/output. */
memset(&local, 0, sizeof(local));
memset(&peer, 0, sizeof(peer));
/* Handle control packets. */
is_mhop = false;
if (sd == bvrf->bg_shop || sd == bvrf->bg_mhop) {
is_mhop = sd == bvrf->bg_mhop;
mlen = bfd_recv_ipv4(sd, msgbuf, sizeof(msgbuf), &ttl, &ifindex,
&local, &peer);
} else if (sd == bvrf->bg_shop6 || sd == bvrf->bg_mhop6) {
is_mhop = sd == bvrf->bg_mhop6;
mlen = bfd_recv_ipv6(sd, msgbuf, sizeof(msgbuf), &ttl, &ifindex,
&local, &peer);
}
/*
* With netns backend, we have a separate socket in each VRF. It means
* that bvrf here is correct and we believe the bvrf->vrf->vrf_id.
* With VRF-lite backend, we have a single socket in the default VRF.
* It means that we can't believe the bvrf->vrf->vrf_id. But in
* VRF-lite, the ifindex is globally unique, so we can retrieve the
* correct vrf_id from the interface.
*/
vrfid = bvrf->vrf->vrf_id;
if (ifindex) {
ifp = if_lookup_by_index(ifindex, vrfid);
if (ifp)
vrfid = ifp->vrf->vrf_id;
}
/* Implement RFC 5880 6.8.6 */
if (mlen < BFD_PKT_LEN) {
cp_debug(is_mhop, &peer, &local, ifindex, vrfid,
"too small (%zd bytes)", mlen);
return;
}
/* Validate single hop packet TTL. */
if ((!is_mhop) && (ttl != BFD_TTL_VAL)) {
cp_debug(is_mhop, &peer, &local, ifindex, vrfid,
"invalid TTL: %d expected %d", ttl, BFD_TTL_VAL);
return;
}
/*
* Parse the control header for inconsistencies:
* - Invalid version;
* - Bad multiplier configuration;
* - Short packets;
* - Invalid discriminator;
*/
cp = (struct bfd_pkt *)(msgbuf);
if (BFD_GETVER(cp->diag) != BFD_VERSION) {
cp_debug(is_mhop, &peer, &local, ifindex, vrfid,
"bad version %d", BFD_GETVER(cp->diag));
return;
}
if (cp->detect_mult == 0) {
cp_debug(is_mhop, &peer, &local, ifindex, vrfid,
"detect multiplier set to zero");
return;
}
if ((cp->len < BFD_PKT_LEN) || (cp->len > mlen)) {
cp_debug(is_mhop, &peer, &local, ifindex, vrfid, "too small");
return;
}
if (BFD_GETMBIT(cp->flags)) {
cp_debug(is_mhop, &peer, &local, ifindex, vrfid,
"detect non-zero Multipoint (M) flag");
return;
}
if (cp->discrs.my_discr == 0) {
cp_debug(is_mhop, &peer, &local, ifindex, vrfid,
"'my discriminator' is zero");
return;
}
/* Find the session that this packet belongs. */
bfd = ptm_bfd_sess_find(cp, &peer, &local, ifp, vrfid, is_mhop);
if (bfd == NULL) {
cp_debug(is_mhop, &peer, &local, ifindex, vrfid,
"no session found");
return;
}
bfdd: allow l3vrf bfd sessions without udp leaking Until now, when in vrf-lite mode, the BFD implementation creates a single UDP socket and relies on the following sysctl value to 1: echo 1 > /proc/sys/net/ipv4/udp_l3mdev_accept With this setting, the incoming BFD packets from a given vrf, would leak to the default vrf, and would match the UDP socket. The drawback of this solution is that udp packets received on a given vrf may leak to an other vrf. This may be a security concern. The commit addresses this issue by avoiding this leak mechanism. An UDP socket is created for each vrf, and each socket uses new setsockopt option: SO_REUSEADDR + SO_REUSEPORT. With this option, the incoming UDP packets are distributed on the available sockets. The impact of those options with l3mdev devices is unknown. It has been observed that this option is not needed, until the default vrf sockets are created. To ensure the BFD packets are correctly routed to the appropriate socket, a BPF filter has been put in place and attached to the sockets : SO_ATTACH_REUSEPORT_CBPF. This option adds a criterium to force the packet to choose a given socket. If initial criteria from the default distribution algorithm were not good, at least two sockets would be available, and the CBPF would force the selection to the same socket. This would come to the situation where an incoming packet would be processed on a different vrf. The bpf code is the following one: struct sock_filter code[] = { { BPF_RET | BPF_K, 0, 0, 0 }, }; struct sock_fprog p = { .len = sizeof(code)/sizeof(struct sock_filter), .filter = code, }; if (setsockopt(sd, SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF, &p, sizeof(p))) { zlog_warn("unable to set SO_ATTACH_REUSEPORT_CBPF on socket: %s", strerror(errno)); return -1; } Some tests have been done with by creating vrf contexts, and by using the below vtysh configuration: ip route 2.2.2.2/32 10.126.0.2 vrf vrf2 ip route 2.2.2.2/32 10.126.0.2 ! interface ntfp2 ip address 10.126.0.1/24 ! 
interface ntfp3 vrf vrf4 ip address 10.126.0.1/24 ! interface ntfp2 vrf vrf1 ip address 10.126.0.1/24 ! interface ntfp2.100 vrf vrf2 ip address 10.126.0.1/24 ! interface ntfp2.200 vrf vrf3 ip address 10.126.0.1/24 ! line vty ! bfd peer 10.126.0.2 vrf vrf2 ! peer 10.126.0.2 vrf vrf3 ! peer 10.126.0.2 ! peer 10.126.0.2 vrf vrf4 ! peer 2.2.2.2 multihop local-address 1.1.1.1 ! peer 2.2.2.2 multihop local-address 1.1.1.1 vrf vrf2 transmit-interval 1500 receive-interval 1500 ! The results showed no issue related to packets received by the wrong vrf. Even changing the udp_l3mdev_accept flag to 1 did not change the test results. Signed-off-by: Philippe Guibert <philippe.guibert@6wind.com>
2022-07-07 14:33:48 +02:00
/*
* We may have a situation where received packet is on wrong vrf
*/
if (bfd && bfd->vrf && bfd->vrf->vrf_id != vrfid) {
bfdd: allow l3vrf bfd sessions without udp leaking Until now, when in vrf-lite mode, the BFD implementation creates a single UDP socket and relies on the following sysctl value to 1: echo 1 > /proc/sys/net/ipv4/udp_l3mdev_accept With this setting, the incoming BFD packets from a given vrf, would leak to the default vrf, and would match the UDP socket. The drawback of this solution is that udp packets received on a given vrf may leak to an other vrf. This may be a security concern. The commit addresses this issue by avoiding this leak mechanism. An UDP socket is created for each vrf, and each socket uses new setsockopt option: SO_REUSEADDR + SO_REUSEPORT. With this option, the incoming UDP packets are distributed on the available sockets. The impact of those options with l3mdev devices is unknown. It has been observed that this option is not needed, until the default vrf sockets are created. To ensure the BFD packets are correctly routed to the appropriate socket, a BPF filter has been put in place and attached to the sockets : SO_ATTACH_REUSEPORT_CBPF. This option adds a criterium to force the packet to choose a given socket. If initial criteria from the default distribution algorithm were not good, at least two sockets would be available, and the CBPF would force the selection to the same socket. This would come to the situation where an incoming packet would be processed on a different vrf. The bpf code is the following one: struct sock_filter code[] = { { BPF_RET | BPF_K, 0, 0, 0 }, }; struct sock_fprog p = { .len = sizeof(code)/sizeof(struct sock_filter), .filter = code, }; if (setsockopt(sd, SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF, &p, sizeof(p))) { zlog_warn("unable to set SO_ATTACH_REUSEPORT_CBPF on socket: %s", strerror(errno)); return -1; } Some tests have been done with by creating vrf contexts, and by using the below vtysh configuration: ip route 2.2.2.2/32 10.126.0.2 vrf vrf2 ip route 2.2.2.2/32 10.126.0.2 ! interface ntfp2 ip address 10.126.0.1/24 ! 
interface ntfp3 vrf vrf4 ip address 10.126.0.1/24 ! interface ntfp2 vrf vrf1 ip address 10.126.0.1/24 ! interface ntfp2.100 vrf vrf2 ip address 10.126.0.1/24 ! interface ntfp2.200 vrf vrf3 ip address 10.126.0.1/24 ! line vty ! bfd peer 10.126.0.2 vrf vrf2 ! peer 10.126.0.2 vrf vrf3 ! peer 10.126.0.2 ! peer 10.126.0.2 vrf vrf4 ! peer 2.2.2.2 multihop local-address 1.1.1.1 ! peer 2.2.2.2 multihop local-address 1.1.1.1 vrf vrf2 transmit-interval 1500 receive-interval 1500 ! The results showed no issue related to packets received by the wrong vrf. Even changing the udp_l3mdev_accept flag to 1 did not change the test results. Signed-off-by: Philippe Guibert <philippe.guibert@6wind.com>
2022-07-07 14:33:48 +02:00
cp_debug(is_mhop, &peer, &local, ifindex, vrfid,
"wrong vrfid.");
return;
}
/* Ensure that existing good sessions are not overridden. */
if (!cp->discrs.remote_discr && bfd->ses_state != PTM_BFD_DOWN &&
bfd->ses_state != PTM_BFD_ADM_DOWN) {
cp_debug(is_mhop, &peer, &local, ifindex, vrfid,
"'remote discriminator' is zero, not overridden");
return;
}
/*
* Multi hop: validate packet TTL.
* Single hop: set local address that received the packet.
* set peers mac address for echo packets
*/
if (is_mhop) {
if (ttl < bfd->mh_ttl) {
cp_debug(is_mhop, &peer, &local, ifindex, vrfid,
"exceeded max hop count (expected %d, got %d)",
bfd->mh_ttl, ttl);
return;
}
} else {
if (bfd->local_address.sa_sin.sin_family == AF_UNSPEC)
bfd->local_address = local;
#ifdef BFD_LINUX
if (ifp)
bfd_peer_mac_set(sd, bfd, &peer, ifp);
#endif
}
bfd->stats.rx_ctrl_pkt++;
/*
* If no interface was detected, save the interface where the
* packet came in.
*/
if (!is_mhop && bfd->ifp == NULL)
bfd->ifp = ifp;
/* Log remote discriminator changes. */
if ((bfd->discrs.remote_discr != 0)
&& (bfd->discrs.remote_discr != ntohl(cp->discrs.my_discr)))
cp_debug(is_mhop, &peer, &local, ifindex, vrfid,
"remote discriminator mismatch (expected %u, got %u)",
bfd->discrs.remote_discr, ntohl(cp->discrs.my_discr));
bfd->discrs.remote_discr = ntohl(cp->discrs.my_discr);
/* Check authentication. */
if (!bfd_check_auth(bfd, cp)) {
cp_debug(is_mhop, &peer, &local, ifindex, vrfid,
"Authentication failed");
return;
}
/* Save remote diagnostics before state switch. */
bfd->remote_diag = CHECK_FLAG(cp->diag, BFD_DIAGMASK);
/* Update remote timers settings. */
bfd->remote_timers.desired_min_tx = ntohl(cp->timers.desired_min_tx);
bfd->remote_timers.required_min_rx = ntohl(cp->timers.required_min_rx);
bfd->remote_timers.required_min_echo =
ntohl(cp->timers.required_min_echo);
bfd->remote_detect_mult = cp->detect_mult;
if (BFD_GETCBIT(cp->flags))
bfd->remote_cbit = 1;
else
bfd->remote_cbit = 0;
/* The initiator handle SBFD reflect packet. */
if (bfd->bfd_mode == BFD_MODE_TYPE_SBFD_INIT) {
sbfd_initiator_state_handler(bfd, PTM_BFD_UP);
if (bfd->xmt_TO != bfd->timers.desired_min_tx) {
bfd->xmt_TO = bfd->timers.desired_min_tx;
//reset xmt timer TO after UP
ptm_bfd_start_xmt_timer(bfd, false);
}
bfd->detect_TO = bfd->detect_mult * bfd->xmt_TO;
sbfd_init_recvtimer_update(bfd);
if (bfd->polling && BFD_GETFBIT(cp->flags)) {
/* Disable polling. */
bfd->polling = 0;
/* Start using our new timers. */
bfd->cur_timers.desired_min_tx = bfd->timers.desired_min_tx;
bfd->cur_timers.required_min_rx = bfd->timers.required_min_rx;
}
return;
}
/* State switch from section 6.2. */
bs_state_handler(bfd, BFD_GETSTATE(cp->flags));
/* RFC 5880, Section 6.5: handle POLL/FINAL negotiation sequence. */
if (bfd->polling && BFD_GETFBIT(cp->flags)) {
/* Disable polling. */
bfd->polling = 0;
/* Handle poll finalization. */
bs_final_handler(bfd);
}
/*
* Detection timeout calculation:
* The minimum detection timeout is the remote detection
* multipler (number of packets to be missed) times the agreed
* transmission interval.
*
* RFC 5880, Section 6.8.4.
*/
if (bfd->cur_timers.required_min_rx > bfd->remote_timers.desired_min_tx)
bfd->detect_TO = bfd->remote_detect_mult
* bfd->cur_timers.required_min_rx;
else
bfd->detect_TO = bfd->remote_detect_mult
* bfd->remote_timers.desired_min_tx;
/* Apply new receive timer immediately. */
bfd_recvtimer_update(bfd);
/* Handle echo timers changes. */
bs_echo_timer_handler(bfd);
/*
* We've received a packet with the POLL bit set, we must send
* a control packet back with the FINAL bit set.
*
* RFC 5880, Section 6.5.
*/
if (BFD_GETPBIT(cp->flags)) {
/* We are finalizing a poll negotiation. */
bs_final_handler(bfd);
/* Send the control packet with the final bit immediately. */
ptm_bfd_snd(bfd, 1);
}
}
/*
* bp_bfd_echo_in: proccesses an BFD echo packet. On TTL == BFD_TTL_VAL
* the packet is looped back or returns the my discriminator ID along
* with the TTL.
*
* Returns -1 on error or loopback or 0 on success.
*/
int bp_bfd_echo_in(struct bfd_vrf_global *bvrf, int sd, uint8_t *ttl,
uint32_t *my_discr, uint64_t *my_rtt)
{
struct bfd_echo_pkt *bep;
ssize_t rlen;
struct sockaddr_any local, peer;
ifindex_t ifindex = IFINDEX_INTERNAL;
vrf_id_t vrfid = VRF_DEFAULT;
uint8_t msgbuf[1516];
size_t bfd_offset = 0;
if (sd == bvrf->bg_echo) {
#ifdef BFD_LINUX
rlen = bfd_recv_ipv4_fp(sd, msgbuf, sizeof(msgbuf), ttl,
&ifindex, &local, &peer);
/* silently drop echo packet that is looped in fastpath but
* still comes up to BFD
*/
if (rlen == -1)
return -1;
bfd_offset = sizeof(struct udphdr) + sizeof(struct iphdr) +
sizeof(struct ethhdr);
#else
rlen = bfd_recv_ipv4(sd, msgbuf, sizeof(msgbuf), ttl, &ifindex,
&local, &peer);
bfd_offset = 0;
#endif
} else {
rlen = bfd_recv_ipv6(sd, msgbuf, sizeof(msgbuf), ttl, &ifindex,
&local, &peer);
bfd_offset = 0;
}
/* Short packet, better not risk reading it. */
if (rlen < (ssize_t)sizeof(*bep)) {
cp_debug(false, &peer, &local, ifindex, vrfid,
"small echo packet");
return -1;
}
/* Test for loopback for ipv6, ipv4 is looped in forwarding plane */
if ((*ttl == BFD_TTL_VAL) && (sd == bvrf->bg_echov6)) {
bp_udp_send(sd, *ttl - 1, msgbuf, rlen,
(struct sockaddr *)&peer,
(sd == bvrf->bg_echo) ? sizeof(peer.sa_sin)
: sizeof(peer.sa_sin6));
return -1;
}
/* Read my discriminator from BFD Echo packet. */
bep = (struct bfd_echo_pkt *)(msgbuf + bfd_offset);
*my_discr = ntohl(bep->my_discr);
if (*my_discr == 0) {
cp_debug(false, &peer, &local, ifindex, vrfid,
"invalid echo packet discriminator (zero)");
return -1;
}
#ifdef BFD_LINUX
/* RTT Calculation: determine RTT time of IPv4 echo pkt */
if (sd == bvrf->bg_echo) {
struct timeval time_sent = {0, 0};
time_sent.tv_sec = be64toh(bep->time_sent_sec);
time_sent.tv_usec = be64toh(bep->time_sent_usec);
*my_rtt = monotime_since(&time_sent, NULL);
}
#endif
return 0;
}
#ifdef BFD_LINUX
/*
 * bp_udp_send_fp: transmit an already assembled echo frame through the
 * packet socket `sd`.
 *
 * The frame is addressed at link level to the peer MAC stored in the
 * session (`bfd->peer_hw_addr`) on the session interface. The packet is
 * meant to carry the same source and destination IP so that the peer
 * receives it and forwards it back to the sender in its forwarding plane.
 *
 * Returns 0 when all `datalen` bytes were handed to the kernel, -1 on
 * failure or partial send.
 */
int bp_udp_send_fp(int sd, uint8_t *data, size_t datalen,
		   struct bfd_session *bfd)
{
	ssize_t wlen;
	struct msghdr msg = {0};
	struct iovec iov[1];
	struct sockaddr_ll sadr_ll = {0};

	/* Link-layer destination: peer MAC on the session interface. */
	sadr_ll.sll_family = AF_PACKET;
	sadr_ll.sll_ifindex = bfd->ifp->ifindex;
	sadr_ll.sll_halen = ETH_ALEN;
	memcpy(sadr_ll.sll_addr, bfd->peer_hw_addr, sizeof(bfd->peer_hw_addr));
	sadr_ll.sll_protocol = htons(ETH_P_IP);

	/* Prepare message data. */
	iov[0].iov_base = data;
	iov[0].iov_len = datalen;

	msg.msg_name = &sadr_ll;
	msg.msg_namelen = sizeof(sadr_ll);
	msg.msg_iov = iov;
	msg.msg_iovlen = 1;

	/* Send echo to peer */
	wlen = sendmsg(sd, &msg, 0);
	if (wlen <= 0) {
		if (bglobal.debug_network)
			zlog_debug("%s: loopback failure: (%d) %s", __func__,
				   errno, strerror(errno));
		return -1;
	} else if (wlen < (ssize_t)datalen) {
		if (bglobal.debug_network)
			zlog_debug("%s: partial send: %zd expected %zu",
				   __func__, wlen, datalen);
		return -1;
	}

	return 0;
}
#endif
/*
 * bp_udp_send: send `datalen` bytes from `data` to `to` over the UDP
 * socket `sd`.
 *
 * When `ttl` is non-zero it is applied to this one packet: through
 * ancillary data (IPV6_HOPLIMIT for IPv6, IP_TTL for IPv4 on BFD_LINUX
 * builds) or, for IPv4 on other builds, by setting the socket option with
 * bp_set_ttl() before sending. A `ttl` of zero leaves the socket default
 * untouched.
 *
 * Returns 0 when the whole buffer was sent, -1 on failure or partial send.
 */
int bp_udp_send(int sd, uint8_t ttl, uint8_t *data, size_t datalen,
		struct sockaddr *to, socklen_t tolen)
{
	struct cmsghdr *cmsg;
	ssize_t wlen;
	int ttlval = ttl;
	bool is_ipv6 = to->sa_family == AF_INET6;
	struct msghdr msg;
	struct iovec iov[1];
	/*
	 * The control buffer is accessed as a struct cmsghdr, so it must be
	 * aligned like one; a bare byte array gives no such guarantee.
	 */
	union {
		struct cmsghdr align;
		uint8_t buf[255];
	} msgctl;

	/* Prepare message data. */
	iov[0].iov_base = data;
	iov[0].iov_len = datalen;

	memset(&msg, 0, sizeof(msg));
	memset(&msgctl, 0, sizeof(msgctl));
	msg.msg_name = to;
	msg.msg_namelen = tolen;
	msg.msg_iov = iov;
	msg.msg_iovlen = 1;

	/* Prepare the packet TTL information. */
	if (ttl > 0) {
		/* Use ancillary data. */
		msg.msg_control = msgctl.buf;
		msg.msg_controllen = CMSG_LEN(sizeof(ttlval));

		/* Configure the ancillary data. */
		cmsg = CMSG_FIRSTHDR(&msg);
		cmsg->cmsg_len = CMSG_LEN(sizeof(ttlval));
		if (is_ipv6) {
			cmsg->cmsg_level = IPPROTO_IPV6;
			cmsg->cmsg_type = IPV6_HOPLIMIT;
		} else {
#ifdef BFD_LINUX
			cmsg->cmsg_level = IPPROTO_IP;
			cmsg->cmsg_type = IP_TTL;
#else
			/* FreeBSD does not support TTL in ancillary data. */
			msg.msg_control = NULL;
			msg.msg_controllen = 0;

			bp_set_ttl(sd, ttl);
#endif /* BFD_LINUX */
		}
		/* cmsg still points into msgctl even when it is detached. */
		memcpy(CMSG_DATA(cmsg), &ttlval, sizeof(ttlval));
	}

	/* Send echo back. */
	wlen = sendmsg(sd, &msg, 0);
	if (wlen <= 0) {
		if (bglobal.debug_network)
			zlog_debug("%s: loopback failure: (%d) %s", __func__,
				   errno, strerror(errno));
		return -1;
	} else if (wlen < (ssize_t)datalen) {
		if (bglobal.debug_network)
			zlog_debug("%s: partial send: %zd expected %zu",
				   __func__, wlen, datalen);
		return -1;
	}

	return 0;
}
/*
* Sockets creation.
*/
/*
* IPv4 sockets
*/
/*
 * bp_set_ttl: set the IPv4 TTL (IP_TTL) used for packets sent on `sd`.
 *
 * Returns 0 on success; on failure a warning is logged and -1 is returned.
 */
int bp_set_ttl(int sd, uint8_t value)
{
	const int hops = value;
	int rc;

	rc = setsockopt(sd, IPPROTO_IP, IP_TTL, &hops, sizeof(hops));
	if (rc != -1)
		return 0;

	zlog_warn("%s: setsockopt(IP_TTL, %d): %s", __func__, value,
		  strerror(errno));
	return -1;
}
/*
 * bp_set_tos: set the IPv4 type-of-service byte (IP_TOS) used for packets
 * sent on `sd`.
 *
 * Returns 0 on success; on failure a warning is logged and -1 is returned.
 */
int bp_set_tos(int sd, uint8_t value)
{
	const int tos_byte = value;
	int rc;

	rc = setsockopt(sd, IPPROTO_IP, IP_TOS, &tos_byte, sizeof(tos_byte));
	if (rc != -1)
		return 0;

	zlog_warn("%s: setsockopt(IP_TOS, %d): %s", __func__, value,
		  strerror(errno));
	return -1;
}
bfdd: allow l3vrf bfd sessions without udp leaking Until now, when in vrf-lite mode, the BFD implementation creates a single UDP socket and relies on the following sysctl value to 1: echo 1 > /proc/sys/net/ipv4/udp_l3mdev_accept With this setting, the incoming BFD packets from a given vrf, would leak to the default vrf, and would match the UDP socket. The drawback of this solution is that udp packets received on a given vrf may leak to an other vrf. This may be a security concern. The commit addresses this issue by avoiding this leak mechanism. An UDP socket is created for each vrf, and each socket uses new setsockopt option: SO_REUSEADDR + SO_REUSEPORT. With this option, the incoming UDP packets are distributed on the available sockets. The impact of those options with l3mdev devices is unknown. It has been observed that this option is not needed, until the default vrf sockets are created. To ensure the BFD packets are correctly routed to the appropriate socket, a BPF filter has been put in place and attached to the sockets : SO_ATTACH_REUSEPORT_CBPF. This option adds a criterium to force the packet to choose a given socket. If initial criteria from the default distribution algorithm were not good, at least two sockets would be available, and the CBPF would force the selection to the same socket. This would come to the situation where an incoming packet would be processed on a different vrf. The bpf code is the following one: struct sock_filter code[] = { { BPF_RET | BPF_K, 0, 0, 0 }, }; struct sock_fprog p = { .len = sizeof(code)/sizeof(struct sock_filter), .filter = code, }; if (setsockopt(sd, SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF, &p, sizeof(p))) { zlog_warn("unable to set SO_ATTACH_REUSEPORT_CBPF on socket: %s", strerror(errno)); return -1; } Some tests have been done with by creating vrf contexts, and by using the below vtysh configuration: ip route 2.2.2.2/32 10.126.0.2 vrf vrf2 ip route 2.2.2.2/32 10.126.0.2 ! interface ntfp2 ip address 10.126.0.1/24 ! 
interface ntfp3 vrf vrf4 ip address 10.126.0.1/24 ! interface ntfp2 vrf vrf1 ip address 10.126.0.1/24 ! interface ntfp2.100 vrf vrf2 ip address 10.126.0.1/24 ! interface ntfp2.200 vrf vrf3 ip address 10.126.0.1/24 ! line vty ! bfd peer 10.126.0.2 vrf vrf2 ! peer 10.126.0.2 vrf vrf3 ! peer 10.126.0.2 ! peer 10.126.0.2 vrf vrf4 ! peer 2.2.2.2 multihop local-address 1.1.1.1 ! peer 2.2.2.2 multihop local-address 1.1.1.1 vrf vrf2 transmit-interval 1500 receive-interval 1500 ! The results showed no issue related to packets received by the wrong vrf. Even changing the udp_l3mdev_accept flag to 1 did not change the test results. Signed-off-by: Philippe Guibert <philippe.guibert@6wind.com>
2022-07-07 14:33:48 +02:00
/*
 * bp_set_reuse_addr: enable SO_REUSEADDR on `sd`.
 *
 * Used together with SO_REUSEPORT so that one UDP listening socket can be
 * created per VRF.
 *
 * Returns true on success; on failure a warning is logged and false is
 * returned.
 */
static bool bp_set_reuse_addr(int sd)
{
	int one = 1;

	if (setsockopt(sd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) == -1) {
		zlog_warn("%s: setsockopt(SO_REUSEADDR, %d): %s", __func__, one,
			  strerror(errno));
		return false;
	}

	return true;
}
/*
 * bp_set_reuse_port: enable SO_REUSEPORT on `sd`.
 *
 * Used together with SO_REUSEADDR so that one UDP listening socket can be
 * created per VRF.
 *
 * Returns true on success; on failure a warning is logged and false is
 * returned.
 */
static bool bp_set_reuse_port(int sd)
{
	int one = 1;

	if (setsockopt(sd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) == -1) {
		zlog_warn("%s: setsockopt(SO_REUSEPORT, %d): %s", __func__, one,
			  strerror(errno));
		return false;
	}

	return true;
}
static void bp_set_ipopts(int sd)
{
int rcvttl = BFD_RCV_TTL_VAL;
bfdd: allow l3vrf bfd sessions without udp leaking Until now, when in vrf-lite mode, the BFD implementation creates a single UDP socket and relies on the following sysctl value to 1: echo 1 > /proc/sys/net/ipv4/udp_l3mdev_accept With this setting, the incoming BFD packets from a given vrf, would leak to the default vrf, and would match the UDP socket. The drawback of this solution is that udp packets received on a given vrf may leak to an other vrf. This may be a security concern. The commit addresses this issue by avoiding this leak mechanism. An UDP socket is created for each vrf, and each socket uses new setsockopt option: SO_REUSEADDR + SO_REUSEPORT. With this option, the incoming UDP packets are distributed on the available sockets. The impact of those options with l3mdev devices is unknown. It has been observed that this option is not needed, until the default vrf sockets are created. To ensure the BFD packets are correctly routed to the appropriate socket, a BPF filter has been put in place and attached to the sockets : SO_ATTACH_REUSEPORT_CBPF. This option adds a criterium to force the packet to choose a given socket. If initial criteria from the default distribution algorithm were not good, at least two sockets would be available, and the CBPF would force the selection to the same socket. This would come to the situation where an incoming packet would be processed on a different vrf. The bpf code is the following one: struct sock_filter code[] = { { BPF_RET | BPF_K, 0, 0, 0 }, }; struct sock_fprog p = { .len = sizeof(code)/sizeof(struct sock_filter), .filter = code, }; if (setsockopt(sd, SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF, &p, sizeof(p))) { zlog_warn("unable to set SO_ATTACH_REUSEPORT_CBPF on socket: %s", strerror(errno)); return -1; } Some tests have been done with by creating vrf contexts, and by using the below vtysh configuration: ip route 2.2.2.2/32 10.126.0.2 vrf vrf2 ip route 2.2.2.2/32 10.126.0.2 ! interface ntfp2 ip address 10.126.0.1/24 ! 
interface ntfp3 vrf vrf4 ip address 10.126.0.1/24 ! interface ntfp2 vrf vrf1 ip address 10.126.0.1/24 ! interface ntfp2.100 vrf vrf2 ip address 10.126.0.1/24 ! interface ntfp2.200 vrf vrf3 ip address 10.126.0.1/24 ! line vty ! bfd peer 10.126.0.2 vrf vrf2 ! peer 10.126.0.2 vrf vrf3 ! peer 10.126.0.2 ! peer 10.126.0.2 vrf vrf4 ! peer 2.2.2.2 multihop local-address 1.1.1.1 ! peer 2.2.2.2 multihop local-address 1.1.1.1 vrf vrf2 transmit-interval 1500 receive-interval 1500 ! The results showed no issue related to packets received by the wrong vrf. Even changing the udp_l3mdev_accept flag to 1 did not change the test results. Signed-off-by: Philippe Guibert <philippe.guibert@6wind.com>
2022-07-07 14:33:48 +02:00
if (!bp_set_reuse_addr(sd))
zlog_fatal("set-reuse-addr: failed");
if (!bp_set_reuse_port(sd))
zlog_fatal("set-reuse-port: failed");
if (bp_set_ttl(sd, BFD_TTL_VAL) != 0)
zlog_fatal("set-ipopts: TTL configuration failed");
if (setsockopt(sd, IPPROTO_IP, IP_RECVTTL, &rcvttl, sizeof(rcvttl))
== -1)
zlog_fatal("set-ipopts: setsockopt(IP_RECVTTL, %d): %s", rcvttl,
strerror(errno));
#ifdef BFD_LINUX
int pktinfo = BFD_PKT_INFO_VAL;
/* Figure out address and interface to do the peer matching. */
if (setsockopt(sd, IPPROTO_IP, IP_PKTINFO, &pktinfo, sizeof(pktinfo))
== -1)
zlog_fatal("set-ipopts: setsockopt(IP_PKTINFO, %d): %s",
pktinfo, strerror(errno));
#endif /* BFD_LINUX */
#ifdef BFD_BSD
int yes = 1;
/* Find out our address for peer matching. */
if (setsockopt(sd, IPPROTO_IP, IP_RECVDSTADDR, &yes, sizeof(yes)) == -1)
zlog_fatal("set-ipopts: setsockopt(IP_RECVDSTADDR, %d): %s",
yes, strerror(errno));
/* Find out interface where the packet came in. */
if (setsockopt_ifindex(AF_INET, sd, yes) == -1)
zlog_fatal("set-ipopts: setsockopt_ipv4_ifindex(%d): %s", yes,
strerror(errno));
#endif /* BFD_BSD */
}
/*
 * bp_bind_ip: bind `sd` to the IPv4 wildcard address on UDP port `port`
 * (host byte order). A bind failure is reported through zlog_fatal().
 */
static void bp_bind_ip(int sd, uint16_t port)
{
	struct sockaddr_in any_addr;

	memset(&any_addr, 0, sizeof(any_addr));
	any_addr.sin_port = htons(port);
	any_addr.sin_addr.s_addr = htonl(INADDR_ANY);
	any_addr.sin_family = AF_INET;

	if (bind(sd, (struct sockaddr *)&any_addr, sizeof(any_addr)) != -1)
		return;

	zlog_fatal("bind-ip: bind: %s", strerror(errno));
}
/*
 * bp_set_prio: set the SO_PRIORITY of `sd` to `value`.
 *
 * Only compiled in on GNU_LINUX builds; elsewhere the call is a no-op.
 * A failure is logged as a warning and otherwise ignored.
 */
void bp_set_prio(int sd, int value)
{
#if defined(GNU_LINUX)
	const int prio = value;
	int rc;

	rc = setsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio));
	if (rc < 0)
		zlog_warn("set_prio: setsockopt(SO_PRIORITY, %d): %s", value, strerror(errno));
#endif
}
/*
 * bp_udp_shop: create the IPv4 UDP socket bound to BFD_DEFDESTPORT inside
 * `vrf`.
 *
 * The socket is opened with raised privileges, configured with
 * bp_set_ipopts() and bound to the wildcard address. Socket creation
 * failure is reported through zlog_fatal().
 *
 * Returns the socket descriptor.
 */
int bp_udp_shop(const struct vrf *vrf)
{
	int sock;

	frr_with_privs(&bglobal.bfdd_privs) {
		sock = vrf_socket(AF_INET, SOCK_DGRAM, PF_UNSPEC, vrf->vrf_id,
				  vrf->name);
	}
	if (sock == -1) {
		zlog_fatal("udp-shop: socket: %s", strerror(errno));
	}

	bp_set_ipopts(sock);
	bp_bind_ip(sock, BFD_DEFDESTPORT);

	return sock;
}
/*
 * bp_udp_mhop: create the IPv4 UDP socket bound to BFD_DEF_MHOP_DEST_PORT
 * inside `vrf`.
 *
 * The socket is opened with raised privileges, configured with
 * bp_set_ipopts() and bound to the wildcard address. Socket creation
 * failure is reported through zlog_fatal().
 *
 * Returns the socket descriptor.
 */
int bp_udp_mhop(const struct vrf *vrf)
{
	int sock;

	frr_with_privs(&bglobal.bfdd_privs) {
		sock = vrf_socket(AF_INET, SOCK_DGRAM, PF_UNSPEC, vrf->vrf_id,
				  vrf->name);
	}
	if (sock == -1) {
		zlog_fatal("udp-mhop: socket: %s", strerror(errno));
	}

	bp_set_ipopts(sock);
	bp_bind_ip(sock, BFD_DEF_MHOP_DEST_PORT);

	return sock;
}
int bp_peer_socket(const struct bfd_session *bs)
{
int sd, pcount;
struct sockaddr_in sin;
static int srcPort = BFD_SRCPORTINIT;
const char *device_to_bind = NULL;
if (bs->key.ifname[0])
device_to_bind = (const char *)bs->key.ifname;
else if ((!vrf_is_backend_netns() && bs->vrf->vrf_id != VRF_DEFAULT)
|| ((CHECK_FLAG(bs->flags, BFD_SESS_FLAG_MH)
&& bs->key.vrfname[0])))
device_to_bind = (const char *)bs->key.vrfname;
frr_with_privs(&bglobal.bfdd_privs) {
sd = vrf_socket(AF_INET, SOCK_DGRAM, PF_UNSPEC,
bs->vrf->vrf_id, device_to_bind);
}
if (sd == -1) {
zlog_err("ipv4-new: failed to create socket: %s",
strerror(errno));
return -1;
}
/* Set TTL to 255 for all transmitted packets */
if (bp_set_ttl(sd, BFD_TTL_VAL) != 0) {
close(sd);
return -1;
}
/* Set TOS to CS6 for all transmitted packets */
if (bp_set_tos(sd, BFD_TOS_VAL) != 0) {
close(sd);
return -1;
}
bp_set_prio(sd, SOCK_OPT_PRIO_HIGH);
/* Find an available source port in the proper range */
memset(&sin, 0, sizeof(sin));
sin.sin_family = AF_INET;
#ifdef HAVE_STRUCT_SOCKADDR_SA_LEN
sin.sin_len = sizeof(sin);
#endif /* HAVE_STRUCT_SOCKADDR_SA_LEN */
memcpy(&sin.sin_addr, &bs->key.local, sizeof(sin.sin_addr));
pcount = 0;
do {
if ((++pcount) > (BFD_SRCPORTMAX - BFD_SRCPORTINIT)) {
/* Searched all ports, none available */
zlog_err("ipv4-new: failed to bind port: %s",
strerror(errno));
close(sd);
return -1;
}
if (srcPort >= BFD_SRCPORTMAX)
srcPort = BFD_SRCPORTINIT;
sin.sin_port = htons(srcPort++);
} while (bind(sd, (struct sockaddr *)&sin, sizeof(sin)) < 0);
return sd;
}
/*
* IPv6 sockets
*/
int bp_peer_socketv6(const struct bfd_session *bs)
{
int sd, pcount;
struct sockaddr_in6 sin6;
static int srcPort = BFD_SRCPORTINIT;
const char *device_to_bind = NULL;
if (bs->key.ifname[0])
device_to_bind = (const char *)bs->key.ifname;
else if ((!vrf_is_backend_netns() && bs->vrf->vrf_id != VRF_DEFAULT)
|| ((CHECK_FLAG(bs->flags, BFD_SESS_FLAG_MH)
&& bs->key.vrfname[0])))
device_to_bind = (const char *)bs->key.vrfname;
frr_with_privs(&bglobal.bfdd_privs) {
sd = vrf_socket(AF_INET6, SOCK_DGRAM, PF_UNSPEC,
bs->vrf->vrf_id, device_to_bind);
}
if (sd == -1) {
zlog_err("ipv6-new: failed to create socket: %s",
strerror(errno));
return -1;
}
/* Set TTL to 255 for all transmitted packets */
if (bp_set_ttlv6(sd, BFD_TTL_VAL) != 0) {
close(sd);
return -1;
}
/* Set TOS to CS6 for all transmitted packets */
if (bp_set_tosv6(sd, BFD_TOS_VAL) != 0) {
close(sd);
return -1;
}
bp_set_prio(sd, SOCK_OPT_PRIO_HIGH);
/* Find an available source port in the proper range */
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_family = AF_INET6;
#ifdef HAVE_STRUCT_SOCKADDR_SA_LEN
sin6.sin6_len = sizeof(sin6);
#endif /* HAVE_STRUCT_SOCKADDR_SA_LEN */
memcpy(&sin6.sin6_addr, &bs->key.local, sizeof(sin6.sin6_addr));
if (bs->ifp && IN6_IS_ADDR_LINKLOCAL(&sin6.sin6_addr))
sin6.sin6_scope_id = bs->ifp->ifindex;
pcount = 0;
do {
if ((++pcount) > (BFD_SRCPORTMAX - BFD_SRCPORTINIT)) {
/* Searched all ports, none available */
zlog_err("ipv6-new: failed to bind port: %s",
strerror(errno));
close(sd);
return -1;
}
if (srcPort >= BFD_SRCPORTMAX)
srcPort = BFD_SRCPORTINIT;
sin6.sin6_port = htons(srcPort++);
} while (bind(sd, (struct sockaddr *)&sin6, sizeof(sin6)) < 0);
return sd;
}
/*
 * bp_set_ttlv6: set the IPv6 unicast hop limit (IPV6_UNICAST_HOPS) used
 * for packets sent on `sd`.
 *
 * Returns 0 on success; on failure a warning is logged and -1 is returned.
 */
int bp_set_ttlv6(int sd, uint8_t value)
{
	const int hops = value;
	int rc;

	rc = setsockopt(sd, IPPROTO_IPV6, IPV6_UNICAST_HOPS, &hops,
			sizeof(hops));
	if (rc != -1)
		return 0;

	zlog_warn("set-ttlv6: setsockopt(IPV6_UNICAST_HOPS, %d): %s",
		  value, strerror(errno));
	return -1;
}
/*
 * bp_set_tosv6: set the IPv6 traffic class (IPV6_TCLASS) used for packets
 * sent on `sd`.
 *
 * Returns 0 on success; on failure a warning is logged and -1 is returned.
 */
int bp_set_tosv6(int sd, uint8_t value)
{
	const int tclass = value;
	int rc;

	rc = setsockopt(sd, IPPROTO_IPV6, IPV6_TCLASS, &tclass,
			sizeof(tclass));
	if (rc != -1)
		return 0;

	zlog_warn("set-tosv6: setsockopt(IPV6_TCLASS, %d): %s", value,
		  strerror(errno));
	return -1;
}
static void bp_set_ipv6opts(int sd)
{
int ipv6_pktinfo = BFD_IPV6_PKT_INFO_VAL;
int ipv6_only = BFD_IPV6_ONLY_VAL;
bfdd: allow l3vrf bfd sessions without udp leaking Until now, when in vrf-lite mode, the BFD implementation creates a single UDP socket and relies on the following sysctl value to 1: echo 1 > /proc/sys/net/ipv4/udp_l3mdev_accept With this setting, the incoming BFD packets from a given vrf, would leak to the default vrf, and would match the UDP socket. The drawback of this solution is that udp packets received on a given vrf may leak to an other vrf. This may be a security concern. The commit addresses this issue by avoiding this leak mechanism. An UDP socket is created for each vrf, and each socket uses new setsockopt option: SO_REUSEADDR + SO_REUSEPORT. With this option, the incoming UDP packets are distributed on the available sockets. The impact of those options with l3mdev devices is unknown. It has been observed that this option is not needed, until the default vrf sockets are created. To ensure the BFD packets are correctly routed to the appropriate socket, a BPF filter has been put in place and attached to the sockets : SO_ATTACH_REUSEPORT_CBPF. This option adds a criterium to force the packet to choose a given socket. If initial criteria from the default distribution algorithm were not good, at least two sockets would be available, and the CBPF would force the selection to the same socket. This would come to the situation where an incoming packet would be processed on a different vrf. The bpf code is the following one: struct sock_filter code[] = { { BPF_RET | BPF_K, 0, 0, 0 }, }; struct sock_fprog p = { .len = sizeof(code)/sizeof(struct sock_filter), .filter = code, }; if (setsockopt(sd, SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF, &p, sizeof(p))) { zlog_warn("unable to set SO_ATTACH_REUSEPORT_CBPF on socket: %s", strerror(errno)); return -1; } Some tests have been done with by creating vrf contexts, and by using the below vtysh configuration: ip route 2.2.2.2/32 10.126.0.2 vrf vrf2 ip route 2.2.2.2/32 10.126.0.2 ! interface ntfp2 ip address 10.126.0.1/24 ! 
interface ntfp3 vrf vrf4 ip address 10.126.0.1/24 ! interface ntfp2 vrf vrf1 ip address 10.126.0.1/24 ! interface ntfp2.100 vrf vrf2 ip address 10.126.0.1/24 ! interface ntfp2.200 vrf vrf3 ip address 10.126.0.1/24 ! line vty ! bfd peer 10.126.0.2 vrf vrf2 ! peer 10.126.0.2 vrf vrf3 ! peer 10.126.0.2 ! peer 10.126.0.2 vrf vrf4 ! peer 2.2.2.2 multihop local-address 1.1.1.1 ! peer 2.2.2.2 multihop local-address 1.1.1.1 vrf vrf2 transmit-interval 1500 receive-interval 1500 ! The results showed no issue related to packets received by the wrong vrf. Even changing the udp_l3mdev_accept flag to 1 did not change the test results. Signed-off-by: Philippe Guibert <philippe.guibert@6wind.com>
2022-07-07 14:33:48 +02:00
if (!bp_set_reuse_addr(sd))
zlog_fatal("set-reuse-addr: failed");
if (!bp_set_reuse_port(sd))
zlog_fatal("set-reuse-port: failed");
if (bp_set_ttlv6(sd, BFD_TTL_VAL) == -1)
zlog_fatal(
"set-ipv6opts: setsockopt(IPV6_UNICAST_HOPS, %d): %s",
BFD_TTL_VAL, strerror(errno));
if (setsockopt_ipv6_hoplimit(sd, BFD_RCV_TTL_VAL) == -1)
zlog_fatal("set-ipv6opts: setsockopt(IPV6_HOPLIMIT, %d): %s",
BFD_RCV_TTL_VAL, strerror(errno));
if (setsockopt_ipv6_pktinfo(sd, ipv6_pktinfo) == -1)
zlog_fatal("set-ipv6opts: setsockopt(IPV6_PKTINFO, %d): %s",
ipv6_pktinfo, strerror(errno));
if (setsockopt(sd, IPPROTO_IPV6, IPV6_V6ONLY, &ipv6_only,
sizeof(ipv6_only))
== -1)
zlog_fatal("set-ipv6opts: setsockopt(IPV6_V6ONLY, %d): %s",
ipv6_only, strerror(errno));
}
/*
 * bp_bind_ipv6: bind `sd` to the IPv6 wildcard address on UDP port `port`
 * (host byte order). A bind failure is reported through zlog_fatal().
 */
static void bp_bind_ipv6(int sd, uint16_t port)
{
	struct sockaddr_in6 any_addr;

	memset(&any_addr, 0, sizeof(any_addr));
	any_addr.sin6_port = htons(port);
	any_addr.sin6_addr = in6addr_any;
	any_addr.sin6_family = AF_INET6;
#ifdef HAVE_STRUCT_SOCKADDR_SA_LEN
	any_addr.sin6_len = sizeof(any_addr);
#endif /* HAVE_STRUCT_SOCKADDR_SA_LEN */

	if (bind(sd, (struct sockaddr *)&any_addr, sizeof(any_addr)) != -1)
		return;

	zlog_fatal("bind-ipv6: bind: %s", strerror(errno));
}
/*
 * bp_udp6_shop: create the IPv6 UDP socket bound to BFD_DEFDESTPORT inside
 * `vrf`.
 *
 * When the socket cannot be created because the address family is not
 * supported (EAFNOSUPPORT) a warning is logged; any other creation error
 * is reported through zlog_fatal().
 *
 * Returns the socket descriptor, or -1 when no socket was created.
 */
int bp_udp6_shop(const struct vrf *vrf)
{
	int sock;

	frr_with_privs(&bglobal.bfdd_privs) {
		sock = vrf_socket(AF_INET6, SOCK_DGRAM, PF_UNSPEC, vrf->vrf_id,
				  vrf->name);
	}

	if (sock != -1) {
		bp_set_ipv6opts(sock);
		bp_bind_ipv6(sock, BFD_DEFDESTPORT);
		return sock;
	}

	if (errno == EAFNOSUPPORT) {
		zlog_warn("udp6-shop: V6 is not supported, continuing");
	} else {
		zlog_fatal("udp6-shop: socket: %s", strerror(errno));
	}

	return -1;
}
/*
 * bp_udp6_mhop: create the IPv6 UDP socket bound to BFD_DEF_MHOP_DEST_PORT
 * inside `vrf`.
 *
 * When the socket cannot be created because the address family is not
 * supported (EAFNOSUPPORT) a warning is logged; any other creation error
 * is reported through zlog_fatal().
 *
 * Returns the socket descriptor, or -1 when no socket was created.
 */
int bp_udp6_mhop(const struct vrf *vrf)
{
	int sock;

	frr_with_privs(&bglobal.bfdd_privs) {
		sock = vrf_socket(AF_INET6, SOCK_DGRAM, PF_UNSPEC, vrf->vrf_id,
				  vrf->name);
	}

	if (sock != -1) {
		bp_set_ipv6opts(sock);
		bp_bind_ipv6(sock, BFD_DEF_MHOP_DEST_PORT);
		return sock;
	}

	if (errno == EAFNOSUPPORT) {
		zlog_warn("udp6-mhop: V6 is not supported, continuing");
	} else {
		zlog_fatal("udp6-mhop: socket: %s", strerror(errno));
	}

	return -1;
}
#ifdef BFD_LINUX
/*
 * Classic BPF program attached to the echo packet socket so that it only
 * receives IPv4 UDP packets whose destination port is 3785 (0x0ec9).
 * Generated with: tcpdump -dd udp dst port 3785
 *
 * Each entry is { code, jt, jf, k }. Instruction by instruction:
 *   0: ldh  [12]              load EtherType
 *   1: jeq  #0x0800 ? 2 : 10  not IPv4 -> reject
 *   2: ldb  [23]              load IP protocol
 *   3: jeq  #0x11   ? 4 : 10  not UDP -> reject
 *   4: ldh  [20]              load IP flags/fragment offset
 *   5: jset #0x1fff ? 10 : 6  non-first fragment -> reject
 *   6: ldxb 4*([14]&0xf)      X = IP header length
 *   7: ldh  [x + 16]          load UDP destination port
 *   8: jeq  #0x0ec9 ? 9 : 10  port != 3785 -> reject
 *   9: ret  #0x40000          accept (up to 262144 bytes)
 *  10: ret  #0                reject
 */
struct sock_filter my_filterudp[] = {
{0x28, 0, 0, 0x0000000c}, {0x15, 0, 8, 0x00000800},
{0x30, 0, 0, 0x00000017}, {0x15, 0, 6, 0x00000011},
{0x28, 0, 0, 0x00000014}, {0x45, 4, 0, 0x00001fff},
{0xb1, 0, 0, 0x0000000e}, {0x48, 0, 0, 0x00000010},
{0x15, 0, 1, 0x00000ec9}, {0x6, 0, 0, 0x00040000},
{0x6, 0, 0, 0x00000000},
};
/* Number of instructions in my_filterudp; must match the table above. */
#define MY_FILTER_LENGTH 11
/*
 * bp_echo_socket (BFD_LINUX): create the packet socket used for IPv4 echo
 * packets inside `vrf`.
 *
 * An AF_PACKET/SOCK_RAW socket is opened with raised privileges, the
 * my_filterudp BPF program is attached so only echo packets are delivered,
 * and the socket is bound to ETH_P_IP on all interfaces (ifindex 0).
 *
 * Socket creation failure is reported through zlog_fatal(). Returns the
 * socket descriptor, or -1 (socket closed, warning logged) when the filter
 * cannot be attached or the bind fails.
 */
int bp_echo_socket(const struct vrf *vrf)
{
	struct sockaddr_ll bind_addr;
	struct sock_fprog prog;
	int s;

	frr_with_privs (&bglobal.bfdd_privs) {
		s = vrf_socket(AF_PACKET, SOCK_RAW, ETH_P_IP, vrf->vrf_id,
			       vrf->name);
	}
	if (s == -1) {
		zlog_fatal("echo-socket: socket: %s", strerror(errno));
	}

	/* adjust filter for socket to only receive ECHO packets */
	prog.len = MY_FILTER_LENGTH;
	prog.filter = my_filterudp;
	if (setsockopt(s, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog)) ==
	    -1) {
		zlog_warn("%s: setsockopt(SO_ATTACH_FILTER): %s", __func__,
			  strerror(errno));
		goto fail;
	}

	memset(&bind_addr, 0, sizeof(bind_addr));
	bind_addr.sll_ifindex = 0;
	bind_addr.sll_protocol = htons(ETH_P_IP);
	bind_addr.sll_family = AF_PACKET;
	if (bind(s, (struct sockaddr *)&bind_addr, sizeof(bind_addr)) < 0) {
		zlog_warn("Failed to bind echo socket: %s",
			  safe_strerror(errno));
		goto fail;
	}

	return s;

fail:
	close(s);
	return -1;
}
#else
/*
 * bp_echo_socket (non-BFD_LINUX): create the IPv4 UDP socket bound to
 * BFD_DEF_ECHO_PORT inside `vrf`.
 *
 * The socket is opened with raised privileges, configured with
 * bp_set_ipopts() and bound to the wildcard address. Socket creation
 * failure is reported through zlog_fatal().
 *
 * Returns the socket descriptor.
 */
int bp_echo_socket(const struct vrf *vrf)
{
	int echo_sd;

	frr_with_privs(&bglobal.bfdd_privs) {
		echo_sd = vrf_socket(AF_INET, SOCK_DGRAM, 0, vrf->vrf_id, vrf->name);
	}
	if (echo_sd == -1) {
		zlog_fatal("echo-socket: socket: %s", strerror(errno));
	}

	bp_set_ipopts(echo_sd);
	bp_bind_ip(echo_sd, BFD_DEF_ECHO_PORT);

	return echo_sd;
}
#endif
/*
 * bp_echov6_socket: create the IPv6 UDP socket bound to BFD_DEF_ECHO_PORT
 * inside `vrf`.
 *
 * When the socket cannot be created because the address family is not
 * supported (EAFNOSUPPORT) a warning is logged; any other creation error
 * is reported through zlog_fatal().
 *
 * Returns the socket descriptor, or -1 when no socket was created.
 */
int bp_echov6_socket(const struct vrf *vrf)
{
	int echo_sd;

	frr_with_privs(&bglobal.bfdd_privs) {
		echo_sd = vrf_socket(AF_INET6, SOCK_DGRAM, 0, vrf->vrf_id, vrf->name);
	}

	if (echo_sd != -1) {
		bp_set_ipv6opts(echo_sd);
		bp_bind_ipv6(echo_sd, BFD_DEF_ECHO_PORT);
		return echo_sd;
	}

	if (errno == EAFNOSUPPORT) {
		zlog_warn("echov6-socket: V6 is not supported, continuing");
	} else {
		zlog_fatal("echov6-socket: socket: %s",
			   strerror(errno));
	}

	return -1;
}
#ifdef BFD_LINUX
/*
 * bfd_peer_mac_set: learn the peer's MAC address, to be used for Echo
 * packets that are looped in the peer's forwarding plane.
 *
 * The address is looked up in the kernel ARP table (SIOCGARP on `sd`) for
 * the IPv4 `peer` on interface `ifp` and stored in `bfd->peer_hw_addr`.
 * Nothing is done when the MAC is already known (BFD_SESS_FLAG_MAC_SET),
 * when the interface has IFF_NOARP set, or when the peer is not IPv4.
 *
 * BFD_SESS_FLAG_MAC_SET is only raised for a completed ARP entry
 * (ATF_COM). An entry that exists but is not resolved yet would otherwise
 * latch an unusable hardware address for the lifetime of the session,
 * because the flag short-circuits every later lookup.
 */
static void bfd_peer_mac_set(int sd, struct bfd_session *bfd, struct sockaddr_any *peer,
			     struct interface *ifp)
{
	struct arpreq arpreq_;

	if (CHECK_FLAG(bfd->flags, BFD_SESS_FLAG_MAC_SET))
		return;
	if (CHECK_FLAG(ifp->flags, IFF_NOARP))
		return;

	if (peer->sa_sin.sin_family == AF_INET) {
		/* IPV4 */
		struct sockaddr_in *addr =
			(struct sockaddr_in *)&arpreq_.arp_pa;

		memset(&arpreq_, 0, sizeof(struct arpreq));
		addr->sin_family = AF_INET;
		memcpy(&addr->sin_addr.s_addr, &peer->sa_sin.sin_addr,
		       sizeof(addr->sin_addr));
		strlcpy(arpreq_.arp_dev, ifp->name, sizeof(arpreq_.arp_dev));

		if (ioctl(sd, SIOCGARP, &arpreq_) < 0) {
			if (bglobal.debug_network)
				zlog_debug(
					"BFD: getting peer's mac on %s failed error %s",
					ifp->name, strerror(errno));
			UNSET_FLAG(bfd->flags, BFD_SESS_FLAG_MAC_SET);
			memset(bfd->peer_hw_addr, 0, sizeof(bfd->peer_hw_addr));
		} else if (!(arpreq_.arp_flags & ATF_COM)) {
			/*
			 * Entry present but lookup not complete: keep the
			 * flag clear so the next packet retries.
			 */
			UNSET_FLAG(bfd->flags, BFD_SESS_FLAG_MAC_SET);
			memset(bfd->peer_hw_addr, 0, sizeof(bfd->peer_hw_addr));
		} else {
			memcpy(bfd->peer_hw_addr, arpreq_.arp_ha.sa_data,
			       sizeof(bfd->peer_hw_addr));
			SET_FLAG(bfd->flags, BFD_SESS_FLAG_MAC_SET);
		}
	}
}
#endif
/*
 * Transmit an S-BFD initiator control packet for session `bfd`.
 *
 * On BFD_LINUX builds the packet is handed to bp_raw_sbfd_red_send() together
 * with the session's segment list.  The session's tx_fail_pkt counter tracks
 * consecutive failures (only the first ones are logged, and recovery is
 * logged once); tx_ctrl_pkt is incremented for every successful send.
 * On other builds nothing is sent.
 *
 * Returns 0 on success (or when not built for Linux), -1 on failure.
 */
int _ptm_sbfd_init_send(struct bfd_session *bfd, const void *data, size_t datalen)
{
#ifdef BFD_LINUX
	struct bfd_vrf_global *bvrf = bfd_vrf_look_by_session(bfd);
	struct in6_addr *sid_list = NULL;
	const struct in6_addr *log_dst;
	struct in6_addr src_addr;
	struct in6_addr dst_addr;
	char dst[INET6_ADDRSTRLEN] = { 0 };
	uint16_t src_port;
	int sid_count;
	int rc;

	if (!bvrf)
		return -1;

	sid_count = bfd->segnum;
	if (sid_count > 0)
		sid_list = bfd->seg_list;

	src_addr = bfd->key.local;
	dst_addr = bfd->key.peer;

	/* Address shown in log messages: first SID if any, else the peer. */
	log_dst = sid_count > 0 ? sid_list : &bfd->key.peer;

	/*
	 * RFC 7881: the S-BFD control packet destination port is 7784; the
	 * source port may be anything except 7784.
	 */
	src_port = CHECK_FLAG(bfd->flags, BFD_SESS_FLAG_MH) ? BFD_DEF_MHOP_DEST_PORT
							    : BFD_DEFDESTPORT;

	rc = bp_raw_sbfd_red_send(bfd->sock, (uint8_t *)data, datalen, bfd->key.family,
				  &bfd->out_sip6, &src_addr, &dst_addr, src_port,
				  BFD_DEF_SBFD_DEST_PORT, sid_count, sid_list);
	if (rc < 0) {
		if (bfd->stats.tx_fail_pkt <= 1) {
			inet_ntop(AF_INET6, log_dst, dst, sizeof(dst));
			zlog_err("sbfd initiator send failed, dst:%s, errno:%s", dst,
				 safe_strerror(errno));
		}

		bfd->stats.tx_fail_pkt++;
		return -1;
	}

	/* First success after one or more failures: report the recovery. */
	if (bfd->stats.tx_fail_pkt > 0) {
		inet_ntop(AF_INET6, log_dst, dst, sizeof(dst));
		zlog_warn("sbfd initiator send success, dst:%s, previous tx_fail_pkt:%d", dst,
			  (int)bfd->stats.tx_fail_pkt);
	}

	bfd->stats.tx_fail_pkt = 0;
	bfd->stats.tx_ctrl_pkt++;
#endif
	return 0;
}
/*
 * Transmit an S-BFD echo packet for session `bfd`.
 *
 * On BFD_LINUX builds the packet is handed to bp_raw_sbfd_red_send() together
 * with the session's segment list.  The session's tx_fail_pkt counter tracks
 * consecutive failures (only the first ones are logged, and recovery is
 * logged once); tx_echo_pkt is incremented for every successful send.
 * On other builds nothing is sent.
 *
 * Returns 0 on success (or when not built for Linux), -1 on failure.
 */
static int _ptm_sbfd_echo_send(struct bfd_session *bfd, const void *data, size_t datalen)
{
#ifdef BFD_LINUX
	struct bfd_vrf_global *bvrf = bfd_vrf_look_by_session(bfd);
	struct in6_addr *sid_list = NULL;
	const struct in6_addr *log_dst;
	struct in6_addr src_addr;
	struct in6_addr dst_addr;
	char dst[INET6_ADDRSTRLEN] = { 0 };
	int sid_count;
	int rc;

	if (!bvrf)
		return -1;

	sid_count = bfd->segnum;
	if (sid_count > 0)
		sid_list = bfd->seg_list;

	src_addr = bfd->key.local;
	dst_addr = bfd->key.peer;

	/* Address shown in log messages: first SID if any, else the peer. */
	log_dst = sid_count > 0 ? sid_list : &bfd->key.peer;

	/*
	 * RFC 7881: S-BFD echo packets use the BFD Echo port (3785) as
	 * destination; the source port may be anything.
	 */
	rc = bp_raw_sbfd_red_send(bfd->sock, (uint8_t *)data, datalen, bfd->key.family,
				  &bfd->out_sip6, &src_addr, &dst_addr, BFD_DEF_ECHO_PORT,
				  BFD_DEF_ECHO_PORT, sid_count, sid_list);
	if (rc < 0) {
		if (bfd->stats.tx_fail_pkt <= 1) {
			inet_ntop(AF_INET6, log_dst, dst, sizeof(dst));
			zlog_err("sbfd echo send failed, bfd_name:%s, dst:%s, errno:%s",
				 bfd->bfd_name, dst, safe_strerror(errno));
		}

		bfd->stats.tx_fail_pkt++;
		return -1;
	}

	/* First success after one or more failures: report the recovery. */
	if (bfd->stats.tx_fail_pkt > 0) {
		inet_ntop(AF_INET6, log_dst, dst, sizeof(dst));
		zlog_warn("sbfd echo send success, bfd_name:%s, dst:%s, previous tx_fail_pkt:%d",
			  bfd->bfd_name, dst, (int)bfd->stats.tx_fail_pkt);
	}

	bfd->stats.tx_fail_pkt = 0;
	bfd->stats.tx_echo_pkt++;
#endif
	return 0;
}
/*
 * Build and transmit an S-BFD initiator control packet for session `bfd`.
 *
 * `fbit` is the value of the Final bit; the Poll bit is only copied from the
 * session when `fbit` is zero, because Poll and Final must never be set in
 * the same packet (RFC 5880, Section 6.5).
 *
 * Transmit statistics (tx_ctrl_pkt / tx_fail_pkt) are maintained by
 * _ptm_sbfd_init_send() itself, so they are deliberately not touched here:
 * incrementing tx_ctrl_pkt again would count every control packet twice.
 */
void ptm_sbfd_initiator_snd(struct bfd_session *bfd, int fbit)
{
	struct bfd_pkt cp = {};

	/* Set fields according to section 6.5.7 */
	cp.diag = bfd->local_diag;
	BFD_SETVER(cp.diag, BFD_VERSION);
	cp.flags = 0;
	BFD_SETSTATE(cp.flags, bfd->ses_state);

	if (CHECK_FLAG(bfd->flags, BFD_SESS_FLAG_CBIT))
		BFD_SETCBIT(cp.flags, BFD_CBIT);

	BFD_SETDEMANDBIT(cp.flags, BFD_SBFD_INITIATOR_DEMAND);

	/*
	 * Polling and Final can't be set at the same time.
	 *
	 * RFC 5880, Section 6.5.
	 */
	BFD_SETFBIT(cp.flags, fbit);
	if (fbit == 0)
		BFD_SETPBIT(cp.flags, bfd->polling);

	cp.detect_mult = bfd->detect_mult;
	cp.len = BFD_PKT_LEN;
	cp.discrs.my_discr = htonl(bfd->discrs.my_discr);
	cp.discrs.remote_discr = htonl(bfd->discrs.remote_discr);

	if (bfd->polling) {
		cp.timers.desired_min_tx = htonl(bfd->timers.desired_min_tx);
	} else {
		/*
		 * We can only announce current setting on poll, this
		 * avoids timing mismatch with our peer and give it
		 * the opportunity to learn. See `bs_final_handler` for
		 * more information.
		 */
		cp.timers.desired_min_tx = htonl(bfd->cur_timers.desired_min_tx);
	}

	cp.timers.required_min_rx = 0;
	cp.timers.required_min_echo = 0;

	/* Success and failure accounting happens inside the sender. */
	(void)_ptm_sbfd_init_send(bfd, &cp, BFD_PKT_LEN);
}
void ptm_sbfd_echo_snd(struct bfd_session *bfd)
{
struct bfd_echo_pkt bep;
memset(&bep, 0, sizeof(bep));
BFD_SETVER(bep.ver, BFD_ECHO_VERSION);
bep.len = BFD_ECHO_PKT_LEN;
bep.my_discr = htonl(bfd->discrs.my_discr);
if (_ptm_sbfd_echo_send(bfd, &bep, BFD_ECHO_PKT_LEN) != 0)
return;
if (!CHECK_FLAG(bfd->flags, BFD_SESS_FLAG_ECHO_ACTIVE))
SET_FLAG(bfd->flags, BFD_SESS_FLAG_ECHO_ACTIVE);
}
/*
 * S-BFD reflector: receive one control packet from an SBFDInitiator on `sd`
 * and, when its "your discriminator" matches a configured reflector, send the
 * packet straight back to the sender with:
 *   - my/your discriminators swapped,
 *   - the Demand bit cleared,
 *   - the state set to Up,
 *   - a Poll bit turned into a Final bit.
 *
 * Packets that are too short, lack the Demand bit, or name an unknown
 * discriminator are ignored.
 *
 * `bvrf` is currently unused.
 *
 * Returns 0 when the packet was handled or ignored, -1 when the reply could
 * not be sent.
 */
static int ptm_bfd_reflector_process_init_packet(struct bfd_vrf_global *bvrf, int sd)
{
	uint8_t ttl = 0;
	struct sockaddr *sa;
	struct sbfd_reflector *sr;
	struct bfd_pkt *cp;
	ssize_t rlen;
	struct sockaddr_any local, peer;
	ifindex_t ifindex = IFINDEX_INTERNAL;
	uint8_t msgbuf[1516];
	uint32_t initiator_discr;

	/* Receive the S-BFD control packet. */
	rlen = bfd_recv_ipv6(sd, msgbuf, sizeof(msgbuf), &ttl, &ifindex, &local, &peer);

	/* Short packet (or receive error), better not risk reading it. */
	if (rlen < (ssize_t)sizeof(*cp)) {
		zlog_debug("small bfd packet");
		return 0;
	}

	cp = (struct bfd_pkt *)(msgbuf);

	/*
	 * RFC 7880: control packets from an SBFDInitiator must have the
	 * Demand bit set; silently ignore anything else.
	 */
	if (!CHECK_FLAG(cp->flags, BFD_DEMANDBIT))
		return 0;

	sr = sbfd_discr_lookup(ntohl(cp->discrs.remote_discr));
	if (!sr) {
		/* Discriminators are carried in network byte order. */
		zlog_debug("no reflector found in %u", ntohl(cp->discrs.remote_discr));
		return 0;
	}

	/* Turn the request into the reply in place. */
	initiator_discr = cp->discrs.my_discr;
	cp->discrs.my_discr = cp->discrs.remote_discr;
	cp->discrs.remote_discr = initiator_discr;

	UNSET_FLAG(cp->flags, BFD_DEMANDBIT);
	BFD_SETSTATE(cp->flags, PTM_BFD_UP);

	if (CHECK_FLAG(cp->flags, BFD_PBIT)) {
		UNSET_FLAG(cp->flags, BFD_PBIT);
		SET_FLAG(cp->flags, BFD_FBIT);
	}

	sa = (struct sockaddr *)&peer.sa_sin6;
	if (sendto(sd, msgbuf, rlen, 0, sa, sizeof(peer.sa_sin6)) <= 0) {
		zlog_debug("packet-send: send failure: %s", strerror(errno));
		return -1;
	}

	return 0;
}
/*
 * Create the raw IPv6 socket a session uses to send self-built SRv6 packets.
 *
 * The socket is bound (through vrf_socket()) to the session's interface name
 * if one is configured, otherwise to the VRF name for multi-hop sessions,
 * otherwise to no device.  Hop limit and traffic class are set through
 * bp_set_ttlv6()/bp_set_tosv6(), and IPV6_HDRINCL is enabled so that the
 * caller supplies the complete IPv6 header.
 *
 * Returns the socket descriptor, or -1 on any failure (including builds
 * where IPV6_HDRINCL is not available).
 */
int bp_peer_srh_socketv6(struct bfd_session *bs)
{
	const char *bind_dev = NULL;
	int sock;
	int rc;

	if (bs->key.ifname[0]) {
		bind_dev = (const char *)bs->key.ifname;
		zlog_debug("device_to_bind to ifname:%s", bind_dev);
	} else if (CHECK_FLAG(bs->flags, BFD_SESS_FLAG_MH) && bs->key.vrfname[0]) {
		bind_dev = (const char *)bs->key.vrfname;
		zlog_debug("device_to_bind to vrf:%s", bind_dev);
	} else {
		zlog_debug("device_to_bind to NULL");
	}

	frr_with_privs (&bglobal.bfdd_privs) {
		sock = vrf_socket(AF_INET6, SOCK_RAW, IPPROTO_RAW, bs->vrf->vrf_id, bind_dev);
	}

	if (sock == -1) {
		zlog_err("ipv6-new: failed to create socket: %s", strerror(errno));
		return -1;
	}

	/*
	 * Set TTL to 255 and TOS to CS6 for all transmitted packets; the TOS
	 * helper is only tried when the TTL helper succeeded.
	 */
	if (bp_set_ttlv6(sock, BFD_TTL_VAL) != 0 || bp_set_tosv6(sock, BFD_TOS_VAL) != 0) {
		close(sock);
		return -1;
	}

	/* We build the IPv6 header ourselves. */
#ifdef IPV6_HDRINCL
	{
		int enable = 1;

		rc = setsockopt(sock, IPPROTO_IPV6, IPV6_HDRINCL, &enable, sizeof(enable));
	}
#else
	/* Without IPV6_HDRINCL this socket cannot be used at all. */
	rc = -1;
#endif
	if (rc) {
		zlog_err("setsockopt IPV6_HDRINCL error: %s", strerror(errno));
		close(sock);
		return -1;
	}

	return sock;
}
/*
 * Create the IPv6 UDP socket bound to the S-BFD destination port
 * (BFD_DEF_SBFD_DEST_PORT) in `vrf`.
 *
 * Returns the socket descriptor, or -1 when the socket could not be created.
 * A missing IPv6 address family (EAFNOSUPPORT) only produces a warning; any
 * other creation error is reported with zlog_fatal().
 *
 * The log prefix identifies this socket as "initv6-socket" so its messages
 * can be told apart from those of the echo socket.
 */
int bp_initv6_socket(const struct vrf *vrf)
{
	int sd;

	frr_with_privs (&bglobal.bfdd_privs) {
		sd = vrf_socket(AF_INET6, SOCK_DGRAM, 0, vrf->vrf_id, vrf->name);
	}

	if (sd == -1) {
		if (errno != EAFNOSUPPORT)
			zlog_fatal("initv6-socket: socket: %s", strerror(errno));
		else
			zlog_warn("initv6-socket: V6 is not supported, continuing");

		return -1;
	}

	bp_set_ipv6opts(sd);
	bp_bind_ipv6(sd, BFD_DEF_SBFD_DEST_PORT);

	return sd;
}
#ifdef BFD_LINUX
/*
 * Compute the 16-bit one's-complement Internet checksum over `len` bytes
 * starting at `addr`.
 *
 * Words are summed in host memory order, so the returned value can be stored
 * directly into a packet header field without further byte swapping.
 *
 * Data is loaded with memcpy() rather than by dereferencing `addr`, which
 * makes the function safe for buffers that are not 16-bit aligned, and a
 * trailing odd byte is treated as the first byte of a zero-padded word so
 * the result is correct on both little- and big-endian hosts.
 *
 * A `len` of zero (or less) yields 0xffff.
 */
static uint16_t checksum(uint16_t *addr, int len)
{
	const uint8_t *bytes = (const uint8_t *)addr;
	uint32_t sum = 0;
	uint16_t word;
	int remaining = len;

	/* Sum up 2-byte values until none or only one byte is left. */
	while (remaining > 1) {
		memcpy(&word, bytes, sizeof(word));
		sum += word;
		bytes += sizeof(word);
		remaining -= 2;
	}

	/* Add the left-over byte, if any, padded with a trailing zero byte. */
	if (remaining > 0) {
		uint8_t tail[2] = { bytes[0], 0 };

		memcpy(&word, tail, sizeof(word));
		sum += word;
	}

	/* Fold the 32-bit sum into 16 bits (end-around carry). */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);

	/* The checksum is the one's complement of the sum. */
	return (uint16_t)~sum;
}
/*
 * Compute the UDP checksum for a datagram carried over IPv6.
 *
 * The IPv6 pseudo-header (source, destination, 32-bit upper-layer length,
 * three zero bytes, next header), the UDP header with a zero checksum field
 * and the payload are laid out in a scratch buffer and handed to checksum().
 *
 * iphdr      - IPv6 header supplying source, destination and next header
 * udp_hdr    - UDP header supplying ports and length (already in network order)
 * payload    - UDP payload bytes
 * payloadlen - number of payload bytes
 *
 * Returns the checksum ready to be stored in the UDP header.  A computed
 * value of zero is returned as 0xffff, as required for UDP over IPv6
 * (RFC 8200, section 8.1).  If `payloadlen` is negative or does not fit in
 * the scratch buffer, nothing is computed and 0 is returned.
 */
static uint16_t udp6_checksum(struct ip6_hdr iphdr, struct udphdr udp_hdr, uint8_t *payload,
			      int payloadlen)
{
	/* uint16_t storage keeps the scratch buffer aligned for checksum(). */
	uint16_t words[(IP_MAXPACKET + 1) / 2];
	uint8_t *buf = (uint8_t *)words;
	/* Pseudo-header (16 + 16 + 4 + 3 + 1) followed by the UDP header. */
	const size_t fixed_len = 2 * sizeof(struct in6_addr) + 4 + 3 + 1 + sizeof(struct udphdr);
	size_t off = 0;
	uint16_t sum;

	/* Leave room for the fixed part and one possible pad byte. */
	if (payloadlen < 0 || (size_t)payloadlen > sizeof(words) - fixed_len - 1)
		return 0;

	/* Source IP address (128 bits) */
	memcpy(buf + off, &iphdr.ip6_src, sizeof(struct in6_addr));
	off += sizeof(struct in6_addr);

	/* Destination IP address (128 bits) */
	memcpy(buf + off, &iphdr.ip6_dst, sizeof(struct in6_addr));
	off += sizeof(struct in6_addr);

	/*
	 * Upper-layer packet length (32 bits, network order): the UDP length
	 * is 16 bits wide, so the upper half is always zero.
	 */
	memset(buf + off, 0, 2);
	off += 2;
	memcpy(buf + off, &udp_hdr.len, sizeof(udp_hdr.len));
	off += sizeof(udp_hdr.len);

	/* Zero field (24 bits) */
	memset(buf + off, 0, 3);
	off += 3;

	/* Next header (8 bits) */
	buf[off++] = iphdr.ip6_nxt;

	/* UDP source port (16 bits) */
	memcpy(buf + off, &udp_hdr.source, sizeof(udp_hdr.source));
	off += sizeof(udp_hdr.source);

	/* UDP destination port (16 bits) */
	memcpy(buf + off, &udp_hdr.dest, sizeof(udp_hdr.dest));
	off += sizeof(udp_hdr.dest);

	/* UDP length (16 bits) */
	memcpy(buf + off, &udp_hdr.len, sizeof(udp_hdr.len));
	off += sizeof(udp_hdr.len);

	/* UDP checksum (16 bits): zero while it is being computed */
	memset(buf + off, 0, 2);
	off += 2;

	/* Payload */
	if (payloadlen > 0) {
		memcpy(buf + off, payload, (size_t)payloadlen);
		off += (size_t)payloadlen;
	}

	/* Pad to the next 16-bit boundary */
	if (payloadlen % 2)
		buf[off++] = 0;

	sum = checksum(words, (int)off);

	/* An all-zero checksum is not allowed on the wire for UDP/IPv6. */
	return sum != 0 ? sum : 0xffff;
}
/*
 * Compute the UDP checksum for a datagram carried over IPv4.
 *
 * The IPv4 pseudo-header (source, destination, zero byte, protocol, UDP
 * length), the UDP header with a zero checksum field and the payload are
 * laid out in a scratch buffer and handed to checksum().
 *
 * iphdr      - IPv4 header supplying source, destination and protocol
 * udp_hdr    - UDP header supplying ports and length (already in network order)
 * payload    - UDP payload bytes
 * payloadlen - number of payload bytes
 *
 * Returns the checksum ready to be stored in the UDP header.  A computed
 * value of zero is returned as 0xffff, because a zero checksum field means
 * "no checksum" for UDP over IPv4 (RFC 768).  If `payloadlen` is negative or
 * does not fit in the scratch buffer, nothing is computed and 0 is returned.
 */
static uint16_t udp4_checksum(struct ip iphdr, struct udphdr udp_hdr, uint8_t *payload,
			      int payloadlen)
{
	/* uint16_t storage keeps the scratch buffer aligned for checksum(). */
	uint16_t words[(IP_MAXPACKET + 1) / 2];
	uint8_t *buf = (uint8_t *)words;
	/* Pseudo-header (4 + 4 + 1 + 1 + 2) followed by the UDP header. */
	const size_t fixed_len = 2 * sizeof(struct in_addr) + 1 + 1 + 2 + sizeof(struct udphdr);
	size_t off = 0;
	uint16_t sum;

	/* Leave room for the fixed part and one possible pad byte. */
	if (payloadlen < 0 || (size_t)payloadlen > sizeof(words) - fixed_len - 1)
		return 0;

	/* Source IP address (32 bits) */
	memcpy(buf + off, &iphdr.ip_src.s_addr, sizeof(iphdr.ip_src.s_addr));
	off += sizeof(iphdr.ip_src.s_addr);

	/* Destination IP address (32 bits) */
	memcpy(buf + off, &iphdr.ip_dst.s_addr, sizeof(iphdr.ip_dst.s_addr));
	off += sizeof(iphdr.ip_dst.s_addr);

	/* Zero field (8 bits) */
	buf[off++] = 0;

	/* Transport layer protocol (8 bits) */
	buf[off++] = iphdr.ip_p;

	/* UDP length (16 bits) */
	memcpy(buf + off, &udp_hdr.len, sizeof(udp_hdr.len));
	off += sizeof(udp_hdr.len);

	/* UDP source port (16 bits) */
	memcpy(buf + off, &udp_hdr.source, sizeof(udp_hdr.source));
	off += sizeof(udp_hdr.source);

	/* UDP destination port (16 bits) */
	memcpy(buf + off, &udp_hdr.dest, sizeof(udp_hdr.dest));
	off += sizeof(udp_hdr.dest);

	/* UDP length again (16 bits) */
	memcpy(buf + off, &udp_hdr.len, sizeof(udp_hdr.len));
	off += sizeof(udp_hdr.len);

	/* UDP checksum (16 bits): zero while it is being computed */
	memset(buf + off, 0, 2);
	off += 2;

	/* Payload */
	if (payloadlen > 0) {
		memcpy(buf + off, payload, (size_t)payloadlen);
		off += (size_t)payloadlen;
	}

	/* Pad to the next 16-bit boundary */
	if (payloadlen % 2)
		buf[off++] = 0;

	sum = checksum(words, (int)off);

	/* Zero on the wire would mean "checksum not computed". */
	return sum != 0 ? sum : 0xffff;
}
#endif
/*
 * Create the raw IPv6 socket used for S-BFD in `vrf` and give it
 * SOCK_OPT_PRIO_HIGH priority through bp_set_prio().
 *
 * Returns the socket descriptor, or -1 when the socket could not be created.
 * A missing IPv6 address family (EAFNOSUPPORT) only produces a warning; any
 * other creation error is reported with zlog_fatal().
 */
int bp_sbfd_socket(const struct vrf *vrf)
{
	int raw_sd;

	frr_with_privs (&bglobal.bfdd_privs) {
		raw_sd = vrf_socket(AF_INET6, SOCK_RAW, IPPROTO_RAW, vrf->vrf_id, vrf->name);
	}

	if (raw_sd == -1) {
		if (errno == EAFNOSUPPORT)
			zlog_warn("sbfdv6-socket: V6 is not supported, continuing");
		else
			zlog_fatal("sbfdv6-socket: socket: %s", strerror(errno));

		return -1;
	}

	bp_set_prio(raw_sd, SOCK_OPT_PRIO_HIGH);

	return raw_sd;
}
#ifdef BFD_LINUX
/*
 * Fill in the outer IPv6 header of a reduced-mode SRv6 encapsulated packet.
 *
 * srh_ip6h - header to fill
 * sip/dip  - outer source and destination addresses
 * seg_num  - number of segments; with exactly one segment no routing header
 *            follows and the next header is the inner IP header, otherwise
 *            the next header is the routing header, whose size
 *            (base header plus seg_num - 1 addresses) is added to the
 *            payload length
 * datalen  - length of the UDP payload
 * family   - address family of the inner packet (AF_INET6, otherwise IPv4)
 *
 * The version / traffic class / flow label word is written as one 32-bit
 * value in network byte order: version 6 in the top nibble and BFD_TOS_VAL
 * as the 8-bit traffic class (the same value the IPv4 encapsulation stores
 * in ip_tos).  Writing ip6_vfc separately would overwrite the upper half of
 * the traffic class, so it is not done.
 *
 * The payload length always accounts for the real inner header size, i.e.
 * sizeof(struct ip) for an IPv4 inner packet, with or without a routing
 * header.
 */
static void bp_sbfd_encap_srh_ip6h_red(struct ip6_hdr *srh_ip6h, struct in6_addr *sip,
				       struct in6_addr *dip, uint8_t seg_num, size_t datalen,
				       uint16_t family)
{
	const size_t inner_hdr_len = (family == AF_INET6) ? sizeof(struct ip6_hdr)
							  : sizeof(struct ip);
	size_t plen = inner_hdr_len + sizeof(struct udphdr) + datalen;
	uint8_t nxt;

	/* Version (4 bits) | traffic class (8 bits) | flow label (20 bits) */
	srh_ip6h->ip6_flow = htonl((6U << 28) | ((uint32_t)BFD_TOS_VAL << 20));

	if (seg_num == 1) {
		/* Single SID: it lives in the destination address, no SRH. */
		nxt = (family == AF_INET6) ? IPPROTO_IPV6 : IPPROTO_IPIP;
	} else {
		plen += sizeof(struct ipv6_sr_hdr) + sizeof(struct in6_addr) * (seg_num - 1);
		nxt = IPPROTO_ROUTING;
	}

	srh_ip6h->ip6_plen = htons(plen);
	srh_ip6h->ip6_nxt = nxt;
	srh_ip6h->ip6_hlim = BFD_TTL_VAL;
	memcpy(&(srh_ip6h->ip6_src), sip, sizeof(struct in6_addr));
	memcpy(&(srh_ip6h->ip6_dst), dip, sizeof(struct in6_addr));
}
/*
 * Fill in the segment routing header for reduced-mode encapsulation.
 *
 * The header carries seg_num - 1 addresses: segment_list[1] up to
 * segment_list[seg_num - 1], stored in reverse order so that the last list
 * entry ends up in segments[0].  segment_list[0] is not written here.
 *
 * The caller must make sure that seg_num > 1.
 */
static void bp_sbfd_encap_srh_rth_red(struct ipv6_sr_hdr *srv6h, struct in6_addr *segment_list,
				      uint8_t seg_num)
{
	/* Number of addresses actually carried in the header. */
	const int carried = seg_num - 1;
	int slot;
	int src;

	srv6h->nexthdr = IPPROTO_IPV6;
	srv6h->type = IPV6_SRCRT_TYPE_4;
	srv6h->hdrlen = GET_RTH_HDR_LEN(RTH_BASE_HEADER_LEN + sizeof(struct in6_addr) * carried);
	srv6h->segments_left = carried;     /* reduced mode: seg_num - 1 */
	srv6h->first_segment = carried - 1; /* reduced mode: seg_num - 2 */
	srv6h->flags = 0;
	srv6h->tag = 0;

	/* Walk the list backwards while filling the header forwards. */
	for (slot = 0, src = carried; src >= 1; slot++, src--)
		memcpy(&srv6h->segments[slot], &segment_list[src], sizeof(struct in6_addr));
}
/*
 * Fill in the inner IPv6 header that directly precedes the UDP header.
 *
 * ip6h    - header to fill
 * sip/dip - inner source and destination addresses
 * datalen - length of the UDP payload; the IPv6 payload length is this plus
 *           the UDP header
 *
 * The version / traffic class / flow label word is written as one 32-bit
 * value in network byte order: version 6 in the top nibble and BFD_TOS_VAL
 * as the 8-bit traffic class (the same value the IPv4 encapsulation stores
 * in ip_tos).  Writing ip6_vfc separately would overwrite the upper half of
 * the traffic class, so it is not done.
 */
static void bp_sbfd_encap_inner_ip6h(struct ip6_hdr *ip6h, struct in6_addr *sip,
				     struct in6_addr *dip, size_t datalen)
{
	/* Version (4 bits) | traffic class (8 bits) | flow label (20 bits) */
	ip6h->ip6_flow = htonl((6U << 28) | ((uint32_t)BFD_TOS_VAL << 20));
	ip6h->ip6_plen = htons(sizeof(struct udphdr) + datalen);
	ip6h->ip6_nxt = IPPROTO_UDP;
	ip6h->ip6_hlim = BFD_TTL_VAL;
	memcpy(&(ip6h->ip6_src), sip, sizeof(struct in6_addr));
	memcpy(&(ip6h->ip6_dst), dip, sizeof(struct in6_addr));
}
/*
 * Fill in the inner IPv4 header that directly precedes the UDP header.
 *
 * iph     - header to fill
 * sip/dip - source and destination; only the leading sizeof(iph->ip_src)
 *           bytes of each in6_addr are copied into the IPv4 address fields
 * datalen - length of the UDP payload
 *
 * The header checksum is left at zero for the caller to compute.  Fields not
 * assigned here are left untouched.
 */
static void bp_sbfd_encap_inner_iph(struct ip *iph, struct in6_addr *sip, struct in6_addr *dip,
				    size_t datalen)
{
	const size_t total = sizeof(struct ip) + sizeof(struct udphdr) + datalen;

	/* Version 4, header of five 32-bit words. */
	iph->ip_v = 4;
	iph->ip_hl = 5;

	iph->ip_tos = BFD_TOS_VAL;
	iph->ip_ttl = BFD_TTL_VAL;
	iph->ip_p = IPPROTO_UDP;

	iph->ip_len = htons(total);
	iph->ip_id = (uint16_t)frr_weak_random();
	iph->ip_sum = 0;

	memcpy(&iph->ip_src, sip, sizeof(iph->ip_src));
	memcpy(&iph->ip_dst, dip, sizeof(iph->ip_dst));
}
/*
 * Fill in a UDP header carried over IPv6.
 *
 * Ports are given in host byte order.  The checksum is computed by
 * udp6_checksum() from the given IPv6 header, the ports and length just
 * written, and the payload.
 */
static void bp_sbfd_encap_udp6(struct udphdr *udph, struct ip6_hdr *ip6h, uint16_t src_port,
			       uint16_t dst_port, uint8_t *payload, int payloadlen)
{
	const uint16_t wire_len = htons(sizeof(struct udphdr) + payloadlen);

	udph->len = wire_len;
	udph->dest = htons(dst_port);
	udph->source = htons(src_port);

	/* Must come last: the checksum covers the fields set above. */
	udph->check = udp6_checksum(*ip6h, *udph, payload, payloadlen);
}
/*
 * Fill in a UDP header carried over IPv4.
 *
 * Ports are given in host byte order.  The checksum is computed by
 * udp4_checksum() from the given IPv4 header, the ports and length just
 * written, and the payload.
 */
static void bp_sbfd_encap_udp4(struct udphdr *udph, struct ip *iph, uint16_t src_port,
			       uint16_t dst_port, uint8_t *payload, int payloadlen)
{
	const uint16_t wire_len = htons(sizeof(struct udphdr) + payloadlen);

	udph->len = wire_len;
	udph->dest = htons(dst_port);
	udph->source = htons(src_port);

	/* Must come last: the checksum covers the fields set above. */
	udph->check = udp4_checksum(*iph, *udph, payload, payloadlen);
}
/**
 * @brief Build an SRv6 reduced-mode encapsulated UDP packet and send it on a
 *        raw socket.
 *
 * Packet layout, front to back:
 *   - outer IPv6 header            (only when seg_num > 0)
 *   - segment routing header       (only when seg_num > 1)
 *   - inner IPv6 or IPv4 header    (selected by family)
 *   - UDP header
 *   - payload
 *
 * The packet is assembled in a function-local static buffer of BUF_SIZ
 * bytes, so the function is not reentrant.
 *
 * @param sd raw socket to send on
 * @param data payload, e.g. a BFD control packet or BFD echo packet
 * @param datalen payload length in bytes
 * @param family address family of the inner header (AF_INET6, otherwise IPv4)
 * @param out_sip source address of the outer IPv6 header
 * @param sip source address of the inner IP header
 * @param dip destination address of the inner IP header; also the socket
 *            destination when there is no segment list
 * @param src_port UDP source port (host byte order)
 * @param dst_port UDP destination port (host byte order)
 * @param seg_num number of entries in segment_list
 * @param segment_list segment list; entry 0 is used as destination of the
 *                     outer IPv6 header and as socket destination, the
 *                     remaining entries go into the routing header
 * @return 0 on success, negative on failure with errno set (EMSGSIZE when the
 *         packet does not fit the buffer, EINVAL for an IPv4 inner packet
 *         without segment list, otherwise the sendmsg() error)
 */
static int bp_raw_sbfd_red_send(int sd, uint8_t *data, size_t datalen, uint16_t family,
				struct in6_addr *out_sip, struct in6_addr *sip,
				struct in6_addr *dip, uint16_t src_port, uint16_t dst_port,
				uint8_t seg_num, struct in6_addr *segment_list)
{
	static uint8_t sendbuf[BUF_SIZ];
	struct msghdr msg = { 0 };
	struct iovec iov;
	struct sockaddr_in6 dst_sin6 = { 0 };
	char buf_addr[INET6_ADDRSTRLEN] = { 0 };
	struct ip6_hdr *srh_ip6h;
	struct ipv6_sr_hdr *psrv6h; /* SRH routing header */
	struct ip6_hdr *ip6h;
	struct ip *iph;
	struct udphdr *udp;
	size_t need;
	size_t total_len = 0;
	ssize_t ret;
	int saved_errno;

	/* An IPv4 inner packet has no usable IPv6 destination without SIDs. */
	if (family != AF_INET6 && seg_num == 0) {
		zlog_err("%s error, empty sidlist for ipv4 bfd", __func__);
		errno = EINVAL;
		return -1;
	}

	/* Make sure all headers plus the payload fit before writing anything. */
	need = sizeof(struct udphdr);
	need += (family == AF_INET6) ? sizeof(struct ip6_hdr) : sizeof(struct ip);
	if (seg_num > 0)
		need += sizeof(struct ip6_hdr);
	if (seg_num > 1)
		need += sizeof(struct ipv6_sr_hdr) + sizeof(struct in6_addr) * (seg_num - 1);

	if (datalen > sizeof(sendbuf) || need + datalen > sizeof(sendbuf)) {
		zlog_err("%s error, packet of %zu bytes exceeds send buffer", __func__,
			 need + datalen);
		errno = EMSGSIZE;
		return -1;
	}

	memset(sendbuf, 0, sizeof(sendbuf));

	/* Outer IPv6 header, addressed to the first segment */
	if (seg_num > 0) {
		srh_ip6h = (struct ip6_hdr *)(sendbuf + total_len);
		bp_sbfd_encap_srh_ip6h_red(srh_ip6h, out_sip, &segment_list[0], seg_num,
					   datalen, family);
		total_len += sizeof(struct ip6_hdr);

		memcpy(&dst_sin6.sin6_addr, &segment_list[0], sizeof(struct in6_addr));
	}

	/* Routing header carrying the remaining segments */
	if (seg_num > 1) {
		psrv6h = (struct ipv6_sr_hdr *)(sendbuf + total_len);
		bp_sbfd_encap_srh_rth_red(psrv6h, segment_list, seg_num);
		total_len += sizeof(struct ipv6_sr_hdr) + sizeof(struct in6_addr) * (seg_num - 1);
	}

	if (family == AF_INET6) {
		/* No segment list: send straight to the inner destination. */
		if (seg_num == 0)
			memcpy(&dst_sin6.sin6_addr, dip, sizeof(struct in6_addr));

		/* Inner IPv6 Header */
		ip6h = (struct ip6_hdr *)(sendbuf + total_len);
		bp_sbfd_encap_inner_ip6h(ip6h, sip, dip, datalen);
		total_len += sizeof(struct ip6_hdr);

		/* UDP Header */
		udp = (struct udphdr *)(sendbuf + total_len);
		bp_sbfd_encap_udp6(udp, ip6h, src_port, dst_port, data, (int)datalen);
		total_len += sizeof(struct udphdr);
	} else {
		/* Inner IPv4 Header */
		iph = (struct ip *)(sendbuf + total_len);
		bp_sbfd_encap_inner_iph(iph, sip, dip, datalen);
		total_len += sizeof(struct ip);

		/* UDP Header */
		udp = (struct udphdr *)(sendbuf + total_len);
		bp_sbfd_encap_udp4(udp, iph, src_port, dst_port, data, (int)datalen);
		total_len += sizeof(struct udphdr);

		iph->ip_sum = in_cksum((const void *)iph, sizeof(struct ip));
	}

	/* BFD payload */
	memcpy(sendbuf + total_len, data, datalen);
	total_len += datalen;

	dst_sin6.sin6_family = AF_INET6;
	dst_sin6.sin6_port = 0; /* unused for raw sockets, but must be 0 */

	/* message data. */
	iov.iov_base = (uint8_t *)sendbuf;
	iov.iov_len = total_len;

	msg.msg_name = &dst_sin6;
	msg.msg_namelen = sizeof(struct sockaddr_in6);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	/* sendmsg */
	ret = sendmsg(sd, &msg, 0);
	if (ret < 0) {
		/* Callers report errno; do not let logging overwrite it. */
		saved_errno = errno;
		inet_ntop(AF_INET6, &dst_sin6.sin6_addr, buf_addr, INET6_ADDRSTRLEN);
		zlog_debug("sbfd send to:%s failed , ret:%d, errno:%s", buf_addr, (int)ret,
			   safe_strerror(saved_errno));
		errno = saved_errno;
		return (int)ret;
	}

	return 0;
}
#endif