tests: Fix ospf[6]_gr_topo1 tests to work better under load

2 things:

a) Each test was setting up for graceful restart with calls to
`graceful-restart prepare ip[v6] ospf`, then sleeping for
3 or 5 seconds, and then killing the ospf process.  Under heavy
load there is no guarantee that zebra has received/processed
this signal.  Write some code to ensure that this happens.

b) Tests are issuing commands in this order:
   1) issue gr prepare command
   2) kill router
   3) <ensure routes were still installed in zebra>
   4) start router
   5) <ensure routes were still installed in zebra>

Imagine that the system is under some load and there is
a small amount of time before step 5 happens.  In this
case ospf could have come up and started neighbor relations
and also started installing routes.  If zebra receives
a new route before step 5 is issued then the route could
be in a state where it is not installed, because it is
being sent to the kernel for installation.  This would
fail the test because it would only look 1 time.  This
is fixed by giving time on restart for the routes to
be in the installed state.

Signed-off-by: Donald Sharp <sharpd@nvidia.com>
This commit is contained in:
Donald Sharp 2021-10-08 07:37:15 -04:00
parent 76ab1a9702
commit 6255aad0bc
2 changed files with 78 additions and 17 deletions

View file

@ -175,8 +175,19 @@ def check_routers(initial_convergence=False, exiting=None, restarting=None):
for rname in ["rt1", "rt2", "rt3", "rt4", "rt5", "rt6", "rt7"]:
# Check the RIB first, which should be preserved across restarts in
# all routers of the routing domain.
# If we are not on initial convergence *but* we are checking
# after a restart, looking in the zebra rib for installed routes
# only once is a recipe for test failure. Why? Because if we are
# restarting then ospf is in the process of establishing neighbors
# and passing new routes to zebra. Zebra will not mark the route as
# installed when it receives a replacement from ospf until it has
# finished processing it. Let's give it a few seconds to allow this
# to happen under load.
if initial_convergence == True:
tries = 240
else:
if restarting != None:
tries = 40
else:
tries = 1
router_compare_json_output(
@ -212,6 +223,26 @@ def check_routers(initial_convergence=False, exiting=None, restarting=None):
)
def ensure_gr_is_in_zebra(rname):
    """Poll zebra on router *rname* until its ospf6 client shows Graceful Restart.

    Runs ``show zebra client`` through vtysh on the router, narrows the
    output to the "Capabilities" line(s) within the 40 lines following the
    "Client: ospf6" header, and looks for the text "Graceful Restart".
    The check is attempted up to 10 times with a 2 second pause after each
    miss; the function asserts if the text never shows up.
    """
    tgen = get_topogen()
    query = 'vtysh -c "show zebra client" | grep "Client: ospf6$" -A 40 | grep "Capabilities "'

    gr_seen = False
    for _attempt in range(10):
        if "Graceful Restart" in tgen.net[rname].cmd(query):
            gr_seen = True
            break
        # Not there yet: give zebra time to process the prepare request.
        sleep(2)

    assertmsg = "%s does not appear to have Graceful Restart setup" % rname
    assert gr_seen, assertmsg
#
# Test initial network convergence
#
@ -238,10 +269,9 @@ def test_gr_rt1():
pytest.skip(tgen.errors)
tgen.net["rt1"].cmd('vtysh -c "graceful-restart prepare ipv6 ospf"')
sleep(5)
ensure_gr_is_in_zebra("rt1")
kill_router_daemons(tgen, "rt1", ["ospf6d"], save_config=False)
check_routers(exiting="rt1")
start_router_daemons(tgen, "rt1", ["ospf6d"])
check_routers(restarting="rt1")
@ -258,7 +288,7 @@ def test_gr_rt2():
pytest.skip(tgen.errors)
tgen.net["rt2"].cmd('vtysh -c "graceful-restart prepare ipv6 ospf"')
sleep(5)
ensure_gr_is_in_zebra("rt2")
kill_router_daemons(tgen, "rt2", ["ospf6d"], save_config=False)
check_routers(exiting="rt2")
@ -278,7 +308,7 @@ def test_gr_rt3():
pytest.skip(tgen.errors)
tgen.net["rt3"].cmd('vtysh -c "graceful-restart prepare ipv6 ospf"')
sleep(5)
ensure_gr_is_in_zebra("rt3")
kill_router_daemons(tgen, "rt3", ["ospf6d"], save_config=False)
check_routers(exiting="rt3")
@ -298,7 +328,7 @@ def test_gr_rt4():
pytest.skip(tgen.errors)
tgen.net["rt4"].cmd('vtysh -c "graceful-restart prepare ipv6 ospf"')
sleep(5)
ensure_gr_is_in_zebra("rt4")
kill_router_daemons(tgen, "rt4", ["ospf6d"], save_config=False)
check_routers(exiting="rt4")
@ -318,7 +348,7 @@ def test_gr_rt5():
pytest.skip(tgen.errors)
tgen.net["rt5"].cmd('vtysh -c "graceful-restart prepare ipv6 ospf"')
sleep(5)
ensure_gr_is_in_zebra("rt5")
kill_router_daemons(tgen, "rt5", ["ospf6d"], save_config=False)
check_routers(exiting="rt5")
@ -338,7 +368,7 @@ def test_gr_rt6():
pytest.skip(tgen.errors)
tgen.net["rt6"].cmd('vtysh -c "graceful-restart prepare ipv6 ospf"')
sleep(5)
ensure_gr_is_in_zebra("rt6")
kill_router_daemons(tgen, "rt6", ["ospf6d"], save_config=False)
check_routers(exiting="rt6")
@ -358,7 +388,7 @@ def test_gr_rt7():
pytest.skip(tgen.errors)
tgen.net["rt7"].cmd('vtysh -c "graceful-restart prepare ipv6 ospf"')
sleep(5)
ensure_gr_is_in_zebra("rt7")
kill_router_daemons(tgen, "rt7", ["ospf6d"], save_config=False)
check_routers(exiting="rt7")

View file

@ -184,8 +184,19 @@ def check_routers(initial_convergence=False, exiting=None, restarting=None):
for rname in ["rt1", "rt2", "rt3", "rt4", "rt5", "rt6", "rt7"]:
# Check the RIB first, which should be preserved across restarts in
# all routers of the routing domain.
# If we are not on initial convergence *but* we are checking
# after a restart, looking in the zebra rib for installed routes
# only once is a recipe for test failure. Why? Because if we are
# restarting then ospf is in the process of establishing neighbors
# and passing new routes to zebra. Zebra will not mark the route as
# installed when it receives a replacement from ospf until it has
# finished processing it. Let's give it a few seconds to allow this
# to happen under load.
if initial_convergence == True:
tries = 240
else:
if restarting != None:
tries = 40
else:
tries = 1
router_compare_json_output(
@ -215,6 +226,26 @@ def check_routers(initial_convergence=False, exiting=None, restarting=None):
)
def ensure_gr_is_in_zebra(rname):
    """Poll zebra on router *rname* until its ospf client shows Graceful Restart.

    Runs ``show zebra client`` through vtysh on the router, narrows the
    output to the "Capabilities" line(s) within the 40 lines following the
    "Client: ospf" header, and looks for the text "Graceful Restart".
    The check is attempted up to 10 times with a 2 second pause after each
    miss; the function asserts if the text never shows up.
    """
    tgen = get_topogen()
    query = 'vtysh -c "show zebra client" | grep "Client: ospf$" -A 40 | grep "Capabilities "'

    gr_seen = False
    for _attempt in range(10):
        if "Graceful Restart" in tgen.net[rname].cmd(query):
            gr_seen = True
            break
        # Not there yet: give zebra time to process the prepare request.
        sleep(2)

    assertmsg = "%s does not appear to have Graceful Restart setup" % rname
    assert gr_seen, assertmsg
#
# Test initial network convergence
#
@ -241,7 +272,7 @@ def test_gr_rt1():
pytest.skip(tgen.errors)
tgen.net["rt1"].cmd('vtysh -c "graceful-restart prepare ip ospf"')
sleep(3)
ensure_gr_is_in_zebra("rt1")
kill_router_daemons(tgen, "rt1", ["ospfd"], save_config=False)
check_routers(exiting="rt1")
@ -261,7 +292,7 @@ def test_gr_rt2():
pytest.skip(tgen.errors)
tgen.net["rt2"].cmd('vtysh -c "graceful-restart prepare ip ospf"')
sleep(3)
ensure_gr_is_in_zebra("rt2")
kill_router_daemons(tgen, "rt2", ["ospfd"], save_config=False)
check_routers(exiting="rt2")
@ -281,7 +312,7 @@ def test_gr_rt3():
pytest.skip(tgen.errors)
tgen.net["rt3"].cmd('vtysh -c "graceful-restart prepare ip ospf"')
sleep(3)
ensure_gr_is_in_zebra("rt3")
kill_router_daemons(tgen, "rt3", ["ospfd"], save_config=False)
check_routers(exiting="rt3")
@ -301,7 +332,7 @@ def test_gr_rt4():
pytest.skip(tgen.errors)
tgen.net["rt4"].cmd('vtysh -c "graceful-restart prepare ip ospf"')
sleep(3)
ensure_gr_is_in_zebra("rt4")
kill_router_daemons(tgen, "rt4", ["ospfd"], save_config=False)
check_routers(exiting="rt4")
@ -321,7 +352,7 @@ def test_gr_rt5():
pytest.skip(tgen.errors)
tgen.net["rt5"].cmd('vtysh -c "graceful-restart prepare ip ospf"')
sleep(3)
ensure_gr_is_in_zebra("rt5")
kill_router_daemons(tgen, "rt5", ["ospfd"], save_config=False)
check_routers(exiting="rt5")
@ -341,7 +372,7 @@ def test_gr_rt6():
pytest.skip(tgen.errors)
tgen.net["rt6"].cmd('vtysh -c "graceful-restart prepare ip ospf"')
sleep(3)
ensure_gr_is_in_zebra("rt6")
kill_router_daemons(tgen, "rt6", ["ospfd"], save_config=False)
check_routers(exiting="rt6")
@ -361,7 +392,7 @@ def test_gr_rt7():
pytest.skip(tgen.errors)
tgen.net["rt7"].cmd('vtysh -c "graceful-restart prepare ip ospf"')
sleep(3)
ensure_gr_is_in_zebra("rt7")
kill_router_daemons(tgen, "rt7", ["ospfd"], save_config=False)
check_routers(exiting="rt7")