Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup after lbaas management script #30

Open
wants to merge 11 commits into
base: neutron-ha-tool-maintenance
Choose a base branch
from
6 changes: 2 additions & 4 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@ python:
install:
- pip install -r files/default/test-requirements.txt
script:
- flake8 neutron-ha-tool.py test-neutron-ha-tool.py
- flake8 neutron-evacuate-lbaasv2-agent.py test-neutron-evacuate-lbaasv2-agent.py
- python files/default/test-neutron-ha-tool.py
- python files/default/test-neutron-evacuate-lbaasv2-agent.py
- flake8 *.py
- cd files/default && nosetests -v

6 changes: 3 additions & 3 deletions files/default/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@ Make a python 2 virtual environment, activate it, and install the requirements:

Run the tests

coverage run test-neutron-ha-tool.py
coverage run $(which nosetests) .

Coverage report

coverage report -i neutron-ha-tool.py
coverage report -i $(pwd)/*.py

Code analysis

flake8 neutron-ha-tool.py test-neutron-ha-tool.py
flake8 *.py
90 changes: 38 additions & 52 deletions files/default/neutron-evacuate-lbaasv2-agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
'', os.path.join(dirname, '/usr/bin/neutron-ha-tool'))



class EvacuateLbaasV2Agent(object):

def __init__(self):
Expand Down Expand Up @@ -157,73 +156,60 @@ def run(self):
self.reassign_loadbalancers(load_balancers, target_agents)
)

if (cfg.CONF.source_agent_restart or
cfg.CONF.delete_namespaces):
if (cfg.CONF.source_agent_restart or cfg.CONF.delete_namespaces):
# Make sure the source agent is handled first
agents_to_restart.insert(0, self.host_to_evacuate)

LOG.info("agents to restart: %s" % agents_to_restart)

for host in agents_to_restart:
LOG.info("restarting agent on %s" % host)
cleanup = RemoteLbaasV2Cleanup(host, timeout=30)
if (host != self.host_to_evacuate or
cfg.CONF.source_agent_restart):
if (host != self.host_to_evacuate or cfg.CONF.source_agent_restart): # noqa
if cfg.CONF.use_crm:
cleanup.restart_lbaasv2_agent_crm()
restart_lbaasv2_agent_crm(host)
else:
cleanup.restart_lbaasv2_agent_systemd()
restart_lbaasv2_agent_systemd(host)

if (host == self.host_to_evacuate and
cfg.CONF.delete_namespaces):
cleanup.delete_lbaasv2_namespaces(load_balancers)
if (host == self.host_to_evacuate and cfg.CONF.delete_namespaces): # noqa
delete_lbaasv2_namespaces(host, load_balancers)
else:
LOG.info("The agent on %s is not hosting any loadbalancers.",
self.host_to_evacuate)
return(0)


class RemoteLbaasV2Cleanup(hatool.RemoteNodeCleanup):

def restart_lbaasv2_agent_crm(self):
self._ssh_connect()
with self.ssh_client:
try:
self._simple_ssh_command("crm --wait node maintenance")
self._simple_ssh_command(
"systemctl restart openstack-neutron-lbaasv2-agent")
self._simple_ssh_command("crm --wait node ready")
except socket.timeout:
LOG.warn("SSH timeout exceeded. Failed to restart "
"openstack-neutron-lbaasv2-agent on %s",
self.target_host)

def restart_lbaasv2_agent_systemd(self):
self._ssh_connect()
with self.ssh_client:
try:
self._simple_ssh_command(
"systemctl restart openstack-neutron-lbaasv2-agent")
except socket.timeout:
LOG.warn("SSH timeout exceeded. Failed to restart "
"openstack-neutron-lbaasv2-agent on %s",
self.target_host)

def delete_lbaasv2_namespaces(self, loadbalancer_ids):
self._ssh_connect()
with self.ssh_client:
for lb in loadbalancer_ids:
namespace = "qlbaas-" + lb
LOG.info("deleting namespace %s on %s",
namespace, self.target_host)
try:
if self._namespace_exists(namespace):
self._kill_pids_in_namespace(namespace)
self._simple_ssh_command(self.netns_del + namespace)
except socket.timeout:
LOG.warn("SSH timeout exceeded. Failed to delete "
"namespace %s on %s", namespace,
self.target_host)
def restart_lbaasv2_agent_crm(hostname):
with hatool.connect_to_host(hostname, 30) as host:
host.run_timeout = 30
try:
host.run("crm --wait node maintenance")
host.run(
"systemctl restart openstack-neutron-lbaasv2-agent")
host.run("crm --wait node ready")
except socket.timeout:
LOG.warn("SSH timeout exceeded. Failed to restart "
"openstack-neutron-lbaasv2-agent on %s",
host)


def restart_lbaasv2_agent_systemd(hostname):
with hatool.connect_to_host(hostname, 30) as host:
host.run_timeout = 30
try:
host.run(
"systemctl restart openstack-neutron-lbaasv2-agent")
except socket.timeout:
LOG.warn("SSH timeout exceeded. Failed to restart "
"openstack-neutron-lbaasv2-agent on %s",
host)


def delete_lbaasv2_namespaces(hostname, loadbalancer_ids):
with hatool.connect_to_host(hostname, 30) as host:
host.run_timeout = 30
for lb in loadbalancer_ids:
namespace = "qlbaas-" + lb
hatool.Namespace(host, namespace).destroy()


if __name__ == '__main__':
Expand Down
123 changes: 66 additions & 57 deletions files/default/neutron-ha-tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -719,8 +719,7 @@ def migrate_router(qclient, router, agent, target,
wait_router_migrated(qclient, router['id'], target['host'])

if delete_namespace:
nscleanup = RemoteRouterNsCleanup(agent['host'])
nscleanup.delete_router_namespace(router['id'])
destroy_router_namespace(agent['host'], router['id'])


def wait_router_migrated(qclient, router_id, target_host, maxtries=60):
Expand Down Expand Up @@ -1074,53 +1073,32 @@ def pick(self):
)


class RemoteNodeCleanup(object):
def __init__(self, host, timeout=10):
self.target_host = host
self.timeout = timeout
self.netns_del = "ip netns delete "
self.netns_pids = "ip netns pids "
self.netns_list = "ip netns list"

def _ssh_connect(self):
self.ssh_client = paramiko.SSHClient()
self.ssh_client.set_missing_host_key_policy(
paramiko.AutoAddPolicy())
self.ssh_client.load_system_host_keys()
self.ssh_client.connect(self.target_host, timeout=self.timeout)

def _simple_ssh_command(self, command):
# Note, that when get_pty is True, paramiko will never return anything
# on the stderr channel, that's why we ignore it here. (stderr output
# will endup in the stdout channel)
_, stdout, _ = self.ssh_client.exec_command(command,
timeout=self.timeout,
get_pty=True)
out_lines = stdout.readlines()
rc = stdout.channel.recv_exit_status()
return [rc, [line.strip() for line in out_lines]]
class Namespace(object):
def __init__(self, host, name):
self.host = host
self.name = name

def _namespace_exists(self, namespace):
rc, out_lines = self._simple_ssh_command(self.netns_list)
return namespace in out_lines
def exists(self):
rc, out_lines = self.host.run('ip netns list')
return self.name in out_lines

def _get_namespace_pids(self, namespace):
rc, out_lines = self._simple_ssh_command(self.netns_pids + namespace)
def get_pids(self):
rc, out_lines = self.host.run('ip netns pids ' + self.name)
if rc:
if out_lines and "No such file or directory" in out_lines[0]:
# Assume the namespace was delete meanwhile
return []
else:
raise RuntimeError("Failed to get pids for namespace %s",
namespace)
self.name)
else:
return out_lines

def _kill_pids_in_namespace(self, namespace):
def kill_pids(self):
LOG.debug("Trying to terminate all processes namespace "
"%s on host %s", namespace, self.target_host)
"%s on host %s", self.name, self.host)
remaining = 3
pids = self._get_namespace_pids(namespace)
pids = self.get_pids()
while pids:
LOG.debug("Processes still running: [%s]", ", ".join(pids))
for pid in pids:
Expand All @@ -1130,45 +1108,76 @@ def _kill_pids_in_namespace(self, namespace):
LOG.debug("Last try. Using SIGKILL now")
killcmd = "kill -9 "

rc, out_lines = self._simple_ssh_command(killcmd + pid)
rc, out_lines = self.host.run(killcmd + pid)
if rc:
if out_lines and "No such process" in out_lines[0]:
# Assume the process was stopped meanwhile
return None
else:
raise RuntimeError("Failed to kill %s on host %s",
pid, self.target_host)
pid, self.host)

remaining -= 1
if remaining:
pids = self._get_namespace_pids(namespace)
pids = self.get_pids()
if pids:
LOG.debug("Some processes are still running in namespace "
"%s on host %s. Retrying.", namespace,
self.target_host)
"%s on host %s. Retrying.", self.name,
self.host)
time.sleep(1)
else:
break

def delete_remote_namespace(self, namespace):
LOG.debug("Deleting namespace %s on host %s.", namespace,
self.target_host)
self._ssh_connect()
with self.ssh_client:
try:
if self._namespace_exists(namespace):
self._kill_pids_in_namespace(namespace)
self._simple_ssh_command(self.netns_del + namespace)
except socket.timeout:
LOG.warn("SSH timeout exceeded. Failed to delete namespace "
"%s on %s", namespace, self.target_host)
def destroy(self):
LOG.info("Destroying namespace %s on host %s.", self.name,
self.host)
try:
if self.exists():
self.kill_pids()
self.host.run('ip netns delete ' + self.name)
except socket.timeout:
LOG.warn("SSH timeout exceeded. Failed to delete namespace "
"%s on %s", self.name, self.host)


def destroy_router_namespace(hostname, router_id):
namespace = "qrouter-" + router_id
with connect_to_host(hostname, 10) as host:
host.run_timeout = 10
Namespace(host, namespace).destroy()


class RemoteRouterNsCleanup(RemoteNodeCleanup):
class SSHHost(object):
def __init__(self, ssh_client, hostname):
self._ssh_client = ssh_client
self.hostname = hostname
self.run_timeout = None

def delete_router_namespace(self, router_id):
namespace = "qrouter-" + router_id
self.delete_remote_namespace(namespace)
def run(self, command, timeout=None):
# Note, that when get_pty is True, paramiko will never return anything
# on the stderr channel, that's why we ignore it here. (stderr output
# will endup in the stdout channel)
timeout = timeout or self.run_timeout
_, stdout, _ = self._ssh_client.exec_command(command,
timeout=timeout,
get_pty=True)
out_lines = stdout.readlines()
rc = stdout.channel.recv_exit_status()
return [rc, [line.strip() for line in out_lines]]

def __str__(self):
return self.hostname


@contextlib.contextmanager
def connect_to_host(hostname, connect_timeout):
with paramiko.SSHClient() as ssh_client:
ssh_client.set_missing_host_key_policy(
paramiko.AutoAddPolicy()
)
ssh_client.load_system_host_keys()
ssh_client.connect(hostname, timeout=connect_timeout)
yield SSHHost(ssh_client, hostname)


def term_signal_handler(signum, frame):
Expand Down
32 changes: 14 additions & 18 deletions files/default/test-neutron-evacuate-lbaasv2-agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,40 +47,36 @@ def test_reassing_single_lb_returns_one_agent(self):
@mock.patch('neutron-evacuate-lbaasv2-agent.'
'EvacuateLbaasV2Agent.loadbalancers_on_agent')
@mock.patch('neutron-evacuate-lbaasv2-agent.'
'RemoteLbaasV2Cleanup')
def test_restarts_agents_using_crm_on_ha(self, mock_cleanup, mock_lbaas):
'restart_lbaasv2_agent_crm')
@mock.patch('neutron-evacuate-lbaasv2-agent.'
'restart_lbaasv2_agent_systemd')
def test_restarts_agents_using_crm_on_ha(self, systemd_cleanup,
crm_cleanup, mock_lbaas):
mock_lbaas.return_value = ['lb1']
evacuate_lbaas.cfg.CONF.set_override("use_crm", True)

self.evacuate_lbaas.run()
self.assertEqual(
mock_cleanup.return_value.restart_lbaasv2_agent_crm.call_count,
2
)
self.assertEqual(
mock_cleanup.return_value.restart_lbaasv2_agent_systemd.call_count,
0
)
self.assertEqual(crm_cleanup.call_count, 2)
self.assertEqual(systemd_cleanup.call_count, 0)

@mock.patch('neutron-evacuate-lbaasv2-agent.'
'EvacuateLbaasV2Agent.loadbalancers_on_agent')
@mock.patch('neutron-evacuate-lbaasv2-agent.'
'RemoteLbaasV2Cleanup')
'restart_lbaasv2_agent_crm')
@mock.patch('neutron-evacuate-lbaasv2-agent.'
'restart_lbaasv2_agent_systemd')
def test_restarts_agents_using_systemd_no_ha(self,
mock_cleanup,
systemd_restart,
crm_restart,
mock_lbaas):
mock_lbaas.return_value = ['lb1']
evacuate_lbaas.cfg.CONF.set_override("use_crm", False)
self.evacuate_lbaas.run()
self.assertEqual(
mock_cleanup.return_value.restart_lbaasv2_agent_crm.call_count,
crm_restart.call_count,
0
)
self.assertEqual(
mock_cleanup.return_value.restart_lbaasv2_agent_systemd.call_count,
systemd_restart.call_count,
2
)


if __name__ == "__main__":
unittest.main()
Loading