Skip to content

Commit 1e55415

Browse files
committed
CA-392674: nbd_client_manager retry connect on nbd device busy
To connect to nbd devices, nbd_client_manager will 1. protect the operation with the /var/run/nonpersistent/nbd_client_manager file lock 2. check whether the nbd device is in use via `nbd-client -check` 3. load the nbd kernel module via `modprobe nbd` 4. call `nbd-client` to connect to the nbd device. However, step 3 triggers systemd-udevd to run asynchronously, which opens and locks the same nbd devices, runs udev rules, etc. This introduces races with step 4, e.g. both processes want to open and lock the nbd device. Note: the file lock in step 1 does NOT resolve the issue here, as it only coordinates multiple nbd_client_manager processes. To fix the issue, - we patch nbd-client to report the device-busy error from the kernel to nbd_client_manager - nbd_client_manager should check the nbd-client exit code, and retry on device busy - nbd_client_manager calls `udevadm settle` to wait for udevd to finish processing udev rules. Note: checking the nbd-client exit code is still necessary in case of racing with other processes. Signed-off-by: Lin Liu <[email protected]>
1 parent 0384a40 commit 1e55415

File tree

2 files changed

+30
-12
lines changed

2 files changed

+30
-12
lines changed

python3/libexec/nbd_client_manager.py

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
# Don't wait more than 10 minutes for the NBD device
2525
MAX_DEVICE_WAIT_MINUTES = 10
2626

27+
# According to https://github.com/thom311/libnl/blob/main/include/netlink/errno.h#L38
28+
NLE_BUSY = 25
2729

2830
class InvalidNbdDevName(Exception):
2931
"""
@@ -80,7 +82,7 @@ def __exit__(self, *args):
8082
FILE_LOCK = FileLock(path=LOCK_FILE)
8183

8284

83-
def _call(cmd_args, error=True):
85+
def _call(cmd_args, raise_err=True, log_err=True):
8486
"""
8587
[call cmd_args] executes [cmd_args] and returns the exit code.
8688
If exit code != 0, logs the error when [log_err] is set and throws a CalledProcessError when [raise_err] is set.
@@ -94,14 +96,16 @@ def _call(cmd_args, error=True):
9496

9597
_, stderr = proc.communicate()
9698

97-
if error and proc.returncode != 0:
98-
LOGGER.error(
99-
"%s exited with code %d: %s", " ".join(cmd_args), proc.returncode, stderr
100-
)
99+
if proc.returncode != 0:
100+
if log_err:
101+
LOGGER.error(
102+
"%s exited with code %d: %s", " ".join(cmd_args), proc.returncode, stderr
103+
)
101104

102-
raise subprocess.CalledProcessError(
103-
returncode=proc.returncode, cmd=cmd_args, output=stderr
104-
)
105+
if raise_err:
106+
raise subprocess.CalledProcessError(
107+
returncode=proc.returncode, cmd=cmd_args, output=stderr
108+
)
105109

106110
return proc.returncode
107111

@@ -116,7 +120,7 @@ def _is_nbd_device_connected(nbd_device):
116120
if not os.path.exists(nbd_device):
117121
raise NbdDeviceNotFound(nbd_device)
118122
cmd = ["nbd-client", "-check", nbd_device]
119-
returncode = _call(cmd, error=False)
123+
returncode = _call(cmd, raise_err=False, log_err=False)
120124
if returncode == 0:
121125
return True
122126
if returncode == 1:
@@ -191,6 +195,8 @@ def connect_nbd(path, exportname):
191195
"""Connects to a free NBD device using nbd-client and returns its path"""
192196
# We should not ask for too many nbds, as we might not have enough memory
193197
_call(["modprobe", "nbd", "nbds_max=24"])
198+
# Wait for systemd-udevd to process the udev rules
199+
_call(["udevadm", "settle", "--timeout=30"])
194200
retries = 0
195201
while True:
196202
try:
@@ -206,7 +212,17 @@ def connect_nbd(path, exportname):
206212
"-name",
207213
exportname,
208214
]
209-
_call(cmd)
215+
ret = _call(cmd, raise_err=False, log_err=True)
216+
if NLE_BUSY == ret:
217+
# Although _find_unused_nbd_device tells us the nbd device is
218+
# not connected by other nbd-client, it may be opened and locked
219+
# by other process like systemd-udev, raise NbdDeviceNotFound to retry
220+
LOGGER.warning("Device %s is busy, will retry", nbd_device)
221+
raise NbdDeviceNotFound(nbd_device)
222+
223+
if 0 != ret:
224+
raise subprocess.CalledProcessError(returncode=ret, cmd=cmd)
225+
210226
_wait_for_nbd_device(nbd_device=nbd_device, connected=True)
211227
_persist_connect_info(nbd_device, path, exportname)
212228
nbd = (

python3/tests/test_nbd_client_manager.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ def test_nbd_device_connected(self, mock_call, mock_exists):
4343
result = nbd_client_manager._is_nbd_device_connected('/dev/nbd0')
4444

4545
self.assertTrue(result)
46-
mock_call.assert_called_once_with(["nbd-client", "-check", "/dev/nbd0"], error=False)
46+
mock_call.assert_called_once_with(["nbd-client", "-check", "/dev/nbd0"],
47+
raise_err=False, log_err=False)
4748

4849
@patch('nbd_client_manager._call')
4950
def test_nbd_device_not_connected(self, mock_call, mock_exists):
@@ -53,7 +54,8 @@ def test_nbd_device_not_connected(self, mock_call, mock_exists):
5354
result = nbd_client_manager._is_nbd_device_connected('/dev/nbd1')
5455

5556
self.assertFalse(result)
56-
mock_call.assert_called_once_with(["nbd-client", "-check", "/dev/nbd1"], error=False)
57+
mock_call.assert_called_once_with(["nbd-client", "-check", "/dev/nbd1"],
58+
raise_err=False, log_err=False)
5759

5860
def test_nbd_device_not_found(self, mock_exists):
5961
mock_exists.return_value = False

0 commit comments

Comments
 (0)