Saggi Mizrahi has uploaded a new change for review.
Change subject: Fix problems with current implementation of forceIscsiRescan ......................................................................
Fix problems with current implementation of forceIscsiRescan
- Move configuration access out to HSM. - Fix error in logging when reading a bad configuration file. - Clean up slow child processes so we don't have a zombie leak. - Fix performance degradation (minTimeout) for fast HBAs.
Change-Id: Ic4e7173086ba15c7706206c5ee1473ed6d334f9e Signed-off-by: Saggi Mizrahi smizrahi@redhat.com --- M tests/Makefile.am A tests/iscsiTests.py M vdsm/config.py.in M vdsm/storage/hsm.py M vdsm/storage/iscsi.py M vdsm/storage/sdc.py 6 files changed, 67 insertions(+), 37 deletions(-)
git pull ssh://gerrit.ovirt.org:29418/vdsm refs/changes/72/8172/1
diff --git a/tests/Makefile.am b/tests/Makefile.am index 52dfda0..98e9112 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -32,6 +32,7 @@ guestIFTests.py \ hooksTests.py \ lsblkTests.py \ + iscsiTests.py \ main.py \ md_utils_tests.py \ miscTests.py \ diff --git a/tests/iscsiTests.py b/tests/iscsiTests.py new file mode 100644 index 0000000..63138fe --- /dev/null +++ b/tests/iscsiTests.py @@ -0,0 +1,14 @@ +from storage import iscsi +from testrunner import VdsmTestCase as TestCaseBase + + +class IscsiForceRescanTests(TestCaseBase): + def testThatRuns(self): + #TODO: It's quite complex actually checking that it did the right + # thing. For now just make sure that it runs. + iscsi.forceIScsiScan(30) + + def testTimeoutFlow(self): + #TODO: It's quite complex actually checking that it did the right + # thing. For now just make sure that it runs. + iscsi.forceIScsiScan(0) diff --git a/vdsm/config.py.in b/vdsm/config.py.in index df85e7e..5dbede3 100644 --- a/vdsm/config.py.in +++ b/vdsm/config.py.in @@ -176,11 +176,8 @@ 'Storage domain validate timeout, the maximum number of seconds ' 'to wait until all the domains will be validated.'),
- ('scsi_rescan_minimal_timeout', '2', - 'The minimum number of seconds to wait for scsi scan to return.'), - - ('scsi_rescan_maximal_timeout', '30', - 'The maximal number of seconds to wait for scsi scan to return.'), + ('scsi_rescan_timeout', '30', + 'The number of seconds to wait for scsi scan to return.'),
('sd_health_check_delay', '10', 'Storage domain health check delay, the amount of seconds to ' diff --git a/vdsm/storage/hsm.py b/vdsm/storage/hsm.py index a89274a..b54ee18 100644 --- a/vdsm/storage/hsm.py +++ b/vdsm/storage/hsm.py @@ -293,6 +293,13 @@
def storageRefresh(): lvm._lvminfo.bootstrap() + iscsiRescanTimeout = config.getint('irs', 'scsi_rescan_timeout') + if (iscsiRescanTimeout < 0): + self.log.warning("scsi_rescan_timeout as an invalid value, " + "using the default: %d", + sdCache.iscsiRescanTimeout) + + sdCache.iscsiRescanTimeout = iscsiRescanTimeout sdCache.refreshStorage()
self.tasksDir = config.get('irs', 'hsm_tasks') diff --git a/vdsm/storage/iscsi.py b/vdsm/storage/iscsi.py index 578b700..4e8f25a 100644 --- a/vdsm/storage/iscsi.py +++ b/vdsm/storage/iscsi.py @@ -29,10 +29,11 @@ import errno import time from collections import namedtuple +import select +import threading
from vdsm import constants import misc -from vdsm.config import config import devicemapper from threading import RLock
@@ -349,43 +350,52 @@ except iscsiadm.IscsiError: pass
+# FIXME: Because of sampling method semantics, the first caller in each round +# will select the timeout. This doesn't matter as currently the timeout never +# changes. This shows a fundamental problem with this interface. Changing this +# to something more generic will take to much work and is not necessary at the +# moment. @misc.samplingmethod -def forceIScsiScan(): - processes = [] - minTimeout = config.getint('irs', 'scsi_rescan_minimal_timeout') - maxTimeout = config.getint('irs', 'scsi_rescan_maximal_timeout') +def forceIScsiScan(timeout): + poller = select.poll() + procs = {} + for hba in glob.glob(SCAN_PATTERN): cmd = [constants.EXT_DD, 'of=' + hba] p = misc.execCmd(cmd, sudo=False, sync=False) p.stdin.write("- - -") p.stdin.flush() p.stdin.close() - processes.append((hba, p)) - if (minTimeout > maxTimeout or minTimeout < 0): - minTimeout = 2 - maxTimeout = 30 - log.warning("One of the following configuration arguments has an ", - "illegal value: scsi_rescan_minimal_timeout or ", - "scsi_rescan_maximal_timeout. Set to %s and %s seconds ", - "respectively.", minTimeout, maxTimeout) - log.debug("Performing SCSI scan, this will take up to %s seconds", - maxTimeout) - time.sleep(minTimeout) - for i in xrange(maxTimeout - minTimeout): - for p in processes[:]: - (hba, proc) = p - if proc.wait(0): - if proc.returncode != 0: - log.warning('returncode for: %s is: %s', hba, - proc.returncode) - processes.remove(p) - if not processes: - break - else: - time.sleep(1) - else: - log.warning("Still waiting for scsi scan of hbas: %s", - tuple(hba for p in processes)) + fd = p.stdout.fileno() + poller.register(fd, 0) # Only HUP and ERR + procs[fd] = p + + startTime = time.time() + timeLeft = timeout + while (len(procs) > 0) and (timeLeft > 0): + for fd, event in misc.NoIntrPoll(poller.poll, timeout): + poller.unregister(fd) + p = procs.pop(fd) + p.wait() + + timeLeft = timeout - (time.time() - startTime) + + if len(procs) == 0: + return + + log.warn("Not all rescan completed in alotted time") + + # We have to put on a thread that waits for the remaining processes, + # otherwise we will end up with zombie children + + def collectSlowProcs(procs): + for p in procs.itervalues(): + p.wait() + + t = threading.Thread(target=collectSlowProcs, args=(procs,)) + t.setDaemon(True) + t.start() +
def devIsiSCSI(dev): hostdir = os.path.realpath(os.path.join("/sys/block", dev, "device/../../..")) diff --git a/vdsm/storage/sdc.py b/vdsm/storage/sdc.py index 90ccc35..bc73291 100644 --- a/vdsm/storage/sdc.py +++ b/vdsm/storage/sdc.py @@ -64,6 +64,7 @@ self.__domainCache = {} self.storage_repo = storage_repo self.storageStale = True + self.iscsiRescanTimeout = 30
def invalidateStorage(self): self.storageStale = True @@ -71,7 +72,7 @@
@misc.samplingmethod def refreshStorage(self): - multipath.rescan() + multipath.rescan(self.iscsiRescanTimeout) lvm.invalidateCache() self.storageStale = False
-- To view, visit http://gerrit.ovirt.org/8172 To unsubscribe, visit http://gerrit.ovirt.org/settings
Gerrit-MessageType: newchange Gerrit-Change-Id: Ic4e7173086ba15c7706206c5ee1473ed6d334f9e Gerrit-PatchSet: 1 Gerrit-Project: vdsm Gerrit-Branch: master Gerrit-Owner: Saggi Mizrahi smizrahi@redhat.com