Nir Soffer has uploaded a new change for review.
Change subject: resourceManager: Keep resource state if registerResource fails
......................................................................
resourceManager: Keep resource state if registerResource fails
Previous code was increasing resource activeUsers counter, but
exceptions raised after that caused the method to fail, leaving a locked
resources that nobody can release. Such locked resource may lead to
failure of any pool operation, making the host non-operational, and
requiring a restart of vdsm.
The failure in the field was caused by Python logging bug, raising
OSError when message was logged when log file was rotated. However, such
failure can happen everywhere, and locking code must be written in such
way that failure would never leave a resource locked.
This patch ensure that resource is added and activeUsers counter is
increased only if everything else was fine.
Since simulating logging error is hard, the tests monkeypatch the
RequestRef class to simulate a failure. This is little ugly, depending
on internal implementation detail, but I could not find a cleaner way.
Change-Id: I16abf41ebc8a8a99b292d38c945074752254a34b
Relates-To:
https://bugzilla.redhat.com/1065650
Signed-off-by: Nir Soffer <nsoffer(a)redhat.com>
---
M tests/resourceManagerTests.py
M vdsm/storage/resourceManager.py
2 files changed, 50 insertions(+), 9 deletions(-)
git pull ssh://gerrit.ovirt.org:29418/vdsm refs/changes/84/25284/1
diff --git a/tests/resourceManagerTests.py b/tests/resourceManagerTests.py
index 01b0669..e2b6461 100644
--- a/tests/resourceManagerTests.py
+++ b/tests/resourceManagerTests.py
@@ -29,6 +29,7 @@
import storage.resourceManager as resourceManager
from testrunner import VdsmTestCase as TestCaseBase
from testValidation import slowtest, stresstest
+import monkeypatch
class NullResourceFactory(resourceManager.SimpleResourceFactory):
@@ -209,6 +210,32 @@
ex.__class__.__name__)
self.fail("Managed to access an attribute not exposed by wrapper")
+
+ def testRegisterResourceFailureExclusive(self):
+ # This regeisterion must fail
+ with monkeypatch.MonkeyPatchScope(
+ [(resourceManager, 'RequestRef', FakeRequestRef)]):
+ self.assertRaises(Failure, self.manager.registerResource,
"string",
+ "test", resourceManager.LockType.exclusive,
None)
+
+ # And it should not leave a locked resource
+ with self.manager.acquireResource("string", "test",
+ resourceManager.LockType.exclusive,
+ 0):
+ pass
+
+ def testRegisterResourceFailureShared(self):
+ # This regeisterion must fail
+ with monkeypatch.MonkeyPatchScope(
+ [(resourceManager, 'RequestRef', FakeRequestRef)]):
+ self.assertRaises(Failure, self.manager.registerResource,
"string",
+ "test", resourceManager.LockType.shared, None)
+
+ # And it should not leave a locked resource
+ with self.manager.acquireResource("string", "test",
+ resourceManager.LockType.exclusive,
+ 0):
+ pass
def testAccessAttributeNotExposedByRequestRef(self):
resources = []
@@ -705,3 +732,14 @@
except:
resourceManager.ResourceManager._instance = None
raise
+
+# Helpers
+
+
+class Failure(Exception):
+ """ Unique exception for testing """
+
+
+def FakeRequestRef(*a, **kw):
+ """ Used to simulate failures when registering a resource
"""
+ raise Failure()
diff --git a/vdsm/storage/resourceManager.py b/vdsm/storage/resourceManager.py
index 1be1450..ce144cf 100644
--- a/vdsm/storage/resourceManager.py
+++ b/vdsm/storage/resourceManager.py
@@ -559,23 +559,25 @@
if len(resource.queue) == 0 and \
resource.currentLock == LockType.shared and \
request.lockType == LockType.shared:
- resource.activeUsers += 1
self._log.debug("Resource '%s' found in shared state
"
"and queue is empty, Joining current "
"shared lock (%d active users)",
- fullName, resource.activeUsers)
+ fullName, resource.activeUsers + 1)
request.grant()
+ ref = RequestRef(request)
contextCleanup.defer(request.emit,
ResourceRef(namespace, name,
resource.realObj,
request.reqID))
- return RequestRef(request)
+ resource.activeUsers += 1
+ return ref
- resource.queue.insert(0, request)
self._log.debug("Resource '%s' is currently locked,
"
"Entering queue (%d in queue)",
- fullName, len(resource.queue))
- return RequestRef(request)
+ fullName, len(resource.queue) + 1)
+ ref = RequestRef(request)
+ resource.queue.insert(0, request)
+ return ref
# TODO : Creating the object inside the namespace lock causes
# the entire namespace to lock and might cause
@@ -592,19 +594,20 @@
contextCleanup.defer(request.cancel)
return RequestRef(request)
- resource = resources[name] = ResourceManager.ResourceInfo(
- obj, namespace, name)
+ resource = ResourceManager.ResourceInfo(obj, namespace, name)
resource.currentLock = request.lockType
resource.activeUsers += 1
self._log.debug("Resource '%s' is free. Now locking as
'%s' "
"(1 active user)", fullName, request.lockType)
request.grant()
+ ref = RequestRef(request)
contextCleanup.defer(request.emit,
ResourceRef(namespace, name,
resource.realObj,
request.reqID))
- return RequestRef(request)
+ resources[name] = resource
+ return ref
def releaseResource(self, namespace, name):
# WARN : unlike in resource acquire the user now has the request
--
To view, visit
http://gerrit.ovirt.org/25284
To unsubscribe, visit
http://gerrit.ovirt.org/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I16abf41ebc8a8a99b292d38c945074752254a34b
Gerrit-PatchSet: 1
Gerrit-Project: vdsm
Gerrit-Branch: master
Gerrit-Owner: Nir Soffer <nsoffer(a)redhat.com>