Adam Litke has uploaded a new change for review.
Change subject: Live Merge: fix race between libvirt event and vm stats
......................................................................
Live Merge: fix race between libvirt event and vm stats
It's possible for a block job to stop being reported by libvirt for a short period before the completion event is actually emitted. In this case, vdsm might clean up a block job too early and then be unable to handle the libvirt completion event. To fix it, handle the following two cases:
1) In merge(), between trackBlockJob() and the libvirt call:
We create a placeholder job in vm.conf['_blockJobs'] before we start the libvirt job, and take a lock to ensure that queryBlockJobs() doesn't ask libvirt about this job and find it missing before the job has actually started.
2) In queryBlockJobs():
If a job is gone according to libvirt, just mark it 'done' so we avoid polling it in the future, but let _onBlockJobEvent call untrackBlockJob() only after the cleanup work is completed. Continue reporting the job so the engine will wait for us to complete the synchronization later. If we miss an event from libvirt (because vdsm was stopped), the situation is handled by vm recovery.
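The two cases above can be sketched as a minimal, self-contained pattern (illustrative names only; the real code lives in vdsm/virt/vm.py and uses self.conf['_blockJobs'] and self._jobsLock):

```python
import threading

class BlockJobTracker(object):
    """Minimal sketch of the scheme above; not the actual vdsm code."""

    def __init__(self, dom):
        self._dom = dom                     # stand-in for the libvirt domain
        self._jobs = {}                     # stands in for conf['_blockJobs']
        self._jobs_lock = threading.Lock()  # stands in for self._jobsLock

    def merge(self, job_id, drive):
        # Case 1: record a placeholder under the lock *before* asking
        # libvirt to start the job, so a concurrent query cannot miss it.
        with self._jobs_lock:
            self._jobs[job_id] = {'jobID': job_id, 'drive': drive}
            self._dom.blockCommit(drive)

    def query(self):
        # Case 2: if libvirt no longer reports the job, only mark it
        # 'done'; keep reporting it until the completion event handler
        # finishes cleanup and untracks it.
        result = {}
        with self._jobs_lock:
            for job in list(self._jobs.values()):
                info = None
                if 'done' not in job:
                    info = self._dom.blockJobInfo(job['drive'])
                    if not info:
                        job['done'] = True
                entry = {'id': job['jobID'], 'cur': '0', 'end': '0'}
                if info:
                    entry['cur'] = str(info['cur'])
                    entry['end'] = str(info['end'])
                result[job['jobID']] = entry
        return result

    def on_block_job_event(self, job_id):
        # Completion event handler: untrack only after cleanup is done.
        with self._jobs_lock:
            self._jobs.pop(job_id, None)
```

A 'done' job therefore keeps showing up in query() output, with zeroed progress, until the event handler removes it; if the event never arrives (vdsm was stopped), recovery takes over.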
Change-Id: Ic0314a39428419bb21bc579f07a92227d5a7acff
Signed-off-by: Adam Litke <alitke@redhat.com>
---
M vdsm/virt/vm.py
1 file changed, 46 insertions(+), 45 deletions(-)
git pull ssh://gerrit.ovirt.org:29418/vdsm refs/changes/09/29309/1
diff --git a/vdsm/virt/vm.py b/vdsm/virt/vm.py
index 84370be..2deed52 100644
--- a/vdsm/virt/vm.py
+++ b/vdsm/virt/vm.py
@@ -1784,11 +1784,6 @@
                       (SMARTCARD_DEVICES, SmartCardDevice),
                       (TPM_DEVICES, TpmDevice))
-    BlockJobTypeMap = {libvirt.VIR_DOMAIN_BLOCK_JOB_TYPE_UNKNOWN: 'unknown',
-                       libvirt.VIR_DOMAIN_BLOCK_JOB_TYPE_PULL: 'pull',
-                       libvirt.VIR_DOMAIN_BLOCK_JOB_TYPE_COPY: 'copy',
-                       libvirt.VIR_DOMAIN_BLOCK_JOB_TYPE_COMMIT: 'commit'}
-
     def _makeDeviceDict(self):
         return dict((dev, []) for dev, _ in self.DeviceMapping)
@@ -5432,7 +5427,7 @@
         except LookupError:
             newJob = {'jobID': jobID, 'disk': driveSpec,
                       'baseVolume': base, 'topVolume': top,
-                      'strategy': strategy,
+                      'strategy': strategy, 'blockJobType': 'commit',
                       'chain': self._driveGetActualVolumeChain(drive)}
             self.conf['_blockJobs'][jobID] = newJob
         else:
@@ -5455,29 +5450,33 @@
     def queryBlockJobs(self):
         jobs = {}
-        for jobID, job in self.conf['_blockJobs'].iteritems():
-            drive = self._findDriveByUUIDs(job['disk'])
-            ret = self._dom.blockJobInfo(drive.name, 0)
-            if not ret:
-                self.log.debug("Block Job for vm:%s, img:%s has ended",
-                               self.conf['vmId'], job['disk']['imageID'])
-                jobs[jobID] = None
-                continue
-
-            jobs[jobID] = {'id': jobID, 'jobType': 'block',
-                           'blockJobType': Vm.BlockJobTypeMap[ret['type']],
-                           'bandwidth': ret['bandwidth'],
-                           'cur': str(ret['cur']), 'end': str(ret['end']),
-                           'imgUUID': job['disk']['imageID']}
-
-        # This function is meant to be called from multiple threads (ie.
-        # VMStatsThread and API calls.  The _jobsLock ensures that a cohesive
-        # data set is returned by serializing each call.
+        # We need to take the jobs lock here to ensure that we don't race with
+        # another call to merge() where the job has been recorded but not yet
+        # started.
         with self._jobsLock:
-            for jobID in jobs.keys():
-                if jobs[jobID] is None:
-                    self.untrackBlockJob(jobID)
-                    del jobs[jobID]
+            for job in self.conf['_blockJobs'].values()[:]:
+                jobID = job['jobID']
+                drive = self._findDriveByUUIDs(job['disk'])
+
+                ret = None
+                if 'done' not in job:
+                    ret = self._dom.blockJobInfo(drive.name, 0)
+                    if not ret:
+                        self.log.debug("Block Job for vm:%s, img:%s has ended",
+                                       self.conf['vmId'],
+                                       job['disk']['imageID'])
+                        job['done'] = True
+
+                entry = {'id': jobID, 'jobType': 'block',
+                         'blockJobType': job['blockJobType'],
+                         'bandwidth': 0, 'cur': '0', 'end': '0',
+                         'imgUUID': job['disk']['imageID']}
+
+                if ret:
+                    entry['bandwidth'] = ret['bandwidth']
+                    entry['cur'] = str(ret['cur'])
+                    entry['end'] = str(ret['end'])
+                jobs[jobID] = entry
         return jobs
     def merge(self, driveSpec, baseVolUUID, topVolUUID, bandwidth, jobUUID):
@@ -5543,24 +5542,26 @@
                            "missing VIR_DOMAIN_BLOCK_COMMIT_ACTIVE.")
             return errCode['mergeErr']
-        try:
-            self.trackBlockJob(jobUUID, drive, baseVolUUID, topVolUUID,
-                               'commit')
-        except BlockJobExistsError:
-            self.log.error("Another block job is already active on this disk")
-            return errCode['mergeErr']
-        self.log.debug("Starting merge with jobUUID='%s'", jobUUID)
+        # Take the jobs lock here to protect the new job we are tracking from
+        # being cleaned up by queryBlockJobs() since it won't exist right away
+        with self._jobsLock:
+            try:
+                self.trackBlockJob(jobUUID, drive, baseVolUUID, topVolUUID,
+                                   'commit')
+            except BlockJobExistsError:
+                self.log.error("A block job is already active on this disk")
+                return errCode['mergeErr']
+            self.log.debug("Starting merge with jobUUID='%s'", jobUUID)
-        try:
-            ret = self._dom.blockCommit(drive.path, base, top, bandwidth,
-                                        flags)
-            if ret != 0:
-                raise RuntimeError("blockCommit operation failed rc:%i", ret)
-        except (RuntimeError, libvirt.libvirtError):
-            self.log.error("Live merge failed for '%s'", drive.path,
-                           exc_info=True)
-            self.untrackBlockJob(jobUUID)
-            return errCode['mergeErr']
+            try:
+                ret = self._dom.blockCommit(drive.path, base, top, bandwidth,
+                                            flags)
+                if ret != 0:
+                    raise RuntimeError("blockCommit failed rc:%i", ret)
+            except (RuntimeError, libvirt.libvirtError):
+                self.log.error("Live merge failed for '%s'", drive.path)
+                self.untrackBlockJob(jobUUID)
+                return errCode['mergeErr']
         # blockCommit will cause data to be written into the base volume.
         # Perform an initial extension to ensure there is enough space to
oVirt Jenkins CI Server has posted comments on this change.
Change subject: Live Merge: fix race between libvirt event and vm stats
......................................................................
Patch Set 1:
Build Failed
http://jenkins.ovirt.org/job/vdsm_master_unit_tests_gerrit_el/9704/ : SUCCESS
http://jenkins.ovirt.org/job/vdsm_master_pep8_gerrit/10489/ : SUCCESS
http://jenkins.ovirt.org/job/vdsm_master_virt_functional_tests_gerrit/1037/ : There was an infra issue, please contact infra@ovirt.org
http://jenkins.ovirt.org/job/vdsm_master_unit-tests_created/10645/ : SUCCESS
Adam Litke has posted comments on this change.
Change subject: Live Merge: fix race between libvirt event and vm stats
......................................................................
Patch Set 1: Verified+1
Greg Padgett has posted comments on this change.
Change subject: Live Merge: fix race between libvirt event and vm stats
......................................................................
Patch Set 1: Verified+1 Code-Review+1
oVirt Jenkins CI Server has posted comments on this change.
Change subject: Live Merge: fix race between libvirt event and vm stats
......................................................................
Patch Set 2:
Build Failed
http://jenkins.ovirt.org/job/vdsm_master_unit_tests_gerrit_el/9820/ : SUCCESS
http://jenkins.ovirt.org/job/vdsm_master_pep8_gerrit/10605/ : SUCCESS
http://jenkins.ovirt.org/job/vdsm_master_virt_functional_tests_gerrit/1089/ : There was an infra issue, please contact infra@ovirt.org
http://jenkins.ovirt.org/job/vdsm_master_unit-tests_created/10762/ : SUCCESS
Federico Simoncelli has posted comments on this change.
Change subject: Live Merge: fix race between libvirt event and vm stats
......................................................................
Patch Set 2: Code-Review-1
(7 comments)
Minor comments, mostly just questions. Marking -1 for visibility.
http://gerrit.ovirt.org/#/c/29309/2/vdsm/virt/vm.py
File vdsm/virt/vm.py:
Line 5430:             job = self.getBlockJob(drive)
Line 5431:         except LookupError:
Line 5432:             newJob = {'jobID': jobID, 'disk': driveSpec,
Line 5433:                       'baseVolume': base, 'topVolume': top,
Line 5434:                       'strategy': strategy, 'blockJobType': 'commit',

Is this change (and the BlockJobTypeMap one) unrelated?

Line 5435:                       'chain': self._driveGetActualVolumeChain(drive)}
Line 5436:             self.conf['_blockJobs'][jobID] = newJob
Line 5437:         else:
Line 5438:             self.log.error("A block job with id %s already exists for vm "
Line 5452:         self.saveState()
Line 5453:         return True
Line 5454: 
Line 5455:     def queryBlockJobs(self):
Line 5456:         jobs = {}

"jobs" and "job" get me confused; we could try to find a longer, more descriptive name for this dictionary.

Line 5457:         # We need to take the jobs lock here to ensure that we don't race with
Line 5458:         # another call to merge() where the job has been recorded but not yet
Line 5459:         # started.
Line 5460:         with self._jobsLock:
Line 5457:         # We need to take the jobs lock here to ensure that we don't race with
Line 5458:         # another call to merge() where the job has been recorded but not yet
Line 5459:         # started.
Line 5460:         with self._jobsLock:
Line 5461:             for job in self.conf['_blockJobs'].values()[:]:

I think "[:]" is redundant; values() should already create a new list.

Line 5462:                 jobID = job['jobID']
Line 5463:                 drive = self._findDriveByUUIDs(job['disk'])
Line 5464: 
Line 5465:                 ret = None
Line 5460:         with self._jobsLock:
Line 5461:             for job in self.conf['_blockJobs'].values()[:]:
Line 5462:                 jobID = job['jobID']
Line 5463:                 drive = self._findDriveByUUIDs(job['disk'])
Line 5464: 

Could it make sense to prepare the entry here?

    entry = {'id': jobID, 'jobType': 'block', ...

Line 5465:                 ret = None
Line 5466:                 if 'done' not in job:
Line 5467:                     ret = self._dom.blockJobInfo(drive.name, 0)
Line 5468:                     if not ret:
Line 5468:                     if not ret:
Line 5469:                         self.log.debug("Block Job for vm:%s, img:%s has ended",
Line 5470:                                        self.conf['vmId'],
Line 5471:                                        job['disk']['imageID'])
Line 5472:                         job['done'] = True

If you prepared the entry at 5464, here you can set the bandwidth and all the rest, e.g.:

    if ret:
        entry['bandwidth'] = ret['bandwidth']
        entry['cur'] = str(ret['cur'])
        entry['end'] = str(ret['end'])
    else:
        self.log.debug(...)
        job['done'] = True

Line 5473: 
Line 5474:                 entry = {'id': jobID, 'jobType': 'block',
Line 5475:                          'blockJobType': job['blockJobType'],
Line 5476:                          'bandwidth': 0, 'cur': '0', 'end': '0',
Line 5470:                                        self.conf['vmId'],
Line 5471:                                        job['disk']['imageID'])
Line 5472:                         job['done'] = True
Line 5473: 
Line 5474:                 entry = {'id': jobID, 'jobType': 'block',

Now you can remove all these, from line 5474 to 5483.

Line 5475:                          'blockJobType': job['blockJobType'],
Line 5476:                          'bandwidth': 0, 'cur': '0', 'end': '0',
Line 5477:                          'imgUUID': job['disk']['imageID']}
Line 5478: 
Line 5557:                                             flags)
Line 5558:                 if ret != 0:
Line 5559:                     raise RuntimeError("blockCommit failed rc:%i", ret)
Line 5560:             except (RuntimeError, libvirt.libvirtError):
Line 5561:                 self.log.error("Live merge failed for '%s'", drive.path)

self.log.exception?

Line 5562:                 self.untrackBlockJob(jobUUID)
Line 5563:                 return errCode['mergeErr']
Line 5564: 
Line 5565:         # blockCommit will cause data to be written into the base volume.
Adam Litke has posted comments on this change.
Change subject: Live Merge: fix race between libvirt event and vm stats
......................................................................
Patch Set 2: -Verified
(6 comments)
http://gerrit.ovirt.org/#/c/29309/2/vdsm/virt/vm.py
File vdsm/virt/vm.py:
Line 5430:             job = self.getBlockJob(drive)
Line 5431:         except LookupError:
Line 5432:             newJob = {'jobID': jobID, 'disk': driveSpec,
Line 5433:                       'baseVolume': base, 'topVolume': top,
Line 5434:                       'strategy': strategy, 'blockJobType': 'commit',
Is this change (and the BlockJobTypeMap one) unrelated?
No, this is part of the patch because we need to create a placeholder job when libvirt isn't reporting the information but vdsm still knows the job is either starting or being cleaned up. In that case we won't have libvirt to tell us what kind of job it is, so we need to start tracking it ourselves. Since we are tracking it ourselves, we will just always use our info and no longer need to pay attention to the libvirt job type when sampling.

Line 5435:                       'chain': self._driveGetActualVolumeChain(drive)}
Line 5436:             self.conf['_blockJobs'][jobID] = newJob
Line 5437:         else:
Line 5438:             self.log.error("A block job with id %s already exists for vm "
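The point above, recording the job type once at creation instead of mapping libvirt's numeric type on every poll, can be sketched as follows (illustrative only, not the vdsm code; the integer values mirror libvirt's VIR_DOMAIN_BLOCK_JOB_TYPE_* constants):

```python
# Before the patch: the type had to be resolved from each libvirt poll
# result, so it was unavailable whenever blockJobInfo() returned
# nothing (e.g. just before the completion event).
BLOCK_JOB_TYPE_MAP = {0: 'unknown', 1: 'pull', 2: 'copy', 3: 'commit'}

def entry_from_poll(job_id, ret):
    # ret stands in for the dict returned by libvirt's blockJobInfo()
    return {'id': job_id, 'blockJobType': BLOCK_JOB_TYPE_MAP[ret['type']]}

# After the patch: the type is recorded when the job is tracked, so it
# remains available while libvirt transiently reports nothing.
def entry_from_tracked(job):
    return {'id': job['jobID'], 'blockJobType': job['blockJobType']}
```

With the second form, a placeholder job can be reported correctly even when there is no poll result at all, which is why BlockJobTypeMap could be dropped.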
Line 5452:         self.saveState()
Line 5453:         return True
Line 5454: 
Line 5455:     def queryBlockJobs(self):
Line 5456:         jobs = {}
"jobs" and "job" gets me confused, we could try to find a longer, more desc
ok, will change to jobsRet and jobInfo

Line 5457:         # We need to take the jobs lock here to ensure that we don't race with
Line 5458:         # another call to merge() where the job has been recorded but not yet
Line 5459:         # started.
Line 5460:         with self._jobsLock:
Line 5457:         # We need to take the jobs lock here to ensure that we don't race with
Line 5458:         # another call to merge() where the job has been recorded but not yet
Line 5459:         # started.
Line 5460:         with self._jobsLock:
Line 5461:             for job in self.conf['_blockJobs'].values()[:]:
I think "[:]" is redundant, values() should already create a new list.
ok.

Line 5462:                 jobID = job['jobID']
Line 5463:                 drive = self._findDriveByUUIDs(job['disk'])
Line 5464: 
Line 5465:                 ret = None
Line 5460:         with self._jobsLock:
Line 5461:             for job in self.conf['_blockJobs'].values()[:]:
Line 5462:                 jobID = job['jobID']
Line 5463:                 drive = self._findDriveByUUIDs(job['disk'])
Line 5464: 
Could it make sense to prepare the entry here?
sure. will rework it.

Line 5465:                 ret = None
Line 5466:                 if 'done' not in job:
Line 5467:                     ret = self._dom.blockJobInfo(drive.name, 0)
Line 5468:                     if not ret:
Line 5470:                                        self.conf['vmId'],
Line 5471:                                        job['disk']['imageID'])
Line 5472:                         job['done'] = True
Line 5473: 
Line 5474:                 entry = {'id': jobID, 'jobType': 'block',
Now you can remove all these, from line 5474 to 5483.
ok, sure.

Line 5475:                          'blockJobType': job['blockJobType'],
Line 5476:                          'bandwidth': 0, 'cur': '0', 'end': '0',
Line 5477:                          'imgUUID': job['disk']['imageID']}
Line 5478: 
Line 5557:                                             flags)
Line 5558:                 if ret != 0:
Line 5559:                     raise RuntimeError("blockCommit failed rc:%i", ret)
Line 5560:             except (RuntimeError, libvirt.libvirtError):
Line 5561:                 self.log.error("Live merge failed for '%s'", drive.path)
self.log.exception?
ok

Line 5562:                 self.untrackBlockJob(jobUUID)
Line 5563:                 return errCode['mergeErr']
Line 5564: 
Line 5565:         # blockCommit will cause data to be written into the base volume.
Michal Skrivanek has posted comments on this change.
Change subject: Live Merge: fix race between libvirt event and vm stats
......................................................................
Patch Set 2:
(1 comment)
Keeping "0" as it already is an improvement anyway. But I'd love to understand why we need to "trigger" stats from outside (an API call), and hence why we need this whole locking thing.
http://gerrit.ovirt.org/#/c/29309/2/vdsm/virt/vm.py
File vdsm/virt/vm.py:
Line 5456:         jobs = {}
Line 5457:         # We need to take the jobs lock here to ensure that we don't race with
Line 5458:         # another call to merge() where the job has been recorded but not yet
Line 5459:         # started.
Line 5460:         with self._jobsLock:

So there are only two places where queryBlockJobs() is called: the stats thread and the end of merge(). Why do we have that "trigger" in merge()? Can't we live without it? If yes, then we don't need any locking here.

Is the concern a possibly incorrect "view" for the engine the first time stats return? Couldn't this be solved on the engine side?

Line 5461:             for job in self.conf['_blockJobs'].values()[:]:
Line 5462:                 jobID = job['jobID']
Line 5463:                 drive = self._findDriveByUUIDs(job['disk'])
Line 5464: 
Adam Litke has posted comments on this change.
Change subject: Live Merge: fix race between libvirt event and vm stats
......................................................................
Patch Set 2:
(1 comment)
http://gerrit.ovirt.org/#/c/29309/2/vdsm/virt/vm.py
File vdsm/virt/vm.py:
Line 5456:         jobs = {}
Line 5457:         # We need to take the jobs lock here to ensure that we don't race with
Line 5458:         # another call to merge() where the job has been recorded but not yet
Line 5459:         # started.
Line 5460:         with self._jobsLock:
so there are only 2 places where the queryBlockJobs is called - stat thread
You're right, the main issue is the engine polling the stats and seeing no vmJob in the list. This could mean several things: the stats are outdated, the job hasn't started yet, or the job has already completed. It's important for us to guarantee to the engine that once the merge verb returns success, the job will appear in the stats until it has finished.

Line 5461:             for job in self.conf['_blockJobs'].values()[:]:
Line 5462:                 jobID = job['jobID']
Line 5463:                 drive = self._findDriveByUUIDs(job['disk'])
Line 5464: 
Michal Skrivanek has posted comments on this change.
Change subject: Live Merge: fix race between libvirt event and vm stats
......................................................................
Patch Set 2:
(1 comment)
http://gerrit.ovirt.org/#/c/29309/2/vdsm/virt/vm.py
File vdsm/virt/vm.py:
Line 5456:         jobs = {}
Line 5457:         # We need to take the jobs lock here to ensure that we don't race with
Line 5458:         # another call to merge() where the job has been recorded but not yet
Line 5459:         # started.
Line 5460:         with self._jobsLock:
You're right, the main issue is engine polling on the stats and seeing no v
Well, it is a different approach from other commands. Even with your current design, how do you ensure that you report anything if the job finishes before you even get to the end of merge() and call the first stats cycle?

Line 5461:             for job in self.conf['_blockJobs'].values()[:]:
Line 5462:                 jobID = job['jobID']
Line 5463:                 drive = self._findDriveByUUIDs(job['disk'])
Line 5464: 
Adam Litke has posted comments on this change.
Change subject: Live Merge: fix race between libvirt event and vm stats
......................................................................
Patch Set 2:
(1 comment)
http://gerrit.ovirt.org/#/c/29309/2/vdsm/virt/vm.py
File vdsm/virt/vm.py:
Line 5456:         jobs = {}
Line 5457:         # We need to take the jobs lock here to ensure that we don't race with
Line 5458:         # another call to merge() where the job has been recorded but not yet
Line 5459:         # started.
Line 5460:         with self._jobsLock:
Well, it is a different approach from other commands.
In that case it's no problem (but please see my updated livemerge topic branch for the most recent code wrt this). That first call of the stats cycle would find the placeholder job and query libvirt. Since the job is done, libvirt will return nothing and vdsm will trigger the recovery logic.

Line 5461:             for job in self.conf['_blockJobs'].values()[:]:
Line 5462:                 jobID = job['jobID']
Line 5463:                 drive = self._findDriveByUUIDs(job['disk'])
Line 5464: 
Michal Skrivanek has posted comments on this change.
Change subject: Live Merge: fix race between libvirt event and vm stats
......................................................................
Patch Set 2:
(1 comment)
http://gerrit.ovirt.org/#/c/29309/2/vdsm/virt/vm.py
File vdsm/virt/vm.py:
Line 5456:         jobs = {}
Line 5457:         # We need to take the jobs lock here to ensure that we don't race with
Line 5458:         # another call to merge() where the job has been recorded but not yet
Line 5459:         # started.
Line 5460:         with self._jobsLock:
In that case it's no problem (but please see my updated livemerge topic bra
So can't this be used as a general solution to the timing? Then you wouldn't have to do this first stats cycle here; I think eliminating the lock is worth it.

Line 5461:             for job in self.conf['_blockJobs'].values()[:]:
Line 5462:                 jobID = job['jobID']
Line 5463:                 drive = self._findDriveByUUIDs(job['disk'])
Line 5464: 
Adam Litke has abandoned this change.
Change subject: Live Merge: fix race between libvirt event and vm stats
......................................................................
Abandoned
vdsm-patches@lists.fedorahosted.org