modules/core/plugin-container/src/main/java/org/rhq/core/pc/inventory/AvailabilityExecutor.java | 2 modules/core/plugin-container/src/main/java/org/rhq/core/pc/inventory/RuntimeDiscoveryExecutor.java | 71 ++++++---- 2 files changed, 48 insertions(+), 25 deletions(-)
New commits: commit b2226a282fbaa992775f5fa11cb138254ba6f23f Author: Jay Shaughnessy jshaughn@redhat.com Date: Tue May 29 12:13:52 2012 -0400
[Bug 826047 - Slow availability check can hang discovery] - Now uses proxy and applies configured avail timeout to getAvailability() calls. - Also, now only perform an avail check on server resources or if the current avail is NOT UP. (i.e. assume UP services are still UP, which eliminates a lot of avail checking for the standard case)
diff --git a/modules/core/plugin-container/src/main/java/org/rhq/core/pc/inventory/AvailabilityExecutor.java b/modules/core/plugin-container/src/main/java/org/rhq/core/pc/inventory/AvailabilityExecutor.java index 44a388d..b4d7106 100644 --- a/modules/core/plugin-container/src/main/java/org/rhq/core/pc/inventory/AvailabilityExecutor.java +++ b/modules/core/plugin-container/src/main/java/org/rhq/core/pc/inventory/AvailabilityExecutor.java @@ -67,7 +67,7 @@ public class AvailabilityExecutor implements Runnable, Callable<AvailabilityRepo // want to encourage people from changing this, we do not expose this "backdoor" system property as a // standard plugin configuration setting/agent preference - if someone wants to do this, they must // explicitly pass in -D to the JVM running the plugin container. - private static final int GET_AVAILABILITY_TIMEOUT; + static final int GET_AVAILABILITY_TIMEOUT; private static final Random RANDOM = new Random(); static { int timeout; diff --git a/modules/core/plugin-container/src/main/java/org/rhq/core/pc/inventory/RuntimeDiscoveryExecutor.java b/modules/core/plugin-container/src/main/java/org/rhq/core/pc/inventory/RuntimeDiscoveryExecutor.java index f69bdfb..978083a 100644 --- a/modules/core/plugin-container/src/main/java/org/rhq/core/pc/inventory/RuntimeDiscoveryExecutor.java +++ b/modules/core/plugin-container/src/main/java/org/rhq/core/pc/inventory/RuntimeDiscoveryExecutor.java @@ -36,15 +36,18 @@ import org.jetbrains.annotations.NotNull;
import org.rhq.core.clientapi.agent.PluginContainerException; import org.rhq.core.clientapi.server.discovery.InventoryReport; +import org.rhq.core.domain.measurement.Availability; import org.rhq.core.domain.measurement.AvailabilityType; import org.rhq.core.domain.resource.InventoryStatus; import org.rhq.core.domain.resource.Resource; +import org.rhq.core.domain.resource.ResourceCategory; import org.rhq.core.domain.resource.ResourceType; import org.rhq.core.pc.PluginContainer; import org.rhq.core.pc.PluginContainerConfiguration; import org.rhq.core.pc.plugin.PluginComponentFactory; +import org.rhq.core.pc.util.FacetLockType; +import org.rhq.core.pluginapi.availability.AvailabilityFacet; import org.rhq.core.pluginapi.inventory.ProcessScanResult; -import org.rhq.core.pluginapi.inventory.ResourceComponent; import org.rhq.core.pluginapi.inventory.ResourceDiscoveryComponent; import org.rhq.core.util.exception.ExceptionPackage; import org.rhq.core.util.exception.Severity; @@ -184,14 +187,6 @@ public class RuntimeDiscoveryExecutor implements Runnable, Callable<InventoryRep return; }
- ResourceComponent parentComponent = parentContainer.getResourceComponent(); - if (parentComponent == null) { - if (log.isDebugEnabled()) { - log.debug("Parent component for [" + parent + "] was null; cannot perform service scan."); - } - return; - } - // For each child resource type of the server, do a discovery for resources of that type Set<ResourceType> childResourceTypes = parent.getResourceType().getChildResourceTypes(); if (null == childResourceTypes || childResourceTypes.isEmpty()) { @@ -204,23 +199,51 @@ public class RuntimeDiscoveryExecutor implements Runnable, Callable<InventoryRep } }
- // Do a live check of availability here. This won't set the availability anywhere but will allow us - // to find nested resources, i.e. children of resources we've found during our recursive call - // to discoverForResource(). Without this live check, the availability of these newly discovered - // resources would be null, so we would just return without checking for their children. Also, we don't - // want to do discovery for a NOT UP parent. - AvailabilityType availability; - ClassLoader originalCL = Thread.currentThread().getContextClassLoader(); - try { - Thread.currentThread().setContextClassLoader(parentContainer.getResourceClassLoader()); - availability = parentComponent.getAvailability(); - } catch (Exception e) { - availability = AvailabilityType.DOWN; - } finally { - Thread.currentThread().setContextClassLoader(originalCL); + // At this point we used to always do a live check of availability. This buys us very little and + // costs us a lot. For a platform-rooted scan this ends up being an avail check for all but leaf + // nodes of the tree. That is costly on top of the discovery check itself, and is antithetical to + // the whole staggered avail-check approach we now have in place. We can't even update the container + // with the avail check result, because all changes in avail need to be detected and reported by the + // AvailabilityExecutor. We did the avail check for two reasons. If there is no current avail we + // need to establish one because it may be for a resource newly discovered by this scan, and we need to know + // if we can in turn perform discovery on it. We still need to do this check. The second was to perform + // discovery only on UP resources. We can keep this logic but just use the current availability + // stored in the container. It may be stale, but it is likely valid, as avail does not often change. + // An argument could be made to always use the currently stored avail, but currently we've decided + // to still perform the check in two cases: if the current avail is not UP or if the resource category is + // SERVER. This means we won't miss an opportunity to do discovery for stale DOWN resource, and we won't + // waste time doing discovery on a stale UP SERVER, which can be time consuming. Since most resources are + // SERVICEs, and also are typically UP and stay UP, perfoming checks in these two situations should + // not add much overhead. Finally, make sure to use facet proxy to do the avail check, this allows us to use + // a timeout, and therefore not hang discovery if the avail check is slow. + Availability currentAvailability = parentContainer.getAvailability(); + AvailabilityType currentAvailabilityType = (null == currentAvailability) ? AvailabilityType.DOWN + : currentAvailability.getAvailabilityType(); + + // If there is no current avail, or this is a SERVER, we must perfom the live check. + if (AvailabilityType.UP != currentAvailabilityType + || ResourceCategory.SERVER == parentContainer.getResource().getResourceType().getCategory()) { + + AvailabilityFacet parentComponent = null; + try { + parentComponent = parentContainer.createResourceComponentProxy(AvailabilityFacet.class, + FacetLockType.NONE, AvailabilityExecutor.GET_AVAILABILITY_TIMEOUT, true, false); + + } catch (PluginContainerException e) { + if (log.isDebugEnabled()) { + log.debug("Parent component for [" + parent + "] was null; cannot perform service scan."); + } + return; + } + + try { + currentAvailabilityType = parentComponent.getAvailability(); + } catch (Exception e) { + currentAvailabilityType = AvailabilityType.DOWN; + } }
- if (availability != AvailabilityType.UP) { + if (AvailabilityType.UP != currentAvailabilityType) { if (log.isDebugEnabled()) { log.debug("Availability of [" + parent + "] is not UP, cannot perform service scan on it."); }