Repository :
http://git.fedorahosted.org/cgit/copr.git
On branch : master
---------------------------------------------------------------
commit c66e6e28fc58682b24b8a0bbbc6589f2762cac8f
Author: Valentin Gologuzov <vgologuz(a)redhat.com>
Date: Fri Jan 30 10:58:05 2015 +0100
[backend] [rhbz:#1073333] Record consecutive build failures in redis. Added a script
that produces warnings for a nagios check from the failures recorded in redis.
---------------------------------------------------------------
backend/backend/constants.py | 4 ++
backend/backend/daemons/dispatcher.py | 8 ++++
backend/backend/helpers.py | 40 +++++++++++++++++-
backend/run/check_consecutive_build_fails.py | 43 ++++++++++++++++++++
backend/tests/deamons/test_dispatcher.py | 2 +
.../tests/test_logic/test_builds_logic.py | 6 ++-
6 files changed, 99 insertions(+), 4 deletions(-)
diff --git a/backend/backend/constants.py b/backend/backend/constants.py
index e6e01de..717ff27 100644
--- a/backend/backend/constants.py
+++ b/backend/backend/constants.py
@@ -14,6 +14,10 @@ DEF_MACROS = {}
DEF_BUILDROOT_PKGS = ""
+DEF_CONSECUTIVE_FAILURE_THRESHOLD = 10
+CONSECUTIVE_FAILURE_REDIS_KEY = "copr:sys:consecutive_build_fails"
+
+
class BuildStatus(object):
FAILURE = 0
SUCCEEDED = 1
diff --git a/backend/backend/daemons/dispatcher.py b/backend/backend/daemons/dispatcher.py
index 33facc0..2cfb086 100644
--- a/backend/backend/daemons/dispatcher.py
+++ b/backend/backend/daemons/dispatcher.py
@@ -18,6 +18,8 @@ from setproctitle import setproctitle
from IPy import IP
from retask.queue import Queue
+
+
from ..mockremote.callback import CliLogCallBack
from ..exceptions import MockRemoteError, CoprWorkerError, CoprWorkerSpawnFailError
@@ -26,6 +28,7 @@ from ..job import BuildJob
from ..mockremote import MockRemote
from ..frontend import FrontendClient
from ..constants import BuildStatus
+from ..helpers import register_build_result
ansible_playbook = "ansible-playbook"
@@ -469,6 +472,8 @@ class Worker(multiprocessing.Process):
raise CoprWorkerError(
"No IP found from creating instance")
except AnsibleError as e:
+ register_build_result(self.opts, failed=True)
+
self.callback.log("failure to setup instance: {0}".format(e))
raise
@@ -600,10 +605,13 @@ class Worker(multiprocessing.Process):
if self.opts.do_sign:
mr.add_pubkey()
+ register_build_result(self.opts)
+
except MockRemoteError as e:
# record and break
self.callback.log("{0} - {1}".format(self.vm_ip, e))
status = BuildStatus.FAILURE
+ register_build_result(self.opts, failed=True)
self.callback.log(
"Finished build: id={0} builder={1} timeout={2} destdir={3}"
diff --git a/backend/backend/helpers.py b/backend/backend/helpers.py
index 62f7ad9..8554c8b 100644
--- a/backend/backend/helpers.py
+++ b/backend/backend/helpers.py
@@ -16,12 +16,20 @@ import datetime
from copr.client import CoprClient
-from backend.constants import DEF_BUILD_USER, DEF_BUILD_TIMEOUT
+from backend.constants import DEF_BUILD_USER, DEF_BUILD_TIMEOUT, DEF_CONSECUTIVE_FAILURE_THRESHOLD, \
+ CONSECUTIVE_FAILURE_REDIS_KEY
from backend.exceptions import CoprBackendError
+from redis import StrictRedis
+
+try:
+ import fedmsg
+except ImportError:
+ # fedmsg is optional
+ fedmsg = None
-class SortedOptParser(optparse.OptionParser):
+class SortedOptParser(optparse.OptionParser):
"""Optparser which sorts the options by opt before outputting
--help"""
def format_help(self, formatter=None):
@@ -146,8 +154,13 @@ class BackendConfigReader(object):
cp, "backend", "sleeptime", 10, mode="int")
opts.timeout = _get_conf(
cp, "builder", "timeout", DEF_BUILD_TIMEOUT,
mode="int")
+ opts.consecutive_failure_threshold = _get_conf(
+ cp, "builder", "consecutive_failure_threshold",
+ DEF_CONSECUTIVE_FAILURE_THRESHOLD, mode="int")
opts.logfile = _get_conf(
cp, "backend", "logfile",
"/var/log/copr/backend.log")
+ opts.error_logfile = _get_conf(
+ cp, "backend", "error_logfile", "/var/log/copr/backend_error.log")
opts.verbose = _get_conf(
cp, "backend", "verbose", False, mode="bool")
opts.worker_logdir = _get_conf(
@@ -196,3 +209,26 @@ def log(lf, msg, quiet=None):
"Could not write to logfile {0} - {1}\n".format(lf, str(e)))
if not quiet:
print(msg)
+
+
+def register_build_result(opts=None, failed=False):
+ """
+ Remember fails to redis.
+ Successful build resets counter to zero.
+
+ :param opts: BackendConfig, when opts not provided default config location will be used
+ :param boolean failed: failure flag
+ :param str origin: name of component produced failure, default: `builder`
+ """
+ if opts is None:
+ opts = BackendConfigReader().read()
+
+ # TODO: add config options to specify redis host, port
+ conn = StrictRedis() # connecting to default local redis instance
+
+ key = CONSECUTIVE_FAILURE_REDIS_KEY
+ if not failed:
+ conn.set(key, 0)
+ else:
+ conn.incr(key)
+
diff --git a/backend/run/check_consecutive_build_fails.py b/backend/run/check_consecutive_build_fails.py
new file mode 100755
index 0000000..8e8e26a
--- /dev/null
+++ b/backend/run/check_consecutive_build_fails.py
@@ -0,0 +1,43 @@
+#!/usr/bin/python -tt
+# coding: utf-8
+
+from __future__ import print_function
+from __future__ import unicode_literals
+from __future__ import division
+from __future__ import absolute_import
+
+import sys
+
+from redis import StrictRedis
+
+sys.path.append("/usr/share/copr/")
+
+
+from backend.helpers import BackendConfigReader
+from backend.constants import CONSECUTIVE_FAILURE_REDIS_KEY
+
+
+def main():
+ opts = BackendConfigReader().read()
+ conn = StrictRedis() # connecting to default local redis instance
+
+ key = CONSECUTIVE_FAILURE_REDIS_KEY
+
+ value = int(conn.get(key) or 0)
+ if value > opts.consecutive_failure_threshold:
+ print("Critical")
+ sys.exit(2)
+ elif value > int(0.5 * opts.consecutive_failure_threshold):
+ print("Warning")
+ sys.exit(1)
+ else:
+ print("OK")
+ sys.exit(0)
+
+
+if __name__ == "__main__":
+ try:
+ main()
+ except Exception as error:
+ print("UNKNOWN: {}".format(error))
+ sys.exit(3)
diff --git a/backend/tests/deamons/test_dispatcher.py b/backend/tests/deamons/test_dispatcher.py
index b50f900..c56bbd4 100644
--- a/backend/tests/deamons/test_dispatcher.py
+++ b/backend/tests/deamons/test_dispatcher.py
@@ -92,6 +92,8 @@ class TestDispatcher(object):
timeout=1800,
destdir=self.tmp_dir_path,
results_baseurl="/tmp",
+
+ consecutive_failure_threshold=10,
)
self.job = BuildJob(self.task, self.opts)
diff --git a/frontend/coprs_frontend/tests/test_logic/test_builds_logic.py b/frontend/coprs_frontend/tests/test_logic/test_builds_logic.py
index 1eeb17a..c7dc07e 100644
--- a/frontend/coprs_frontend/tests/test_logic/test_builds_logic.py
+++ b/frontend/coprs_frontend/tests/test_logic/test_builds_logic.py
@@ -74,7 +74,7 @@ class TestBuildsLogic(CoprsTestCase):
def test_build_queue_4(self, f_users, f_coprs, f_mock_chroots, f_builds, f_db):
for build_chroots in [self.b1_bc, self.b2_bc]:
for build_chroot in build_chroots:
- build_chroot.status = 3
+ build_chroot.status = 3 # running
for build_chroots in [self.b3_bc, self.b4_bc]:
for build_chroot in build_chroots:
build_chroot.status = 0
@@ -85,4 +85,6 @@ class TestBuildsLogic(CoprsTestCase):
self.db.session.commit()
data = BuildsLogic.get_build_task_queue().all()
- assert len(data) == 1 #
+
+ assert len(data) == 1
+ assert data[0] == self.b1_bc[0]