From 288276144724476f0ed6b716fea8b512f46b68e6 Mon Sep 17 00:00:00 2001
From: Fredrik Thulin <fredrik@thulin.net>
Date: Mon, 26 May 2014 12:25:09 +0200
Subject: sync from eduid-ops

---
 global/overlay/usr/local/bin/scriptherder | 302 +++++++++++++++++++++++-------
 1 file changed, 232 insertions(+), 70 deletions(-)

(limited to 'global/overlay/usr')

diff --git a/global/overlay/usr/local/bin/scriptherder b/global/overlay/usr/local/bin/scriptherder
index c11383a..1e00ec0 100755
--- a/global/overlay/usr/local/bin/scriptherder
+++ b/global/overlay/usr/local/bin/scriptherder
@@ -68,6 +68,28 @@ exit_status = {'OK': 0,
                }
 
 
+class ScriptHerderError(Exception):
+    """
+    Base exception class for scriptherder; stores `reason` and `filename`.
+    """
+
+    def __init__(self, reason, filename):
+        self.reason = reason      # short description of what failed
+        self.filename = filename  # file that was being loaded when it failed
+
+
+class JobLoadError(ScriptHerderError):
+    """
+    Raised when loading a job file fails (e.g. unknown version in the file).
+    """
+
+
+class CheckLoadError(ScriptHerderError):
+    """
+    Raised when loading a check file fails; mode_check reports this as UNKNOWN.
+    """
+
+
 class Job(object):
     """
     Representation of an execution of a job.
@@ -109,6 +131,21 @@ class Job(object):
             exit = self.exit_status,
         )
 
+    def status_summary(self):
+        """
+        Return short string with status of job, e.g. 'name[exit=0,age=19h]'
+        (age is time since the job started), or 'name[not_running]' when no
+        start or end time is known for this job.
+        """
+        if self._end_time is None or self._start_time is None:
+            return '{name}[not_running]'.format(name = self.name)
+        age = _time_to_str(time.time() - self._start_time)
+        return '{name}[exit={exit_status},age={age}]'.format(
+            name = self.name,
+            exit_status = self._exit_status,
+            age = age,
+            )
+
     @property
     def name(self):
         """
@@ -167,11 +204,10 @@ class Job(object):
 
         @rtype: string
         """
+        if self._end_time is None or self._start_time is None:
+            return 'NaN'
         duration = self._end_time - self._start_time
-        if duration < 1:
-            # milliseconds
-            return '{:0.3f}ms'.format(duration * 1000)
-        return '{:0.3f}s'.format(duration)
+        return _time_to_str(duration)
 
     @property
     def exit_status(self):
@@ -326,7 +362,7 @@ class Job(object):
             #self._output_size = data.get('output_size')  # currently not used in scriptherder
             self._filename = filename
         else:
-            raise AssertionError('Unknown version in file {!r}: {!r}'.format(filename, data.get('version')))
+            raise JobLoadError('Unknown version: {!r}'.format(data.get('version')), filename=filename)
         return self
 
 
@@ -355,7 +391,7 @@ class Check(object):
         self.logger = logger
         self.config = ConfigParser.ConfigParser(_check_defaults)
         if not self.config.read([filename]):
-            raise ValueError("Failed loading config file {!r}".format(filename))
+            raise ScriptHerderError('Failed loading config file', filename)
         _section = 'check'
         self._ok_criteria = [x.strip() for x in self.config.get(_section, 'ok').split(',')]
         self._warning_criteria = [x.strip() for x in self.config.get(_section, 'warning').split(',')]
@@ -422,6 +458,130 @@ class Check(object):
         return False
 
 
+class CheckStatus(object):
+    """
+    Aggregated status of job invocations for --mode check.
+
+    Attributes:
+
+      checks_ok: List of checks in OK state ([Job()]).
+      checks_warning: List of checks in WARNING state ([Job()]).
+      checks_critical: List of checks in CRITICAL state ([Job()]).
+    """
+
+    def __init__(self, args, logger):
+        """
+        Load logged job results and sort them into the three status lists.
+        @param args: Parsed command line arguments
+        @param logger: logging logger
+        """
+        self.checks_ok = []
+        self.checks_warning = []
+        self.checks_critical = []
+
+        self._jobs = _get_job_results(args, logger)
+        # group the jobs by their name
+        _by_name = {}
+        for this in self._jobs:
+            if this.name not in _by_name:
+                _by_name[this.name] = []
+            _by_name[this.name].append(this)
+        self._jobs_by_name = _by_name
+
+        self._job_count = len(_by_name)
+        self._check_running_jobs(args, logger)
+        # only look for checks that never ran when no single job was requested
+        if not args.cmd:
+            self._check_not_running(args, logger)
+
+    def _check_running_jobs(self, args, logger):
+        """
+        Evaluate the logged executions of each job against the job's check
+        definition, and append one Job() per job name to checks_ok,
+        checks_warning or checks_critical.
+
+        @param args: Parsed command line arguments
+        @param logger: logging logger
+        @raise CheckLoadError: if the check definition of a job fails to load
+        """
+        # determine total check status based on all logged invocations of this job
+        for (name, jobs) in self._jobs_by_name.items():
+            # Load the evaluation criteria for this job
+            check_filename = os.path.join(args.checkdir, name + '.ini')
+            logger.debug("Loading check definition from {!r}".format(check_filename))
+            try:
+                check = Check(check_filename, logger)
+            except ScriptHerderError as exc:
+                logger.warning("Failed loading check: {!r}".format(exc), exc_info=True)
+                raise CheckLoadError('Failed loading check', filename = check_filename)
+
+            # Sort jobs, oldest first
+            jobs = sorted(jobs, key=lambda x: x.start_time)
+            logger.debug("Checking {!r}: {!r}".format(name, jobs))
+
+            jobs_ok = []
+            jobs_warning = []
+            jobs_critical = []
+            for job in jobs:
+                if check.job_is_ok(job):
+                    jobs_ok.append(job)
+                elif check.job_is_warning(job):
+                    jobs_warning.append(job)
+                else:
+                    jobs_critical.append(job)
+
+            logger.debug("Raw status OK      : {!r}".format(jobs_ok))
+            logger.debug("Raw status WARN    : {!r}".format(jobs_warning))
+            logger.debug("Raw status CRITICAL: {!r}".format(jobs_critical))
+
+            # add most recent job status to the totals
+            if jobs_ok:
+                self.checks_ok.append(jobs_ok[-1])
+            elif jobs_warning:
+                self.checks_warning.append(jobs_warning[-1])
+            else:
+                self.checks_critical.append(jobs_critical[-1])
+
+    def _check_not_running(self, args, logger):
+        """
+        Load every check definition ('.ini' file) in args.checkdir, and add a
+        placeholder Job() to checks_critical for each check that has no logged
+        executions at all (also counting it in the job total).
+
+        @param args: Parsed command line arguments
+        @param logger: logging logger
+        @raise CheckLoadError: if a check definition fails to load
+        """
+        files = [f for f in os.listdir(args.checkdir) if os.path.isfile(os.path.join(args.checkdir, f))]
+        for this in files:
+            if not this.endswith('.ini'):
+                continue
+            filename = os.path.join(args.checkdir, this)
+            logger.debug("Loading check definition from {!r}".format(filename))
+            try:
+                # validate check loads
+                Check(filename, logger)
+            except ScriptHerderError as exc:
+                logger.warning("Failed loading check: {!r}".format(exc), exc_info=True)
+                raise CheckLoadError('Failed loading check', filename = filename)
+            name = this[:-4]  # remove the '.ini' suffix
+            if name not in self._jobs_by_name:
+                logger.debug('Check {!r} (filename {!r}) not found in jobs'.format(name, filename))
+                job = Job(name=name)
+                self.checks_critical.append(job)
+                self._job_count += 1
+            else:
+                logger.debug('Check {!r} has {!r} logged results'.format(name, len(self._jobs_by_name[name])))
+
+    def num_jobs(self):
+        """
+        Return number of jobs processed. This is number of different jobs running + not running.
+
+        @rtype: int
+        """
+        return self._job_count
+
+
 def job_from_file(filename):
     """
     Recreate Job() instance from saved file.
@@ -488,6 +648,7 @@ def parse_args(defaults):
                         )
 
     args = parser.parse_args()
+
     return args
 
 
@@ -537,88 +698,61 @@ def mode_check(args, logger):
     @param args: Parsed command line arguments
     @param logger: logging logger
     """
-    jobs = _get_job_results(args, logger)
 
-    # group the jobs by their name
-    by_name = {}
-    for this in jobs:
-        if this.name not in by_name:
-            by_name[this.name] = []
-        by_name[this.name].append(this)
-
-    total_ok = []
-    total_warning = []
-    total_critical = []
-
-    # determine total check status based on all logged invocations of this job
-    for (name, jobs) in by_name.items():
-        # Sort jobs, oldest first
-        jobs = sorted(jobs, key=lambda x: x.start_time)
-        # Load the evaluation criterias for this job
-        check_filename = os.path.join(args.checkdir, name + '.ini')
-        logger.debug("Loading check definition from {!r}".format(check_filename))
-        check = Check(check_filename, logger)
-        logger.debug("Checking {!r}: {!r}".format(name, jobs))
-
-        jobs_ok = []
-        jobs_warning = []
-        jobs_critical = []
-        for job in jobs:
-            if check.job_is_ok(job):
-                jobs_ok.append(job)
-            elif check.job_is_warning(job):
-                jobs_warning.append(job)
-            else:
-                jobs_critical.append(job)
-        logger.debug("Raw status OK      : {!r}".format(jobs_ok))
-        logger.debug("Raw status WARN    : {!r}".format(jobs_warning))
-        logger.debug("Raw status CRITICAL: {!r}".format(jobs_critical))
-        if jobs_ok:
-            total_ok.append(jobs_ok[-1])
-        elif jobs_warning:
-            total_warning.append(jobs_warning[-1])
-        else:
-            total_critical.append(jobs_critical[-1])
+    try:
+        status = CheckStatus(args, logger)
+    except CheckLoadError as exc:
+        print("UNKNOWN: Failed loading check from file '{!s}' ({!s})".format(exc.filename, exc.reason))
+        return exit_status['UNKNOWN']
 
     if args.cmd:
         # Single job check requested, output detailed information
-        if total_ok:
-            print('OK: {!s}'.format(total_ok[-1]))
+        if status.checks_ok:
+            print('OK: {!s}'.format(status.checks_ok[-1]))
             return exit_status['OK']
-        if total_warning:
-            print('WARNING: {!s}'.format(total_warning[-1]))
+        if status.checks_warning:
+            print('WARNING: {!s}'.format(status.checks_warning[-1]))
             return exit_status['WARNING']
-        if total_critical:
-            print('CRITICAL: {!s}'.format(total_critical[-1]))
+        if status.checks_critical:
+            print('CRITICAL: {!s}'.format(status.checks_critical[-1]))
             return exit_status['CRITICAL']
         print "UNKNOWN - no jobs found for {!r}?".format(args.cmd)
         return exit_status['UNKNOWN']
 
-    # When not looking at multiple jobs at once, logic gets a bit reversed - if ANY
+    # When looking at multiple jobs at once, logic gets a bit reversed - if ANY
     # job invocation is CRITICAL/WARNING, the aggregate message given to
     # Nagios will have to be a failure.
-    if total_critical:
-        print("CRITICAL: {num} job(s) in this state: {names}".format(
-            num = len(total_critical),
-            names = ', '.join([str(x.name) for x in total_critical]),
-            ))
+    if status.checks_critical:
+        print('CRITICAL: {!s}'.format(
+            _status_summary(status.num_jobs(), status.checks_critical)))
         return exit_status['CRITICAL']
-    if total_warning:
-        print("WARNING: {num} job(s) in this state: {names}".format(
-            num = len(total_warning),
-            names = ', '.join([str(x.name) for x in total_warning]),
-            ))
+    if status.checks_warning:
+        print('WARNING: {!s}'.format(
+            _status_summary(status.num_jobs(), status.checks_warning)))
         return exit_status['WARNING']
-    if total_ok:
-        print("OK: {num} job(s) in this state: {names}".format(
-            num = len(total_ok),
-            names = ', '.join([x.name for x in total_ok]),
-            ))
+    if status.checks_ok:
+        print('OK: {!s}'.format(
+            _status_summary(status.num_jobs(), status.checks_ok)))
         return exit_status['OK']
     print "UNKNOWN - no jobs found?"
     return exit_status['UNKNOWN']
 
 
+def _status_summary(num_jobs, failed):
+    """
+    Format 'N/TOTAL job(s) in this state: name[...], name[...]' for check output.
+
+    @param num_jobs: Total number of jobs examined
+    @param failed: Jobs in the state being reported ([Job()])
+    @rtype: string
+    """
+    # pluralise on the number of jobs actually listed, not on the total
+    noun = 'job' if len(failed) == 1 else 'jobs'
+    summary = ', '.join(sorted(str(x.status_summary()) for x in failed))
+    return '{jobs}/{num_jobs} {noun} in this state: {summary}'.format(
+        jobs = len(failed), num_jobs = num_jobs, noun = noun, summary = summary)
+
+
 def _get_job_results(args, logger):
     """
     Load all jobs matching any specified name on the command line.
@@ -634,7 +768,11 @@ def _get_job_results(args, logger):
         if not this.endswith('.json'):
             continue
         filename = os.path.join(args.datadir, this)
-        job = job_from_file(filename)
+        try:
+            job = job_from_file(filename)
+        except JobLoadError as exc:
+            logger.warning("Failed loading job file '{!s}' ({!s})".format(exc.filename, exc.reason))
+            continue
         if args.cmd:
             if args.cmd[0] != job.name:
                 logger.debug("Skipping '{!s}' not matching '{!s}' (file {!s})".format(job.name, args.cmd[0], filename))
@@ -666,6 +803,27 @@ def _parse_time_value(value):
         return num
 
 
+def _time_to_str(value):
+    """
+    Format number of seconds to short readable string
+    ('12.000ms', '42s', '5m', '19h' or '3d4h'; fractions are truncated).
+
+    @type value: float or int
+    @rtype: string
+    """
+    if value < 1:
+        # milliseconds
+        return '{:0.3f}ms'.format(value * 1000)
+    if value < 60:
+        return '{!s}s'.format(int(value))
+    if value < 3600:
+        return '{!s}m'.format(int(value / 60))
+    if value < 86400:
+        return '{!s}h'.format(int(value / 3600))
+    days = int(value / 86400)
+    return '{!s}d{!s}h'.format(days, int((value % 86400) / 3600))
+
+
 def main(myname = 'scriptherder', args = None, logger = None, defaults=_defaults):
     """
     Main entry point for either wrapping a script, or checking the status of it.
@@ -699,6 +857,10 @@ def main(myname = 'scriptherder', args = None, logger = None, defaults=_defaults
         syslog_h.setFormatter(formatter)
         logger.addHandler(syslog_h)
 
+    if args.name and args.mode != 'wrap':
+        logger.error('Argument --name only applicable for --mode wrap')
+        return False
+
     if args.mode == 'wrap':
         return mode_wrap(args, logger)
     elif args.mode == 'ls':
-- 
cgit v1.1