From 288276144724476f0ed6b716fea8b512f46b68e6 Mon Sep 17 00:00:00 2001
From: Fredrik Thulin
Date: Mon, 26 May 2014 12:25:09 +0200
Subject: sync from eduid-ops

---
 global/overlay/usr/local/bin/scriptherder | 323 +++++++++++++++++++++++------
 1 file changed, 253 insertions(+), 70 deletions(-)

(limited to 'global/overlay/usr')

diff --git a/global/overlay/usr/local/bin/scriptherder b/global/overlay/usr/local/bin/scriptherder
index c11383a..1e00ec0 100755
--- a/global/overlay/usr/local/bin/scriptherder
+++ b/global/overlay/usr/local/bin/scriptherder
@@ -68,6 +68,35 @@ exit_status = {'OK': 0,
                }
 
 
+class ScriptHerderError(Exception):
+    """
+    Base exception class for scriptherder.
+    """
+
+    def __init__(self, reason, filename):
+        """
+        @param reason: Short description of what went wrong
+        @param filename: Name of the file that was being processed
+        """
+        # Pass the arguments on to Exception, so that str() and repr() of
+        # the exception (as used when logging it) are informative.
+        super(ScriptHerderError, self).__init__(reason, filename)
+        self.reason = reason
+        self.filename = filename
+
+
+class JobLoadError(ScriptHerderError):
+    """
+    Raised when loading a job file fails.
+    """
+
+
+class CheckLoadError(ScriptHerderError):
+    """
+    Raised when loading a check file fails.
+    """
+
+
 class Job(object):
     """
     Representation of an execution of a job.
@@ -109,6 +138,24 @@ class Job(object):
             exit = self.exit_status,
         )
 
+    def status_summary(self):
+        """
+        Return short string with status of job.
+
+        E.g. 'name[exit=0,age=19h]', or 'name[not_running]' if the job
+        lacks a start or end time.
+
+        @rtype: string
+        """
+        if self._end_time is None or self._start_time is None:
+            return '{name}[not_running]'.format(name = self.name)
+        age = _time_to_str(time.time() - self._start_time)
+        return '{name}[exit={exit_status},age={age}]'.format(
+            name = self.name,
+            exit_status = self._exit_status,
+            age = age,
+        )
+
     @property
     def name(self):
         """
@@ -167,11 +214,10 @@ class Job(object):
 
         @rtype: string
         """
+        if self._end_time is None or self._start_time is None:
+            return 'NaN'
         duration = self._end_time - self._start_time
-        if duration < 1:
-            # milliseconds
-            return '{:0.3f}ms'.format(duration * 1000)
-        return '{:0.3f}s'.format(duration)
+        return _time_to_str(duration)
 
     @property
     def exit_status(self):
@@ -326,7 +372,7 @@ class Job(object):
             #self._output_size = data.get('output_size') # currently not used in scriptherder
             self._filename = filename
         else:
-            raise AssertionError('Unknown version in file {!r}: {!r}'.format(filename, data.get('version')))
+            raise JobLoadError('Unknown version: {!r}'.format(data.get('version')), filename=filename)
         return self
 
 
@@ -355,7 +401,7 @@ class Check(object):
         self.logger = logger
         self.config = ConfigParser.ConfigParser(_check_defaults)
         if not self.config.read([filename]):
-            raise ValueError("Failed loading config file {!r}".format(filename))
+            raise ScriptHerderError('Failed loading config file', filename)
         _section = 'check'
         self._ok_criteria = [x.strip() for x in self.config.get(_section, 'ok').split(',')]
         self._warning_criteria = [x.strip() for x in self.config.get(_section, 'warning').split(',')]
@@ -422,6 +468,133 @@ class Check(object):
         return False
 
 
+class CheckStatus(object):
+    """
+    Aggregated status of job invocations for --mode check.
+
+    Attributes:
+
+        checks_ok: List of checks in OK state ([Job()]).
+        checks_warning: List of checks in WARNING state ([Job()]).
+        checks_critical: List of checks in CRITICAL state ([Job()]).
+    """
+
+    def __init__(self, args, logger):
+        """
+        @param args: Parsed command line arguments
+        @param logger: logging logger
+        """
+
+        self.checks_ok = []
+        self.checks_warning = []
+        self.checks_critical = []
+
+        self._jobs = _get_job_results(args, logger)
+        # group the jobs by their name
+        _by_name = {}
+        for this in self._jobs:
+            if this.name not in _by_name:
+                _by_name[this.name] = []
+            _by_name[this.name].append(this)
+        self._jobs_by_name = _by_name
+
+        self._job_count = len(_by_name)
+
+        self._check_running_jobs(args, logger)
+        if not args.cmd:
+            self._check_not_running(args, logger)
+
+    def _check_running_jobs(self, args, logger):
+        """
+        Look for job execution entries (parsed into Job() instances), group them
+        per check name and determine the status. For each group, append status
+        to one of the three aggregate status lists of this object (checks_ok,
+        checks_warning or checks_critical).
+
+        @param args: Parsed command line arguments
+        @param logger: logging logger
+
+        @raise CheckLoadError: if the check definition for a job can't be loaded
+        """
+        # determine total check status based on all logged invocations of this job
+        for (name, jobs) in self._jobs_by_name.items():
+            # Load the evaluation criteria for this job
+            check_filename = os.path.join(args.checkdir, name + '.ini')
+            logger.debug("Loading check definition from {!r}".format(check_filename))
+            try:
+                check = Check(check_filename, logger)
+            except ScriptHerderError as exc:
+                logger.warning("Failed loading check: {!r}".format(exc), exc_info=True)
+                raise CheckLoadError('Failed loading check', filename = check_filename)
+
+            # Sort jobs, oldest first
+            jobs = sorted(jobs, key=lambda x: x.start_time)
+            logger.debug("Checking {!r}: {!r}".format(name, jobs))
+
+            jobs_ok = []
+            jobs_warning = []
+            jobs_critical = []
+            for job in jobs:
+                if check.job_is_ok(job):
+                    jobs_ok.append(job)
+                elif check.job_is_warning(job):
+                    jobs_warning.append(job)
+                else:
+                    jobs_critical.append(job)
+
+            logger.debug("Raw status OK : {!r}".format(jobs_ok))
+            logger.debug("Raw status WARN : {!r}".format(jobs_warning))
+            logger.debug("Raw status CRITICAL: {!r}".format(jobs_critical))
+
+            # add most recent job status to the totals
+            if jobs_ok:
+                self.checks_ok.append(jobs_ok[-1])
+            elif jobs_warning:
+                self.checks_warning.append(jobs_warning[-1])
+            else:
+                self.checks_critical.append(jobs_critical[-1])
+
+    def _check_not_running(self, args, logger):
+        """
+        Look for check definitions ('.ini' files in args.checkdir) that have no
+        logged job executions at all. Every such check is counted as a job and
+        added to checks_critical, represented by a Job() with only a name.
+
+        @param args: Parsed command line arguments
+        @param logger: logging logger
+
+        @raise CheckLoadError: if a check definition can't be loaded
+        """
+        files = [f for f in os.listdir(args.checkdir) if os.path.isfile(os.path.join(args.checkdir, f))]
+        for this in files:
+            if not this.endswith('.ini'):
+                continue
+            filename = os.path.join(args.checkdir, this)
+            logger.debug("Loading check definition from {!r}".format(filename))
+            try:
+                # validate check loads
+                Check(filename, logger)
+            except ScriptHerderError as exc:
+                logger.warning("Failed loading check: {!r}".format(exc), exc_info=True)
+                raise CheckLoadError('Failed loading check', filename = filename)
+            name = this[:-4]  # remove the '.ini' suffix
+            if name not in self._jobs_by_name:
+                logger.debug('Check {!r} (filename {!r}) not found in jobs'.format(name, filename))
+                job = Job(name=name)
+                self.checks_critical.append(job)
+                self._job_count += 1
+            else:
+                logger.debug('Check {!r} has {!r} logged results'.format(name, len(self._jobs_by_name[name])))
+
+    def num_jobs(self):
+        """
+        Return number of jobs processed. This is number of different jobs running + not running.
+
+        @rtype: int
+        """
+        return self._job_count
+
+
 def job_from_file(filename):
     """
     Recreate Job() instance from saved file.
@@ -488,6 +661,7 @@ def parse_args(defaults):
     )
 
     args = parser.parse_args()
+
     return args
 
 
@@ -537,88 +711,67 @@ def mode_check(args, logger):
     @param args: Parsed command line arguments
     @param logger: logging logger
     """
-    jobs = _get_job_results(args, logger)
-    # group the jobs by their name
-    by_name = {}
-    for this in jobs:
-        if this.name not in by_name:
-            by_name[this.name] = []
-        by_name[this.name].append(this)
-
-    total_ok = []
-    total_warning = []
-    total_critical = []
-
-    # determine total check status based on all logged invocations of this job
-    for (name, jobs) in by_name.items():
-        # Sort jobs, oldest first
-        jobs = sorted(jobs, key=lambda x: x.start_time)
-        # Load the evaluation criterias for this job
-        check_filename = os.path.join(args.checkdir, name + '.ini')
-        logger.debug("Loading check definition from {!r}".format(check_filename))
-        check = Check(check_filename, logger)
-        logger.debug("Checking {!r}: {!r}".format(name, jobs))
-
-        jobs_ok = []
-        jobs_warning = []
-        jobs_critical = []
-        for job in jobs:
-            if check.job_is_ok(job):
-                jobs_ok.append(job)
-            elif check.job_is_warning(job):
-                jobs_warning.append(job)
-            else:
-                jobs_critical.append(job)
 
-        logger.debug("Raw status OK : {!r}".format(jobs_ok))
-        logger.debug("Raw status WARN : {!r}".format(jobs_warning))
-        logger.debug("Raw status CRITICAL: {!r}".format(jobs_critical))
-        if jobs_ok:
-            total_ok.append(jobs_ok[-1])
-        elif jobs_warning:
-            total_warning.append(jobs_warning[-1])
-        else:
-            total_critical.append(jobs_critical[-1])
+    try:
+        status = CheckStatus(args, logger)
+    except CheckLoadError as exc:
+        print("UNKNOWN: Failed loading check from file '{!s}' ({!s})".format(exc.filename, exc.reason))
+        return exit_status['UNKNOWN']
 
     if args.cmd:
         # Single job check requested, output detailed information
-        if total_ok:
-            print('OK: {!s}'.format(total_ok[-1]))
+        if status.checks_ok:
+            print('OK: {!s}'.format(status.checks_ok[-1]))
             return exit_status['OK']
-        if total_warning:
-            print('WARNING: {!s}'.format(total_warning[-1]))
+        if status.checks_warning:
+            print('WARNING: {!s}'.format(status.checks_warning[-1]))
             return exit_status['WARNING']
-        if total_critical:
-            print('CRITICAL: {!s}'.format(total_critical[-1]))
+        if status.checks_critical:
+            print('CRITICAL: {!s}'.format(status.checks_critical[-1]))
             return exit_status['CRITICAL']
         print "UNKNOWN - no jobs found for {!r}?".format(args.cmd)
         return exit_status['UNKNOWN']
 
-    # When not looking at multiple jobs at once, logic gets a bit reversed - if ANY
+    # When looking at multiple jobs at once, logic gets a bit reversed - if ANY
     # job invocation is CRITICAL/WARNING, the aggregate message given to
     # Nagios will have to be a failure.
-    if total_critical:
-        print("CRITICAL: {num} job(s) in this state: {names}".format(
-            num = len(total_critical),
-            names = ', '.join([str(x.name) for x in total_critical]),
-        ))
+    if status.checks_critical:
+        print('CRITICAL: {!s}'.format(
+            _status_summary(status.num_jobs(), status.checks_critical)))
         return exit_status['CRITICAL']
-    if total_warning:
-        print("WARNING: {num} job(s) in this state: {names}".format(
-            num = len(total_warning),
-            names = ', '.join([str(x.name) for x in total_warning]),
-        ))
+    if status.checks_warning:
+        print('WARNING: {!s}'.format(
+            _status_summary(status.num_jobs(), status.checks_warning)))
         return exit_status['WARNING']
-    if total_ok:
-        print("OK: {num} job(s) in this state: {names}".format(
-            num = len(total_ok),
-            names = ', '.join([x.name for x in total_ok]),
-        ))
+    if status.checks_ok:
+        print('OK: {!s}'.format(
+            _status_summary(status.num_jobs(), status.checks_ok)))
         return exit_status['OK']
     print "UNKNOWN - no jobs found?"
     return exit_status['UNKNOWN']
 
 
+def _status_summary(num_jobs, failed):
+    """
+    String format routine used in output of checks status.
+
+    @param num_jobs: Total number of jobs processed
+    @param failed: Job() instances in the state being reported
+
+    @rtype: string
+    """
+    fmt = '{jobs}/{num_jobs} jobs in this state: {summary}'
+    if len(failed) == 1:
+        # singular form when exactly one job is in this state
+        fmt = '{jobs}/{num_jobs} job in this state: {summary}'
+
+    summary = ', '.join(sorted([str(x.status_summary()) for x in failed]))
+    return fmt.format(jobs = len(failed),
+                      num_jobs = num_jobs,
+                      summary = summary,
+                      )
+
+
 def _get_job_results(args, logger):
     """
     Load all jobs matching any specified name on the command line.
@@ -634,7 +787,12 @@ def _get_job_results(args, logger):
         if not this.endswith('.json'):
             continue
         filename = os.path.join(args.datadir, this)
-        job = job_from_file(filename)
+        try:
+            job = job_from_file(filename)
+        except JobLoadError as exc:
+            logger.warning("Failed loading job file '{!s}' ({!s})".format(exc.filename, exc.reason))
+            # nothing was loaded from this file, move on to the next one
+            continue
         if args.cmd:
             if args.cmd[0] != job.name:
                 logger.debug("Skipping '{!s}' not matching '{!s}' (file {!s})".format(job.name, args.cmd[0], filename))
@@ -666,6 +824,27 @@ def _parse_time_value(value):
     return num
 
 
+def _time_to_str(value):
+    """
+    Format number of seconds to short readable string.
+
+    @type value: float or int
+
+    @rtype: string
+    """
+    if value < 1:
+        # milliseconds
+        return '{:0.3f}ms'.format(value * 1000)
+    if value < 60:
+        return '{!s}s'.format(int(value))
+    if value < 3600:
+        return '{!s}m'.format(int(value / 60))
+    if value < 86400:
+        return '{!s}h'.format(int(value / 3600))
+    days = int(value / 86400)
+    return '{!s}d{!s}h'.format(days, int((value % 86400) / 3600))
+
+
 def main(myname = 'scriptherder', args = None, logger = None, defaults=_defaults):
     """
     Main entry point for either wrapping a script, or checking the status of it.
@@ -699,6 +878,10 @@ def main(myname = 'scriptherder', args = None, logger = None, defaults=_defaults
         syslog_h.setFormatter(formatter)
         logger.addHandler(syslog_h)
 
+    if args.name and args.mode != 'wrap':
+        logger.error('Argument --name only applicable for --mode wrap')
+        return False
+
     if args.mode == 'wrap':
         return mode_wrap(args, logger)
     elif args.mode == 'ls':
-- 
cgit v1.1