#!/usr/bin/env python3
"""
Backuply Health Check Plugin for Icinga

Description:
    - Checks if backup servers are configured
    - Verifies backup jobs exist
    - Monitors recent backup job executions via master process data
    - Reports per-job success/failed/warnings/running counts with perfdata
    - Lists failed user accounts in output

Usage:
    ./check_backuply.py [--chours 24] [--whours 12]

Exit Codes:
    0 OK
    1 WARNING
    2 CRITICAL
    3 UNKNOWN
"""

import os
import sys
import shutil
import argparse
import time
import sqlite3
import json
import subprocess

def check_backup_servers():
    """Verify that at least one backup server is configured.

    Reads the Backuply server config JSON and returns a Nagios exit code:
    0 when one or more servers are present, 2 when the config is missing,
    unreadable, or empty.
    """
    servers_file = "/var/backuply/conf/backup_servers.json"

    if not os.path.exists(servers_file):
        print(f"CRITICAL: Backup servers config not found at {servers_file}")
        return 2

    try:
        with open(servers_file, 'r') as fh:
            servers = json.load(fh)
    except (json.JSONDecodeError, IOError) as err:
        print(f"CRITICAL: Failed to read backup servers config: {err}")
        return 2

    if servers:
        print(f"OK: {len(servers)} backup server(s) configured")
        return 0

    print("CRITICAL: No backup servers configured")
    return 2

def check_backup_jobs():
    """Verify that backup jobs exist and at least one is actively scheduled.

    Returns a Nagios exit code: 0 when an active job exists, 1 when jobs
    exist but none are active, 2 when the config is missing, unreadable,
    or empty.
    """
    jobs_file = "/var/backuply/conf/backup.json"

    if not os.path.exists(jobs_file):
        print(f"CRITICAL: Backup jobs config not found at {jobs_file}")
        return 2

    try:
        with open(jobs_file, 'r') as fh:
            jobs = json.load(fh)
    except (json.JSONDecodeError, IOError) as err:
        print(f"CRITICAL: Failed to read backup jobs config: {err}")
        return 2

    if not jobs:
        print("CRITICAL: No backup jobs configured")
        return 2

    # A job is "active" when its schedule_status flag equals 1.
    active = sum(1 for job in jobs if job.get('schedule_status') == 1)
    if active == 0:
        print("WARNING: No active backup jobs found")
        return 1

    print(f"OK: {active} active backup job(s) configured")
    return 0

def parse_job_stats(data_str):
    """Parse master process data JSON into job-level stats.

    Status codes in schedule.users per-user dict:
        1  = success
        0  = failed
       -100 = completed with warnings
       -1 / [] (empty list) = running or queued

    Returns a dict with job name, per-status counts, the derived running
    count, and the list of failed usernames — or None when *data_str* is
    not valid JSON or does not decode to a JSON object.
    """
    try:
        data = json.loads(data_str)
    except (json.JSONDecodeError, TypeError):
        return None

    # DB rows are untrusted: the payload may decode to a list/str/number,
    # and nested keys may be missing or of the wrong type. Guard each
    # level instead of letting .get()/.items() raise AttributeError.
    if not isinstance(data, dict):
        return None

    schedule = data.get('schedule', {})
    if not isinstance(schedule, dict):
        schedule = {}
    users = schedule.get('users', {})
    if not isinstance(users, dict):
        users = {}
    total_proc = schedule.get('total_proc', 0)

    failed_users = []
    success = failed = warnings = 0

    for username, u in users.items():
        if not isinstance(u, dict):
            # e.g. status recorded as [] for queued users — counted as running
            continue
        st = u.get('status')
        if st == 1:
            success += 1
        elif st == 0:
            failed += 1
            failed_users.append(username)
        elif st == -100:
            warnings += 1

    # Anything not yet reported as success/failed/warning is still
    # running (or queued), per the scheduler's total process count.
    running = total_proc - success - failed - warnings

    return {
        'job_name': schedule.get('schedule_name', 'Unknown'),
        'total_proc': total_proc,
        'success': success,
        'failed': failed,
        'warnings': warnings,
        'running': running,
        'failed_users': failed_users,
    }

def check_recent_tasks():
    """Check recent backup job executions via master process data in SQLite.

    Reads --chours/--whours thresholds from argv, queries
    /var/backuply/db/tasks.db for 'admin_backup_master_process' rows in
    the critical window, parses per-job stats, and reports the most
    severe condition with perfdata. Returns a Nagios exit code (0/1/2).
    """
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("--chours", type=int, default=24, help="Critical hours threshold (default: 24)")
    parser.add_argument("--whours", type=int, default=12, help="Warning hours threshold (default: 12)")
    args, _ = parser.parse_known_args()

    def get_current_users_count():
        """Best-effort cPanel account count via whmapi1; None when unavailable."""
        try:
            if os.path.exists("/usr/local/cpanel/cpanel"):
                # Fixed argv list with shell=False: no shell parsing of the
                # command (the original used shell=True on a string).
                result = subprocess.run(
                    ["whmapi1", "--output=jsonpretty", "get_current_users_count"],
                    stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                )
                data = json.loads(result.stdout)
                return data['data']['users']
        except Exception:
            # Deliberate best-effort: any failure just means "count unknown".
            pass
        return None

    users_count = get_current_users_count()
    critical_hours = args.chours
    warning_hours = args.whours

    db_path = "/var/backuply/db/tasks.db"

    if not os.path.exists(db_path):
        print(f"WARNING: Database not found at {db_path}")
        return 1

    now = int(time.time())
    critical_threshold = now - (critical_hours * 3600)
    warning_threshold = now - (warning_hours * 3600)

    try:
        conn = sqlite3.connect(db_path)
    except sqlite3.Error as e:
        print(f"CRITICAL: Failed to query database: {e}")
        return 2

    # try/finally guarantees the connection is closed even when a query
    # raises (the original leaked it on the sqlite3.Error path).
    try:
        cursor = conn.cursor()

        try:
            with open('/proc/uptime', 'r') as f:
                uptime_seconds = float(f.read().split()[0])
        except (IOError, ValueError):
            # Unknown uptime: treat as "not a new server" below.
            uptime_seconds = float('inf')

        cursor.execute("""
            SELECT 1
            FROM tasks
            WHERE action = 'admin_backup_master_process'
            LIMIT 1
        """)
        has_backup_history = cursor.fetchone() is not None

        # Grace period: a freshly provisioned box (<24h uptime) with no
        # backup history yet should not alert.
        if uptime_seconds < (24 * 3600) and not has_backup_history:
            uptime_hours = uptime_seconds / 3600
            print(f"OK: New server - uptime {uptime_hours:.1f}h and no backup history")
            return 0

        cursor.execute("""
            SELECT actid, data, created
            FROM tasks
            WHERE action = 'admin_backup_master_process' AND created > ?
            ORDER BY created DESC
        """, (critical_threshold,))

        master_tasks = cursor.fetchall()

    except sqlite3.Error as e:
        print(f"CRITICAL: Failed to query database: {e}")
        return 2
    finally:
        conn.close()

    # No executions in the critical window is only critical when the box
    # actually has user accounts that should be getting backed up.
    if not master_tasks and users_count is not None and users_count > 0:
        print(f"CRITICAL: No backup jobs executed in last {critical_hours}h but {users_count} user(s) exist")
        return 2

    if not master_tasks:
        print(f"OK: No backup jobs in last {critical_hours}h")
        return 0

    jobs = []
    for actid, data_str, created in master_tasks:
        stats = parse_job_stats(data_str)
        if stats is None:
            continue
        stats['actid'] = actid
        stats['created'] = created
        jobs.append(stats)

    if not jobs:
        print(f"OK: {len(master_tasks)} backup job(s) in last {critical_hours}h (no parseable data)")
        return 0

    # Severity buckets: failures in the warning window are CRITICAL,
    # older failures (still inside the critical window) are WARNING,
    # warning-only jobs in the warning window are WARNING.
    critical_failed = [j for j in jobs if j['failed'] > 0 and j['created'] > warning_threshold]
    warning_failed = [j for j in jobs if j['failed'] > 0 and warning_threshold >= j['created'] > critical_threshold]
    warning_only = [j for j in jobs if j['warnings'] > 0 and j['failed'] == 0 and j['created'] > warning_threshold]

    # Perfdata always reflects the most recent execution (rows are
    # ordered by created DESC).
    latest = jobs[0]
    perfdata = (
        f"| success={latest['success']};;;0;{latest['total_proc']}"
        f" failed={latest['failed']};;;0;{latest['total_proc']}"
        f" warnings={latest['warnings']};;;0;{latest['total_proc']}"
        f" running={latest['running']};;;0;{latest['total_proc']}"
        f" total={latest['total_proc']}"
    )

    if critical_failed:
        job = critical_failed[0]
        # Show at most five failed accounts; summarize the rest.
        sample = ', '.join(job['failed_users'][:5])
        extra = f" (+{len(job['failed_users']) - 5} more)" if len(job['failed_users']) > 5 else ""
        print(
            f"CRITICAL: {job['job_name']}(id:{job['actid']})"
            f" {job['failed']}/{job['total_proc']} failed"
            f" [{sample}{extra}],"
            f" {job['success']} ok, {job['warnings']} warn"
            f" {perfdata}"
        )
        return 2

    if warning_failed:
        job = warning_failed[0]
        sample = ', '.join(job['failed_users'][:5])
        extra = f" (+{len(job['failed_users']) - 5} more)" if len(job['failed_users']) > 5 else ""
        print(
            f"WARNING: {job['job_name']}(id:{job['actid']})"
            f" {job['failed']}/{job['total_proc']} failed"
            f" [{sample}{extra}],"
            f" {job['success']} ok"
            f" {perfdata}"
        )
        return 1

    if warning_only:
        job = warning_only[0]
        print(
            f"WARNING: {job['job_name']}(id:{job['actid']})"
            f" {job['warnings']}/{job['total_proc']} warnings,"
            f" {job['success']} ok"
            f" {perfdata}"
        )
        return 1

    msg = (
        f"OK: {len(jobs)} job(s) in {critical_hours}h,"
        f" latest {latest['job_name']}"
        f" {latest['success']}/{latest['total_proc']} ok"
    )
    if latest['running'] > 0:
        msg += f" ({latest['running']} running)"
    if users_count is not None:
        msg += f", {users_count} users"
    print(f"{msg} {perfdata}")
    return 0

if __name__ == "__main__":
    # Hosts without the backuply binary installed are healthy by definition.
    if shutil.which("backuply") is None:
        print("OK: Backuply is not installed on this server")
        sys.exit(0)

    # Run checks in order; the first non-OK result becomes the exit code.
    for run_check in (check_backup_servers, check_backup_jobs, check_recent_tasks):
        status = run_check()
        if status != 0:
            sys.exit(status)