# File: //lib64/nagios/plugins/check_backuply.py
#!/usr/bin/env python3
"""
Backuply Health Check Plugin for Icinga
Description:
- Checks if backup servers are configured
- Verifies backup jobs exist
- Monitors recent backup job executions via master process data
- Reports per-job success/failed/warnings/running counts with perfdata
- Lists failed user accounts in output
Usage:
./check_backuply.py [--chours 24] [--whours 12]
Exit Codes:
0 OK
1 WARNING
2 CRITICAL
3 UNKNOWN
"""
import os
import sys
import shutil
import argparse
import time
import sqlite3
import json
import subprocess
def check_backup_servers():
    """Check that at least one backup server is configured.

    Reads /var/backuply/conf/backup_servers.json and prints a single
    Nagios-style status line.

    Returns:
        int: 0 (OK) when servers exist, 2 (CRITICAL) when the config is
        missing, unreadable, invalid JSON, or empty.
    """
    path = "/var/backuply/conf/backup_servers.json"
    if not os.path.exists(path):
        print(f"CRITICAL: Backup servers config not found at {path}")
        return 2
    try:
        with open(path, 'r') as fh:
            servers = json.load(fh)
    except (json.JSONDecodeError, IOError) as err:
        print(f"CRITICAL: Failed to read backup servers config: {err}")
        return 2
    if not servers:
        print("CRITICAL: No backup servers configured")
        return 2
    print(f"OK: {len(servers)} backup server(s) configured")
    return 0
def check_backup_jobs():
    """Check that at least one *active* backup job is configured.

    Reads /var/backuply/conf/backup.json; a job is active when its
    schedule_status equals 1.

    Returns:
        int: 2 (CRITICAL) when the config is missing/unreadable/empty,
        1 (WARNING) when jobs exist but none is active, 0 (OK) otherwise.
    """
    path = "/var/backuply/conf/backup.json"
    if not os.path.exists(path):
        print(f"CRITICAL: Backup jobs config not found at {path}")
        return 2
    try:
        with open(path, 'r') as fh:
            jobs = json.load(fh)
    except (json.JSONDecodeError, IOError) as err:
        print(f"CRITICAL: Failed to read backup jobs config: {err}")
        return 2
    if not jobs:
        print("CRITICAL: No backup jobs configured")
        return 2
    active = sum(1 for entry in jobs if entry.get('schedule_status') == 1)
    if active == 0:
        print("WARNING: No active backup jobs found")
        return 1
    print(f"OK: {active} active backup job(s) configured")
    return 0
def parse_job_stats(data_str):
    """Parse master process data JSON into job-level statistics.

    Per-user status codes inside schedule.users:
        1    -> success
        0    -> failed
        -100 -> completed with warnings
        -1 / [] (empty list) -> running or queued

    Args:
        data_str: JSON string stored in the task row's data column.

    Returns:
        dict with job_name, total_proc, success, failed, warnings,
        running and failed_users keys, or None when data_str is not
        valid JSON.
    """
    try:
        payload = json.loads(data_str)
    except (json.JSONDecodeError, TypeError):
        return None
    sched = payload.get('schedule', {})
    total = sched.get('total_proc', 0)
    ok_count = 0
    warn_count = 0
    failed_users = []
    for name, info in sched.get('users', {}).items():
        # Skip malformed per-user entries that are not dicts.
        if not isinstance(info, dict):
            continue
        status = info.get('status')
        if status == 1:
            ok_count += 1
        elif status == 0:
            failed_users.append(name)
        elif status == -100:
            warn_count += 1
    failed_count = len(failed_users)
    # Anything not terminal (e.g. status -1 or []) counts as still running.
    return {
        'job_name': sched.get('schedule_name', 'Unknown'),
        'total_proc': total,
        'success': ok_count,
        'failed': failed_count,
        'warnings': warn_count,
        'running': total - ok_count - failed_count - warn_count,
        'failed_users': failed_users,
    }
def check_recent_tasks():
    """Check recent backup job executions via master process data in SQLite.

    Parses --chours/--whours thresholds from argv, queries
    /var/backuply/db/tasks.db for 'admin_backup_master_process' rows,
    and prints one Nagios status line with perfdata for the latest job.

    Returns:
        int: Nagios exit code — 0 OK, 1 WARNING, 2 CRITICAL.
    """
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("--chours", type=int, default=24, help="Critical hours threshold (default: 24)")
    parser.add_argument("--whours", type=int, default=12, help="Warning hours threshold (default: 12)")
    # parse_known_args so unrelated arguments on the plugin command line
    # do not abort this check.
    args, _ = parser.parse_known_args()

    def get_current_users_count():
        """Best-effort cPanel account count; returns None when unavailable."""
        try:
            if os.path.exists("/usr/local/cpanel/cpanel"):
                # Argument list with shell=False: no shell parsing of the
                # command (the original used shell=True with a string).
                result = subprocess.run(
                    ["whmapi1", "--output=jsonpretty", "get_current_users_count"],
                    capture_output=True,
                )
                data = json.loads(result.stdout)
                return data['data']['users']
        except Exception:
            # Deliberate best-effort: any failure (missing binary, bad JSON,
            # unexpected schema) just means "count unknown". A bare except
            # here previously also swallowed SystemExit/KeyboardInterrupt.
            pass
        return None

    users_count = get_current_users_count()
    critical_hours = args.chours
    warning_hours = args.whours
    db_path = "/var/backuply/db/tasks.db"
    if not os.path.exists(db_path):
        print(f"WARNING: Database not found at {db_path}")
        return 1
    now = int(time.time())
    critical_threshold = now - (critical_hours * 3600)
    warning_threshold = now - (warning_hours * 3600)
    try:
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()
            # System uptime decides the "new server" grace period; treat an
            # unreadable /proc/uptime as "been up forever".
            try:
                with open('/proc/uptime', 'r') as f:
                    uptime_seconds = float(f.read().split()[0])
            except (IOError, ValueError):
                uptime_seconds = float('inf')
            cursor.execute("""
                SELECT 1
                FROM tasks
                WHERE action = 'admin_backup_master_process'
                LIMIT 1
            """)
            has_backup_history = cursor.fetchone() is not None
            # Fresh server: booted less than 24h ago and never ran a backup —
            # don't alert before the first scheduled run had a chance.
            if uptime_seconds < (24 * 3600) and not has_backup_history:
                uptime_hours = uptime_seconds / 3600
                print(f"OK: New server - uptime {uptime_hours:.1f}h and no backup history")
                return 0
            cursor.execute("""
                SELECT actid, data, created
                FROM tasks
                WHERE action = 'admin_backup_master_process' AND created > ?
                ORDER BY created DESC
            """, (critical_threshold,))
            master_tasks = cursor.fetchall()
        finally:
            # Always release the connection — the original leaked it when a
            # query raised sqlite3.Error before the explicit close().
            conn.close()
    except sqlite3.Error as e:
        print(f"CRITICAL: Failed to query database: {e}")
        return 2
    # No runs at all inside the critical window.
    if not master_tasks and users_count is not None and users_count > 0:
        print(f"CRITICAL: No backup jobs executed in last {critical_hours}h but {users_count} user(s) exist")
        return 2
    if not master_tasks:
        print(f"OK: No backup jobs in last {critical_hours}h")
        return 0
    # Decode each row's JSON payload into per-job stats, newest first.
    jobs = []
    for actid, data_str, created in master_tasks:
        stats = parse_job_stats(data_str)
        if stats is None:
            continue
        stats['actid'] = actid
        stats['created'] = created
        jobs.append(stats)
    if not jobs:
        print(f"OK: {len(master_tasks)} backup job(s) in last {critical_hours}h (no parseable data)")
        return 0
    # Failures inside the warning window escalate to CRITICAL; failures in
    # the older (warning..critical) band stay WARNING, as do warning-only jobs.
    critical_failed = [j for j in jobs if j['failed'] > 0 and j['created'] > warning_threshold]
    warning_failed = [j for j in jobs if j['failed'] > 0 and warning_threshold >= j['created'] > critical_threshold]
    warning_only = [j for j in jobs if j['warnings'] > 0 and j['failed'] == 0 and j['created'] > warning_threshold]
    latest = jobs[0]
    # Perfdata always describes the most recent job.
    perfdata = (
        f"| success={latest['success']};;;0;{latest['total_proc']}"
        f" failed={latest['failed']};;;0;{latest['total_proc']}"
        f" warnings={latest['warnings']};;;0;{latest['total_proc']}"
        f" running={latest['running']};;;0;{latest['total_proc']}"
        f" total={latest['total_proc']}"
    )
    if critical_failed:
        job = critical_failed[0]
        sample = ', '.join(job['failed_users'][:5])
        extra = f" (+{len(job['failed_users']) - 5} more)" if len(job['failed_users']) > 5 else ""
        print(
            f"CRITICAL: {job['job_name']}(id:{job['actid']})"
            f" {job['failed']}/{job['total_proc']} failed"
            f" [{sample}{extra}],"
            f" {job['success']} ok, {job['warnings']} warn"
            f" {perfdata}"
        )
        return 2
    if warning_failed:
        job = warning_failed[0]
        sample = ', '.join(job['failed_users'][:5])
        extra = f" (+{len(job['failed_users']) - 5} more)" if len(job['failed_users']) > 5 else ""
        print(
            f"WARNING: {job['job_name']}(id:{job['actid']})"
            f" {job['failed']}/{job['total_proc']} failed"
            f" [{sample}{extra}],"
            f" {job['success']} ok"
            f" {perfdata}"
        )
        return 1
    if warning_only:
        job = warning_only[0]
        print(
            f"WARNING: {job['job_name']}(id:{job['actid']})"
            f" {job['warnings']}/{job['total_proc']} warnings,"
            f" {job['success']} ok"
            f" {perfdata}"
        )
        return 1
    msg = (
        f"OK: {len(jobs)} job(s) in {critical_hours}h,"
        f" latest {latest['job_name']}"
        f" {latest['success']}/{latest['total_proc']} ok"
    )
    if latest['running'] > 0:
        msg += f" ({latest['running']} running)"
    if users_count is not None:
        msg += f", {users_count} users"
    print(f"{msg} {perfdata}")
    return 0
if __name__ == "__main__":
    # Hosts without the Backuply binary are healthy by definition.
    if shutil.which("backuply") is None:
        print("OK: Backuply is not installed on this server")
        sys.exit(0)
    # Run the checks in order; the first non-OK result becomes the exit code.
    for check in (check_backup_servers, check_backup_jobs, check_recent_tasks):
        status = check()
        if status != 0:
            sys.exit(status)