Files
Zack Meier 1d304511b8 update
2026-04-15 15:45:50 -05:00

218 lines
7.9 KiB
Python

#!/usr/bin/python
#-------------------------------------------------------------------------------------------------------------#
# Author: Cliff Cogdill
# Description: Gather the protection runs from the last 24 hours, limited to 5000 jobs, and separate them into
# Running, Canceled, Failures, Warning, Hold, and Missed.
# If the protection group is empty pause it.
# If there are failures in the protection job, open a ServiceNow ticket.
# Send out a summary of the previous 24 hour run
#
#
#-------------------------------------------------------------------------------------------------------------#
import sys,argparse,json,time,smtplib
from email.message import EmailMessage
sys.path.insert(0, './classes/')
import cohesityAPI as cohesity
import serviceNowAPI as snow
import automationsAPI as dashboard
def GetArgs():
    """Parse the script's command-line options and return the resulting namespace.

    argparse's automatic help is switched off; ``--help``/``-h`` is registered as an
    ordinary boolean flag so the caller decides what to do when it is present.
    """
    parser = argparse.ArgumentParser(add_help=False)
    # Options that take a string value.
    for longName, shortName in (('--cluster', '-c'), ('--vcenter', '-v'), ('--job', '-j')):
        parser.add_argument(longName, shortName, type=str, action='store')
    # Boolean switches (False unless given on the command line).
    for longName, shortName in (('--help', '-h'), ('--debugMode', '-d')):
        parser.add_argument(longName, shortName, action='store_true')
    return parser.parse_args()
def PrintHelp():
    """Print a short usage summary for the script to stdout.

    Lists the cluster option, the debug switch and the help switch.
    """
    print("\nBasic Usage:")
    print("\n python3 dailyErrors.py -c cluster1.domain.tld" )
    print("\t -c FQDN of Cohesity cluster address")
    # The debug switch changes the ServiceNow instance, the recipients and the
    # console output, so it is documented alongside the other options.
    print("\t -d Debug mode: verbose output, dev ServiceNow instance, reduced email recipients")
    print("\t -h Prints this help message")
def SendEmail(body, cluster):
    """Email the job-status summary for a cluster through the SMTP relay.

    body    -- plain-text content of the message
    cluster -- cluster name placed in the subject line

    The recipient list is chosen from the module-level ``debugMode`` flag.
    """
    # Debug runs go to a single recipient; normal runs go to the full list.
    recipients = ['cecogdill@nd.gov'] if debugMode else ['zmeier@nd.gov', 'cecogdill@nd.gov']

    msg = EmailMessage()
    msg['Subject'] = f"Cohesity Job Status for {cluster}"
    msg['From'] = "No-Reply@nd.gov"
    msg['To'] = ", ".join(recipients)
    msg.set_content(body)

    # The context manager closes the relay connection once the message is handed off.
    with smtplib.SMTP('apprelay.nd.gov') as relay:
        relay.send_message(msg)
    print("Sent email")
# Parse the command line once; the rest of the script is driven by these values.
args = GetArgs()
# Module-level debug switch, read by SendEmail() and by the ServiceNow setup.
debugMode = bool(args.debugMode)

# Check for arguments and act accordingly: --help prints the usage text and stops.
if args.help:
    PrintHelp()
    # sys.exit rather than the site-provided exit(), which is intended for the
    # interactive interpreter and is missing when site is not loaded.
    # The exit status is unchanged.
    sys.exit(1)
# Establish a connection to ServiceNow; debug runs use the dev instance.
ticketSystem = snow.SnowAPI(
    "northdakotadev.service-now.com" if debugMode else "northdakota.service-now.com"
)

# Establish a connection to Cohesity and install the access token on the session headers.
cluster = cohesity.API(args.cluster)
authToken = cluster.GetAuthToken()
cluster.UpdateHeaders(authToken['accessToken'])

# Timestamp for the start of yesterday's backup window (sent to the API as startTimeUsecs).
prevWindow = cluster.GetRelativeTimestamp(-1, 17, 0, 0)

# Debug Output
if args.debugMode:
    print(f"\nPulling backup jobs from {prevWindow}\n")

# Pull the protection runs since that timestamp, capped at 5000 runs.
jobStatus = cluster.GetFilteredRequest(
    "/public/protectionRuns",
    f"?startTimeUsecs={prevWindow!s}&numRuns=5000",
)
# Buckets the protection runs are sorted into, one list per status of interest.
running = []
cancel = []
failures = []
# Keys (job name + error text) of failures already recorded. A set keeps the
# duplicate check constant-time per run.
failControl = set()
warnings = []
holds = []
missed = []
# Ticket numbers of the incidents opened later in the script.
tickets = []

# Examine each run and sort it by its backup status.
for job in jobStatus:
    status = job['backupRun']['status']
    # Debug Output
    if args.debugMode:
        print("Status of {0}: {1}\n".format(job['jobName'], status))

    if status == "kRunning":
        running.append(job)
    elif status in ("kCanceling", "kCanceled"):
        cancel.append(job)
    elif status == "kFailure":
        # A failed run may carry no 'error' text (the reporting code further down
        # checks for the key), so fall back to an empty string instead of
        # raising KeyError.
        failKey = job['jobName'] + job['backupRun'].get('error', '')
        # Keep only the first run for each distinct job/error combination.
        if failKey not in failControl:
            failControl.add(failKey)
            failures.append(job)
    elif status == "kWarning":
        warnings.append(job)
    elif status == "kOnHold":
        holds.append(job)
    elif status == "kMissed":
        missed.append(job)
    # Runs with any other status are not reported.
# Start formatting the email output into the 'body' variable. Each section is a
# heading carrying the run count, followed by one indented line per job name.
body = f"Running: {len(running)}"
body += "".join("\n\t " + run['jobName'] for run in running)
body += f"\nCanceled: {len(cancel)}"
body += "".join("\n\t " + run['jobName'] for run in cancel)
# Failures section: heading with the count, then per-job detail and ServiceNow ticketing.
# NOTE(review): confirm the nesting of the per-source loop and of the ticket-creation
# statements below before changing anything here; their scope determines how many
# incidents are opened and which error text each one carries.
body = body + "\nFailures: " + str(len(failures))
for entry in failures:
# Set to True below when a failing source is found retired in the ServiceNow CMDB;
# it affects which group the incident is assigned to.
retired = False
# Create the message entry
message = "\n\t " + entry['jobName']
# This job is skipped outright: no detail line in the email and no ticket.
if entry['jobName'] == 'NDPERS-Applications@physical':
continue
if 'error' in entry['backupRun']:
# Start from the job-level error text; per-source detail below may replace or extend it.
errorMessage = entry['backupRun']['error']
# We know there are empty protection jobs but we don't need to create a ticket on them; instead lets pause them and then
# add logic to the daily backup audit script (dailyProtectionReview.py) to check the paused jobs for VMs and take action
# if needed
if errorMessage == "Cannot find any eligible backup source for this run":
# NOTE(review): this replacement text does not appear to be used, because the
# `continue` that follows moves on to the next job; confirm whether it was
# meant to reach the email body.
errorMessage = "Protection group was empty, future runs will be paused by the dailyProtectionReview.py script"
# No ticket is needed for an empty protection group, so go to the next item
continue
# Walk the per-source results and pick up the error text of each failing source.
for source in entry['backupRun']['sourceBackupStatus']:
if 'error' in source:
#errorMessage = "Details:"
# Clean up Cohesity's error a bit: the snapshot-limit error gets a longer explanation
if "Exceeded the maximum number of permitted snapshots" in source['error']:
errorMessage = "\n\t\t" + errorMessage + source['source']['name'] + ":\n\t\t\tAn error occurred while saving \
the snapshot: Exceeded the maximum number of permitted snapshots. Check whether or not snapshots on \
these objects are allowed in the VM properties. If snapshots are not allowed, ensure the server is \
exempted from backups according to NDIT Backup Exemption procedures.\n"
else:
# NOTE(review): unlike the branch above, this assignment does not include the previous
# errorMessage; confirm whether earlier text is meant to be dropped.
errorMessage = "\n\t\t" + source['source']['name'] + ":\n\t\t\t" + source['error']
# Look the source up in the ServiceNow CMDB to see whether it is retired
this_vm = source['source']['name']
cmdbRecord = ticketSystem.getCMDBItemByFQDN(this_vm)
# ServiceNow returns operational_status as a string, so convert it to int before comparing.
# A value of 6 is treated as retired here (presumably the CMDB "Retired" state - confirm).
if len(cmdbRecord) > 0:
if int(cmdbRecord[0]['operational_status']) == 6:
retired = True
# Create the incident based off the error message in the job
incident = ticketSystem.submitTicket("svccohesityadm", "Backup Error for: " + entry['jobName'], errorMessage)
incidentID = incident['result']['sys_id']
# Remember the ticket number for the "Tickets" section of the email
tickets.append(incident['result']['number'])
if '@SQL' in entry['jobName'] and retired != True:
# SQL jobs with no retired source go to the database group
ticketSystem.assignTicketToGroup(incidentID, 'NDIT-Database')
else:
# Everything else (including SQL jobs with a retired source) goes to storage
ticketSystem.assignTicketToGroup(incidentID, 'NDIT-Computer Systems Storage')
# Append the error detail to this job's line and add it to the email body
message = message + errorMessage
body = body + message
# The remaining status sections share one layout: a heading with the run count,
# then one indented line per job name.
for heading, bucket in (("Warning", warnings), ("Hold", holds), ("Missed", missed)):
    body += "\n" + heading + ": " + str(len(bucket))
    body += "".join("\n\t " + run['jobName'] for run in bucket)

# Ticket section: how many incidents were opened, on which ServiceNow instance,
# and the number of each one.
body += "\nTickets: " + str(len(tickets)) + "\n\t Instance: " + ticketSystem.snInstance
body += "".join("\n\t\t" + number for number in tickets)
# Deliver the finished summary through the local SendEmail function, then pass
# this script's details to the automations API.
SendEmail(body, args.cluster)
dashboard.send_automation({
    'AutomationName': 'Infra-Cohesity',
    'Action': 'Maintenance',
    'Platform': 'Python-dailyErrors.py',
    'Units': 60,
})