218 lines
7.9 KiB
Python
218 lines
7.9 KiB
Python
#!/usr/bin/python
|
|
#-------------------------------------------------------------------------------------------------------------#
|
|
# Author: Cliff Cogdill
|
|
# Description: Gather the protection runs from the last 24 hours, limited to 5000 jobs, and separate them into
|
|
# Running, Canceled, Failures, warning, hold, and missed.
|
|
# If the protection group is empty, no ticket is opened (pausing is left to dailyProtectionReview.py).
|
|
# If there are failures in the protection job, open a ServiceNow ticket.
|
|
# Send out a summary of the previous 24 hour run
|
|
#
|
|
#
|
|
#-------------------------------------------------------------------------------------------------------------#
|
|
import sys,argparse,json,time,smtplib
|
|
from email.message import EmailMessage
|
|
sys.path.insert(0, './classes/')
|
|
|
|
import cohesityAPI as cohesity
|
|
import serviceNowAPI as snow
|
|
import automationsAPI as dashboard
|
|
|
|
def GetArgs(argv=None):
    """Parse the command-line options of the daily error report.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default) argparse reads sys.argv[1:], exactly as before; passing
            a list lets the parser be driven programmatically.

    Returns:
        argparse.Namespace with the attributes cluster, vcenter, job, help
        and debugMode.
    """
    # add_help=False because -h/--help is re-declared below as a plain flag;
    # the script prints its own usage text through PrintHelp().
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument('--cluster', '-c', type=str, action='store',
                        help='FQDN of Cohesity cluster address')
    # --vcenter and --job are accepted but not referenced elsewhere in this
    # script; they are kept so existing invocations keep parsing.
    parser.add_argument('--vcenter', '-v', type=str, action='store')
    parser.add_argument('--job', '-j', type=str, action='store')
    parser.add_argument('--help', '-h', action='store_true',
                        help='Prints the help message')
    parser.add_argument('--debugMode', '-d', action='store_true',
                        help='Debug output, dev ServiceNow instance, single recipient')
    return parser.parse_args(argv)
|
|
|
|
def PrintHelp():
    """Print the basic usage text for this script to stdout."""
    print("\nBasic Usage:")
    print("\n python3 dailyErrors.py -c cluster1.domain.tld" )
    print("\t -c FQDN of Cohesity cluster address")
    # -d was accepted by the parser but missing from the usage text, even
    # though it switches the recipients and the ServiceNow instance.
    print("\t -d Debug mode: extra output, dev ServiceNow instance, single email recipient")
    print("\t -h Prints this help message")
|
|
|
|
def SendEmail(body, cluster):
    """Mail the job-status report for one cluster through the SMTP relay.

    Args:
        body: Plain-text report used as the message content.
        cluster: Cluster name interpolated into the subject line.

    Reads the module-level debugMode flag: when it is set, only the single
    debug recipient is addressed.
    """
    to_addrs = ['zmeier@nd.gov', 'cecogdill@nd.gov']
    if debugMode:
        to_addrs = ['cecogdill@nd.gov']

    # Headers are assigned in the same order as before so the serialized
    # message is unchanged.
    header_fields = (
        ('Subject', "Cohesity Job Status for {0}".format(cluster)),
        ('From', "No-Reply@nd.gov"),
        ('To', ", ".join(to_addrs)),
    )
    report = EmailMessage()
    for field_name, field_value in header_fields:
        report[field_name] = field_value
    report.set_content(body)

    with smtplib.SMTP('apprelay.nd.gov') as relay:
        relay.send_message(report)

    print("Sent email")
|
|
|
|
# Define variables
args = GetArgs()

# store_true already yields a bool; bool() just makes that explicit.
debugMode = bool(args.debugMode)

# Check for arguments and act accordingly
if args.help:
    PrintHelp()
    # sys.exit() rather than the exit() helper: exit() is injected by the
    # site module for interactive use and is not guaranteed to exist.
    # The exit status (1) is unchanged.
    sys.exit(1)

# Every step below needs a cluster address, so fail early with the usage
# text instead of passing None into the Cohesity API wrapper.
if not args.cluster:
    print("\nError: --cluster/-c is required")
    PrintHelp()
    sys.exit(1)

# Establish a connection to ServiceNow (dev instance when debugging)
if debugMode:
    ticketSystem = snow.SnowAPI("northdakotadev.service-now.com")
else:
    ticketSystem = snow.SnowAPI("northdakota.service-now.com")

# Establish a connection to Cohesity
cluster = cohesity.API(args.cluster)
authToken = cluster.GetAuthToken()
cluster.UpdateHeaders(authToken['accessToken'])

# Get yesterday's start time in unixEpoch:
# NOTE(review): presumably (-1, 17, 0, 0) means "one day back at 17:00:00";
# confirm against cohesityAPI.GetRelativeTimestamp. The value is passed as
# startTimeUsecs below.
prevWindow = cluster.GetRelativeTimestamp(-1, 17, 0, 0)

# Debug Output
if debugMode:
    print("\nPulling backup jobs from {}\n".format(prevWindow))

# Pull a list of protection runs from yesterday's backup window.
jobStatus = cluster.GetFilteredRequest(
    "/public/protectionRuns",
    "?startTimeUsecs=" + str(prevWindow) + "&numRuns=5000",
)
|
|
|
|
#Define the arrays we will use to sort the protectionRuns
running = []
cancel = []
failures = []
# Keys of failures already collected; a set of (jobName, error) tuples so the
# membership test is O(1) and two different jobs can never produce the same
# key by accident of string concatenation.
failControl = set()
warnings = []
holds = []
missed = []
tickets = []

# Run status -> list that collects it. kFailure is handled separately because
# it is de-duplicated; any status not listed here is ignored.
statusBuckets = {
    "kRunning": running,
    "kCanceling": cancel,
    "kCanceled": cancel,
    "kWarning": warnings,
    "kOnHold": holds,
    "kMissed": missed,
}

# Examine and sort the jobs by type status
for job in jobStatus:
    backupRun = job['backupRun']
    status = backupRun['status']

    # Debug Output
    if args.debugMode:
        print("Status of {0}: {1}\n".format(job['jobName'], status))

    if status == "kFailure":
        # Keep only the first run for each (job, error) pair. .get() avoids a
        # KeyError when a failed run carries no 'error' field.
        failKey = (job['jobName'], backupRun.get('error'))
        if failKey not in failControl:
            failControl.add(failKey)
            failures.append(job)
    elif status in statusBuckets:
        statusBuckets[status].append(job)
|
|
|
|
# Start formatting the email output into the 'body' variable.

# Jobs that never get a ticket or a detail line in the report.
noTicketJobs = ('NDPERS-Applications@physical',)

# Job-level error Cohesity reports for an empty protection group. Those jobs
# get no ticket; per the original note, dailyProtectionReview.py deals with them.
emptyGroupError = "Cannot find any eligible backup source for this run"

# Substring identifying the snapshot-limit error, and the friendlier text that
# replaces it. Adjacent literals are used instead of backslash continuations
# so the message cannot pick up whitespace from the source indentation.
snapshotLimitMarker = "Exceeded the maximum number of permitted snapshots"
snapshotLimitAdvice = (
    "An error occurred while saving the snapshot: Exceeded the maximum number "
    "of permitted snapshots. Check whether or not snapshots on these objects "
    "are allowed in the VM properties. If snapshots are not allowed, ensure "
    "the server is exempted from backups according to NDIT Backup Exemption "
    "procedures.\n"
)


def _FormatSection(title, jobs):
    """Return '<title>: <count>' followed by one indented line per job name."""
    parts = [title + ": " + str(len(jobs))]
    parts.extend("\n\t " + entry['jobName'] for entry in jobs)
    return "".join(parts)


def _DescribeSourceError(sourceName, sourceError):
    """Return the indented report/ticket text for one failing backup source."""
    if snapshotLimitMarker in sourceError:
        # Clean up Cohesitys error a bit
        detail = snapshotLimitAdvice
    else:
        detail = sourceError
    return "\n\t\t" + sourceName + ":\n\t\t\t" + detail


sections = [
    _FormatSection("Running", running),
    _FormatSection("Canceled", cancel),
]

# NOTE(review): the count includes jobs that are skipped below (exempt jobs
# and empty protection groups), so it can exceed the number of listed jobs.
# This matches the previous output.
failureParts = ["Failures: " + str(len(failures))]
for entry in failures:
    jobName = entry['jobName']
    backupRun = entry['backupRun']

    if jobName in noTicketJobs:
        continue

    # Always start from this job's own error so nothing leaks over from the
    # previous iteration when the field is absent.
    jobError = backupRun.get('error') or ""

    # Empty protection group: no ticket, go to the next item.
    if jobError == emptyGroupError:
        continue

    retired = False
    sourceDetails = []
    for source in backupRun.get('sourceBackupStatus') or []:
        if 'error' not in source:
            continue

        this_vm = source['source']['name']
        # Collect every failing source instead of keeping only the last one.
        sourceDetails.append(_DescribeSourceError(this_vm, source['error']))

        # Retired in ServiceNow
        cmdbRecord = ticketSystem.getCMDBItemByFQDN(this_vm)
        # ServiceNow returns operational_status as a string, hence int().
        if cmdbRecord and int(cmdbRecord[0]['operational_status']) == 6:
            retired = True

    if sourceDetails:
        errorMessage = "".join(sourceDetails)
    else:
        # No per-source detail: fall back to the job-level error, on its own
        # indented line so it is not glued to the job name in the report.
        errorMessage = "\n\t\t" + (jobError or "No error detail reported for this run")

    # Create the incident based off the error message in the job
    incident = ticketSystem.submitTicket("svccohesityadm", "Backup Error for: " + jobName, errorMessage)
    incidentID = incident['result']['sys_id']
    tickets.append(incident['result']['number'])

    if '@SQL' in jobName and not retired:
        # SQL jobs on non-retired servers go to the database group
        ticketSystem.assignTicketToGroup(incidentID, 'NDIT-Database')
    else:
        # Everything else (including retired servers) goes to storage
        ticketSystem.assignTicketToGroup(incidentID, 'NDIT-Computer Systems Storage')

    failureParts.append("\n\t " + jobName + errorMessage)

sections.append("".join(failureParts))
sections.append(_FormatSection("Warning", warnings))
sections.append(_FormatSection("Hold", holds))
sections.append(_FormatSection("Missed", missed))

ticketParts = ["Tickets: " + str(len(tickets)) + "\n\t Instance: " + ticketSystem.snInstance]
ticketParts.extend("\n\t\t" + t for t in tickets)
sections.append("".join(ticketParts))

body = "\n".join(sections)

# Send the email to the list of recipients in the local SendEmail function
SendEmail(body, args.cluster)
dashboard.send_automation({'AutomationName': 'Infra-Cohesity', 'Action': 'Maintenance', 'Platform': 'Python-dailyErrors.py', 'Units': 60})
|