update
This commit is contained in:
@@ -0,0 +1,217 @@
|
||||
#!/usr/bin/python
|
||||
#-------------------------------------------------------------------------------------------------------------#
|
||||
# Author: Cliff Cogdill
|
||||
# Description: Gather the protection runs from the last 24 hours, limited to 5000 jobs, and separate them into
|
||||
# Running, Canceled, Failures, warning, hold, and missed.
|
||||
# If the protection group is empty, skip it here; pausing is left to dailyProtectionReview.py.
|
||||
# If there are failures in the protection job, open a ServiceNow ticket.
|
||||
# Send out a summary of the previous 24 hour run
|
||||
#
|
||||
#
|
||||
#-------------------------------------------------------------------------------------------------------------#
|
||||
import sys,argparse,json,time,smtplib
|
||||
from email.message import EmailMessage
|
||||
sys.path.insert(0, './classes/')
|
||||
|
||||
import cohesityAPI as cohesity
|
||||
import serviceNowAPI as snow
|
||||
import automationsAPI as dashboard
|
||||
|
||||
def GetArgs():
    """Parse the command-line options for this script.

    argparse's built-in help is switched off so that ``-h``/``--help`` is an
    ordinary boolean flag the caller can test.

    Returns:
        argparse.Namespace carrying ``cluster``, ``vcenter`` and ``job``
        (strings, ``None`` when omitted) plus the booleans ``help`` and
        ``debugMode``.
    """
    valueOptions = (('--cluster', '-c'), ('--vcenter', '-v'), ('--job', '-j'))
    flagOptions = (('--help', '-h'), ('--debugMode', '-d'))

    cli = argparse.ArgumentParser(add_help=False)
    for longName, shortName in valueOptions:
        cli.add_argument(longName, shortName, type=str, action='store')
    for longName, shortName in flagOptions:
        cli.add_argument(longName, shortName, action='store_true')

    return cli.parse_args()
|
||||
|
||||
def PrintHelp():
    """Print basic usage for this script to stdout.

    Lists the cluster option, the debug switch and the help flag, one per
    line, in the same layout the script has always used.
    """
    usageLines = (
        "\nBasic Usage:",
        "\n python3 dailyErrors.py -c cluster1.domain.tld",
        "\t -c FQDN of Cohesity cluster address",
        # -d was accepted but undocumented even though it switches the
        # ServiceNow instance and the email recipients.
        "\t -d Debug mode: verbose output, dev ServiceNow instance, reduced email recipients",
        "\t -h Prints this help message",
    )
    for line in usageLines:
        print(line)
|
||||
|
||||
def SendEmail(body, cluster):
    """Mail the job-status summary through the apprelay.nd.gov SMTP relay.

    Args:
        body: Plain-text content of the message.
        cluster: Cluster name inserted into the subject line.

    Reads the module-level ``debugMode`` flag: when it is set the message
    goes to a single address instead of the full recipient list.
    """
    recipients = ['cecogdill@nd.gov'] if debugMode else ['zmeier@nd.gov', 'cecogdill@nd.gov']

    headers = (
        ('Subject', "Cohesity Job Status for {0}".format(cluster)),
        ('From', "No-Reply@nd.gov"),
        ('To', ", ".join(recipients)),
    )

    email = EmailMessage()
    for headerName, headerValue in headers:
        email[headerName] = headerValue
    email.set_content(body)

    # The context manager closes the relay connection once the message is sent
    with smtplib.SMTP('apprelay.nd.gov') as relay:
        relay.send_message(email)

    print("Sent email")
|
||||
|
||||
# Define variables
args = GetArgs()

# Plain bool copy of the flag; SendEmail() and the ServiceNow setup read it
# as a module-level global.
debugMode = bool(args.debugMode)

# Check for arguments and act accordingly
if args.help:
    PrintHelp()
    # sys.exit() instead of the bare exit() helper: exit() is injected by the
    # site module for interactive use and is missing under `python -S`.
    # The exit status stays 1 so existing callers see no change.
    sys.exit(1)
|
||||
|
||||
# Establish a connection to ServiceNow: debug runs use the dev instance,
# everything else uses the production one.
ticketSystem = snow.SnowAPI(
    "northdakotadev.service-now.com" if debugMode else "northdakota.service-now.com"
)

# Establish a connection to Cohesity and load the access token into the
# request headers
cluster = cohesity.API(args.cluster)
authToken = cluster.GetAuthToken()
cluster.UpdateHeaders(authToken['accessToken'])

# Get yesterday's start time in unixEpoch:
prevWindow = cluster.GetRelativeTimestamp(-1, 17, 0, 0)

# Debug Output
if args.debugMode:
    print("\nPulling backup jobs from {}\n".format(prevWindow))

# Pull a list of protection runs from yesterday's backup window
# (capped at 5000 runs by the numRuns filter).
runFilter = "?startTimeUsecs=" + str(prevWindow) + "&numRuns=5000"
jobStatus = cluster.GetFilteredRequest("/public/protectionRuns", runFilter)
|
||||
|
||||
# Define the buckets we will use to sort the protectionRuns
running = []
cancel = []
failures = []
failControl = set()  # (jobName, error) pairs already stored in `failures`
warnings = []
holds = []
missed = []
tickets = []

# Statuses that map straight onto a bucket. kFailure is handled on its own
# below because repeated failures are de-duplicated; any status not listed
# here is ignored.
statusBuckets = {
    "kRunning": running,
    "kCanceling": cancel,
    "kCanceled": cancel,
    "kWarning": warnings,
    "kOnHold": holds,
    "kMissed": missed,
}

# Examine and sort the jobs by status
for job in jobStatus:
    backupRun = job['backupRun']
    status = backupRun['status']

    # Debug Output
    if args.debugMode:
        print("Status of {0}: {1}\n".format(job['jobName'], status))

    if status == "kFailure":
        # Keep one entry per (job, error) pair. A failed run is not
        # guaranteed to carry an 'error' field (the failure report further
        # down checks for it too), so default to '' instead of raising
        # KeyError here.
        failureKey = (job['jobName'], backupRun.get('error', ''))
        if failureKey not in failControl:
            failControl.add(failureKey)
            failures.append(job)
    elif status in statusBuckets:
        statusBuckets[status].append(job)
|
||||
|
||||
# Start formatting the email output into the 'body' variable. Each section is
# a heading with its count followed by one indented line per job name.
runningLines = ["\n\t " + run['jobName'] for run in running]
body = "Running: " + str(len(running)) + "".join(runningLines)

cancelLines = ["\n\t " + run['jobName'] for run in cancel]
body = body + "\nCanceled: " + str(len(cancel)) + "".join(cancelLines)
|
||||
|
||||
# Failures section: list each failed job with its error text and open a
# ServiceNow incident for it.
# NOTE(review): the nesting below (CMDB lookup per failing source, one
# incident per job) was reconstructed from flattened source - confirm.
body = body + "\nFailures: " + str(len(failures))

# Built once, with adjacent literals instead of backslash continuations so
# source indentation can never leak into the ticket text.
snapshotLimitAdvice = (
    ":\n\t\t\tAn error occurred while saving "
    "the snapshot: Exceeded the maximum number of permitted snapshots. "
    "Check whether or not snapshots on "
    "these objects are allowed in the VM properties. "
    "If snapshots are not allowed, ensure the server is "
    "exempted from backups according to NDIT Backup Exemption procedures.\n"
)

for entry in failures:
    jobName = entry['jobName']

    # This job is skipped entirely: no email entry and no ticket
    if jobName == 'NDPERS-Applications@physical':
        continue

    backupRun = entry['backupRun']

    # Start from the job-level error. Defaulting to '' keeps a run without an
    # 'error' field from raising NameError or re-using the previous job's text.
    errorMessage = backupRun.get('error', '')

    # We know there are empty protection jobs but we don't need to create a
    # ticket on them; pausing them is left to the daily backup audit script
    # (dailyProtectionReview.py), so go straight to the next item.
    if errorMessage == "Cannot find any eligible backup source for this run":
        continue

    retired = False
    for source in backupRun.get('sourceBackupStatus', []):
        if 'error' not in source:
            continue

        this_vm = source['source']['name']

        # Clean up Cohesitys error a bit
        if "Exceeded the maximum number of permitted snapshots" in source['error']:
            errorMessage = "\n\t\t" + errorMessage + this_vm + snapshotLimitAdvice
        else:
            # NOTE(review): this replaces the message, so with several failing
            # sources only the last one's text survives - confirm intended.
            errorMessage = "\n\t\t" + this_vm + ":\n\t\t\t" + source['error']

        # Retired in ServiceNow? operational_status comes back as a string,
        # hence the int() conversion; 6 is the value treated as retired here.
        cmdbRecord = ticketSystem.getCMDBItemByFQDN(this_vm)
        if cmdbRecord and int(cmdbRecord[0]['operational_status']) == 6:
            retired = True

    # Create the incident based off the error message in the job
    incident = ticketSystem.submitTicket("svccohesityadm", "Backup Error for: " + jobName, errorMessage)
    incidentID = incident['result']['sys_id']
    tickets.append(incident['result']['number'])

    # SQL jobs go to the database group unless a failing source is retired in
    # the CMDB; everything else goes to storage.
    if '@SQL' in jobName and not retired:
        assignmentGroup = 'NDIT-Database'
    else:
        assignmentGroup = 'NDIT-Computer Systems Storage'
    ticketSystem.assignTicketToGroup(incidentID, assignmentGroup)

    # Add the job and its error text to the email
    message = "\n\t " + jobName + errorMessage
    body = body + message
|
||||
|
||||
# Remaining summary sections: a heading with its count, then one indented
# line per job name.
for heading, jobs in (("\nWarning: ", warnings), ("\nHold: ", holds), ("\nMissed: ", missed)):
    body = body + heading + str(len(jobs))
    body = body + "".join("\n\t " + run['jobName'] for run in jobs)

# Ticket numbers collected in `tickets`, listed under the ServiceNow instance
body = body + "\nTickets: " + str(len(tickets)) + "\n\t Instance: " + ticketSystem.snInstance
body = body + "".join("\n\t\t" + number for number in tickets)
|
||||
|
||||
# Send the email to the list of recipients in the local SendEmail function
SendEmail(body, args.cluster)

# Hand this run's record to the automations module
automationRecord = {
    'AutomationName': 'Infra-Cohesity',
    'Action': 'Maintenance',
    'Platform': 'Python-dailyErrors.py',
    'Units': 60,
}
dashboard.send_automation(automationRecord)
|
||||
Reference in New Issue
Block a user