#!/usr/bin/python
#-------------------------------------------------------------------------------------------------------------#
# Author: Cliff Cogdill
# Description: Gather the protection runs from the last 24 hours, limited to 5000 jobs, and separate them into
#              Running, Canceled, Failures, Warning, Hold, and Missed.
#              Empty protection groups are reported but not ticketed; pausing them is left to the
#              dailyProtectionReview.py script.
#              If there are failures in a protection job, open a ServiceNow ticket.
#              Send out a summary of the previous 24 hour run.
#
#-------------------------------------------------------------------------------------------------------------#
import sys
import argparse
import json
import time
import smtplib
from email.message import EmailMessage

# The project-local API wrappers live in ./classes/ relative to the working directory.
sys.path.insert(0, './classes/')
import cohesityAPI as cohesity
import serviceNowAPI as snow
import automationsAPI as dashboard

# Set from the command line in main(); read by SendEmail() to pick the recipient list.
debugMode = False

# Maps a Cohesity run status onto the report bucket it is filed under.
# Any status not listed here (successful runs, for example) is left out of the report.
STATUS_BUCKETS = {
    "kRunning": "running",
    "kCanceling": "cancel",
    "kCanceled": "cancel",
    "kFailure": "failures",
    "kWarning": "warnings",
    "kOnHold": "holds",
    "kMissed": "missed",
}

# Jobs whose failures are neither ticketed nor listed in the email.
IGNORED_JOBS = ('NDPERS-Applications@physical',)

# Job-level error Cohesity reports when a protection group has nothing to back up.
EMPTY_GROUP_ERROR = "Cannot find any eligible backup source for this run"
EMPTY_GROUP_NOTE = ("Protection group was empty, future runs will be paused by the "
                    "dailyProtectionReview.py script")

# Source-level error that gets replaced with friendlier, actionable wording.
SNAPSHOT_LIMIT_ERROR = "Exceeded the maximum number of permitted snapshots"
SNAPSHOT_LIMIT_NOTE = ("An error occurred while saving the snapshot: Exceeded the maximum number of "
                       "permitted snapshots. Check whether or not snapshots on these objects are "
                       "allowed in the VM properties. If snapshots are not allowed, ensure the server "
                       "is exempted from backups according to NDIT Backup Exemption procedures.\n")

# CMDB operational_status value that this script treats as "retired".
RETIRED_STATUS = 6

# Used when a failed run carries neither a job-level nor a source-level error.
NO_DETAILS_NOTE = "No error details were reported for this run"


def GetArgs():
    """Parse and return the command-line arguments.

    The built-in argparse help is disabled so that -h can be routed to PrintHelp().
    --vcenter and --job are accepted but not used anywhere in this script.
    """
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument('--cluster', '-c', type=str, action='store')
    parser.add_argument('--vcenter', '-v', type=str, action='store')
    parser.add_argument('--job', '-j', type=str, action='store')
    parser.add_argument('--help', '-h', action='store_true')
    parser.add_argument('--debugMode', '-d', action='store_true')
    return parser.parse_args()


def PrintHelp():
    """Print a short usage message to stdout."""
    print("\nBasic Usage:")
    print("\n python3 dailyErrors.py -c cluster1.domain.tld")
    print("\t -c FQDN of Cohesity cluster address")
    print("\t -d Debug mode: verbose output, ServiceNow dev instance, reduced recipient list")
    print("\t -h Prints this help message")


def SendEmail(body, cluster):
    """Email the status report through the application mail relay.

    body    -- plain-text report to send
    cluster -- cluster name, used in the subject line

    The recipient list depends on the module-level debugMode flag.
    """
    if debugMode:
        recipients = ['cecogdill@nd.gov']
    else:
        recipients = ['zmeier@nd.gov', 'cecogdill@nd.gov']

    email = EmailMessage()
    email['Subject'] = "Cohesity Job Status for {0}".format(cluster)
    email['From'] = "No-Reply@nd.gov"
    email['To'] = ", ".join(recipients)
    email.set_content(body)

    with smtplib.SMTP('apprelay.nd.gov') as smtp:
        smtp.send_message(email)
    print("Sent email")


def _SortRuns(runs, debug=False):
    """Sort protection runs into report buckets keyed by the names in STATUS_BUCKETS.

    Failures are de-duplicated on (job name, job-level error) so a job that keeps failing
    with the same error produces one entry, and therefore one ticket.
    """
    buckets = {name: [] for name in set(STATUS_BUCKETS.values())}
    seenFailures = set()

    for job in runs or []:
        backupRun = job['backupRun']
        status = backupRun['status']
        if debug:
            print("Status of {0}: {1}\n".format(job['jobName'], status))

        if status == "kFailure":
            # The job-level 'error' key is optional, so do not index it directly.
            key = (job['jobName'], backupRun.get('error', ''))
            if key in seenFailures:
                continue
            seenFailures.add(key)

        bucket = STATUS_BUCKETS.get(status)
        if bucket is not None:
            buckets[bucket].append(job)

    return buckets


def _FormatSection(title, entries):
    """Return 'title: count' followed by one indented line per job name."""
    lines = ["{0}: {1}".format(title, len(entries))]
    lines.extend("\n\t " + entry['jobName'] for entry in entries)
    return "".join(lines)


def _DescribeFailure(entry, ticketSystem):
    """Build the error text for one failed run and check whether a failing source is retired.

    Returns (errorMessage, retired). errorMessage lists every failing source; when no source
    reports an error it falls back to the job-level error, then to NO_DETAILS_NOTE.
    retired is True when the CMDB record of any failing source has RETIRED_STATUS.
    """
    backupRun = entry['backupRun']
    details = []
    retired = False

    for source in backupRun.get('sourceBackupStatus') or []:
        if 'error' not in source:
            continue
        sourceName = source['source']['name']

        # Clean up Cohesity's snapshot-limit error a bit; pass every other error through as is.
        if SNAPSHOT_LIMIT_ERROR in source['error']:
            text = SNAPSHOT_LIMIT_NOTE
        else:
            text = source['error']
        details.append("\n\t\t" + sourceName + ":\n\t\t\t" + text)

        # ServiceNow returns operational_status as a string, so convert before comparing.
        cmdbRecord = ticketSystem.getCMDBItemByFQDN(sourceName)
        if cmdbRecord and int(cmdbRecord[0]['operational_status']) == RETIRED_STATUS:
            retired = True

    if details:
        return "".join(details), retired
    return backupRun.get('error') or NO_DETAILS_NOTE, retired


def _ProcessFailures(failures, ticketSystem, tickets):
    """Open a ServiceNow incident per failed run and return the email text for the failures.

    failures     -- de-duplicated failed runs
    ticketSystem -- ServiceNow client used to look up CMDB items and file incidents
    tickets      -- list that receives the number of every incident created (mutated in place)
    """
    parts = []
    for entry in failures:
        jobName = entry['jobName']
        if jobName in IGNORED_JOBS:
            continue

        # We know there are empty protection jobs but we don't need a ticket for them: the daily
        # backup audit script (dailyProtectionReview.py) pauses them and reviews the paused jobs.
        # They are still listed in the email so the Failures count matches what is shown.
        if entry['backupRun'].get('error') == EMPTY_GROUP_ERROR:
            parts.append("\n\t " + jobName + "\n\t\t" + EMPTY_GROUP_NOTE)
            continue

        errorMessage, retired = _DescribeFailure(entry, ticketSystem)

        # Create the incident based off the error message in the job.
        incident = ticketSystem.submitTicket("svccohesityadm", "Backup Error for: " + jobName, errorMessage)
        incidentID = incident['result']['sys_id']
        tickets.append(incident['result']['number'])

        if '@SQL' in jobName and not retired:
            # SQL jobs go to the database team, unless a failing source is retired.
            ticketSystem.assignTicketToGroup(incidentID, 'NDIT-Database')
        else:
            # Everything else goes to storage.
            ticketSystem.assignTicketToGroup(incidentID, 'NDIT-Computer Systems Storage')

        # Per-source details already start on their own line; a bare job-level error does not.
        if not errorMessage.startswith("\n"):
            errorMessage = "\n\t\t" + errorMessage
        parts.append("\n\t " + jobName + errorMessage)

    return "".join(parts)


def main():
    """Collect the latest protection runs, ticket the failures, and email the summary."""
    global debugMode

    args = GetArgs()
    debugMode = bool(args.debugMode)

    # Check for arguments and act accordingly. A missing cluster is treated like a help
    # request, since nothing below can work without it. Exit status 1 is kept for -h.
    if args.help or not args.cluster:
        PrintHelp()
        sys.exit(1)

    # Establish a connection to ServiceNow (dev instance in debug mode).
    if debugMode:
        ticketSystem = snow.SnowAPI("northdakotadev.service-now.com")
    else:
        ticketSystem = snow.SnowAPI("northdakota.service-now.com")

    # Establish a connection to Cohesity.
    cluster = cohesity.API(args.cluster)
    authToken = cluster.GetAuthToken()
    cluster.UpdateHeaders(authToken['accessToken'])

    # Start of yesterday's backup window as an epoch timestamp.
    # NOTE(review): presumably 17:00:00 yesterday, in microseconds -- confirm against cohesityAPI.
    prevWindow = cluster.GetRelativeTimestamp(-1, 17, 0, 0)
    if debugMode:
        print("\nPulling backup jobs from {}\n".format(prevWindow))

    # Pull a list of protection runs from yesterday's backup window.
    jobStatus = cluster.GetFilteredRequest(
        "/public/protectionRuns",
        "?startTimeUsecs=" + str(prevWindow) + "&numRuns=5000")

    runs = _SortRuns(jobStatus, debugMode)
    tickets = []

    # Assemble the email body section by section.
    sections = [
        _FormatSection("Running", runs['running']),
        "\n" + _FormatSection("Canceled", runs['cancel']),
        "\nFailures: " + str(len(runs['failures'])),
        _ProcessFailures(runs['failures'], ticketSystem, tickets),
        "\n" + _FormatSection("Warning", runs['warnings']),
        "\n" + _FormatSection("Hold", runs['holds']),
        "\n" + _FormatSection("Missed", runs['missed']),
        "\nTickets: " + str(len(tickets)) + "\n\t Instance: " + ticketSystem.snInstance,
    ]
    sections.extend("\n\t\t" + number for number in tickets)
    body = "".join(sections)

    # Send the email to the list of recipients in the local SendEmail function.
    SendEmail(body, args.cluster)
    dashboard.send_automation({'AutomationName': 'Infra-Cohesity', 'Action': 'Maintenance',
                               'Platform': 'Python-dailyErrors.py', 'Units': 60})


if __name__ == "__main__":
    main()