#!/bin/env python #Parses MySQL general query logs #Written by Brandon Whaley (redkrieg@gmail.com, http://redkrieg.com) #Current version, 20110517230708 #Changelog: #20110510010305 Added support for user specified regex matching #20110512160308 Added column based output #20110515005547 Implemented smart detection of query_id re-use and handle appropriately #20110517230708 Changed sort to be a total of all queries in all columns #20110625143105 Added per-user query output #20110625151042 Changed to use #!/bin/env python #20110625153237 Suppressed error when printing per-user data to a broken pipe ( - | head -1) import sys import re import datetime from optparse import OptionParser parser = OptionParser(usage="usage: %prog [options] [filename]\n\n[filename] is optional and defaults to stdin.") parser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=True, help="Print info on stderr (default: %default)") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", help="Suppress stderr output") parser.add_option("-o", "--output", metavar="FILE", default=None, help="Write output to FILE (default: stdout)") parser.add_option("-r", "--regex", metavar="REGEX", default=None, help="Tally arbitrary REGEX string (slow)") parser.add_option("-u", "--user", metavar="USER", default=None, help="Output USER's queries instead of tallys") (options, args) = parser.parse_args() class MySQLUser: """Holds a user name and tracks numbers of queries""" def __init__(self, userName): self.user = userName self.numSelect = 0 self.numInsert = 0 self.numUpdate = 0 self.numRegex = 0 def addSelect(self): self.numSelect += 1 def addInsert(self): self.numInsert += 1 def addUpdate(self): self.numUpdate += 1 def addRegex(self): self.numRegex += 1 #functions to sort a user entry list (list of tuples of form (user_name string, MySQLUser class)) def sort_select(user_entry_list): #sort by selects return sorted(user_entry_list, key=lambda user_entry: user_entry[1].numSelect) #uses a lambda function to make the sort key an entity within the MySQLUser class def sort_insert(user_entry_list): return sorted(user_entry_list, key=lambda user_entry: user_entry[1].numInsert) def sort_update(user_entry_list): return sorted(user_entry_list, key=lambda user_entry: user_entry[1].numUpdate) def sort_regex(user_entry_list): return sorted(user_entry_list, key=lambda user_entry: user_entry[1].numRegex) def sort_total(user_entry_list): return sorted(user_entry_list, key=lambda user_entry: user_entry[1].numSelect + user_entry[1].numInsert + user_entry[1].numUpdate + user_entry[1].numRegex) #user_entry_list.sort(key=lambda user_entry: user_entry[1].numSelect + user_entry[1].numInsert + user_entry[1].numUpdate + user_entry[1].numRegex) #return user_entry_list def MySqlTimestampToDateTime(timestamp): #convert mysql timestamp to datetime object timelist = map(int, re.findall("[0-9]{1,2}", timestamp)) #get every instance of one to two digits (preferring two) together (non-overlapping) and make them a list, then turn each entity in the list to integers timelist[0] += 2000 #this will break when we have to deal with the y2100 bug. I'm lookin' at YOU youngster!!! return apply(datetime.datetime, timelist) #take the values in timelist and make them arguments to the datetime constructor, then return the datetime object. Pwnage def getTotalSeconds(time_delta): #in python 2.7 I could do this much more easily with this: # return time_delta.total_seconds() #but alas, backward compatability :( return (time_delta.microseconds + (time_delta.seconds + time_delta.days * 24 * 3600) * 10**6) / 10**6 #made this a function try: queryLog = open(args[0], "r") #try to open the specified log file except: #if the log file can't be opened, just open stdin instead if options.verbose: print >> sys.stderr, "MySQL general query log parser reading from stdin/pipe..." queryLog = sys.stdin if options.output: #if we've specified an output file outFile = open(options.output, "w") #write output there else: outFile = sys.stdout #or to stdout #read first line line = queryLog.readline() #init user and id dictionaries user_table = { } id_table = { } query_id = "0" user_name = "NO_SUCH_USER" #Search entry v2, group(1)=(None|Timestamp), group(2)=(ConnectionID), group(3)=(Connect|Query), group(4)=(UserName|QueryType) search_re = re.compile("([0-9]{6}[\s]+[0-9:]+)*[\s]+([0-9]+)\s(Connect|Query|Init DB|Prepare|Execute)[\s]+([a-zA-Z0-9]+)") #Custom regex support if options.regex: #try to import the user's regex. If it doesn't compile, it's bad and we disable the feature, notifying the user try: user_re = re.compile(options.regex) except: print >> sys.stderr, "Regex compilation failed. Skipping regex." options.regex = None #init time tracking first_date = None last_date = None #query_types = {'select': user_table[user_name].addSelect, 'update': user_table[user_name].addUpdate, 'insert': user_table[user_name].addInsert, '/regex/':} total_time = datetime.timedelta() #main parser loop while line: match = search_re.match(line) #match beginning of line if options.regex: user_match = user_re.search(line) #matches anywhere in the line, slow. else: user_match = None if match or user_match: #if we've got a useful entry if match and match.group(1): #and it's got a date group if first_date is None: #and we've never set a date before first_date = match.group(1) #set our first date first_date_parsed = MySqlTimestampToDateTime(first_date) #parse it time_delta = datetime.datetime.now() - first_date_parsed #see how long ago it was total_seconds = (time_delta.microseconds + (time_delta.seconds + time_delta.days * 24 * 3600) * 10**6) / 10**6 if options.verbose: #if we're in verbose mode, tell us how long ago it was print >> sys.stderr, "First timestamp at", first_date, "(", total_seconds / 3600, "hours,", total_seconds / 60 % 60 , "minutes,", total_seconds % 60, "seconds ago)" last_date = match.group(1) #set our last date if match and match.group(3) == "Connect": #if it's a connection query_id = match.group(2) #get the query id if query_id in id_table: #We have hit a SERIOUS problem. This likely means that mysql restarted. We're dumping the time and query_id lookup tables. total_time += MySqlTimestampToDateTime(last_date) - first_date_parsed id_table = { } #don't have to do the user table because that data in theory is still good (qps = total queries / total time) first_date = None last_date = None if not query_id in id_table: #if it's new (should be, but safety first) user_name = match.group(4) #set user_name id_table[query_id] = user_name #create the entry with user name as the value and the id as the index if not user_name in user_table: #if the user name is new (could be, could already exist user_table[user_name] = MySQLUser(user_name) #create a new counter class for it using the user name as the lookup key elif match and match.group(3) == "Query": #if this is a query query_id = match.group(2) #get the id try: user_name = id_table[query_id] #get the user name from our lookup table (the user who started it) except KeyError: user_name = "NO_SUCH_USER" if not user_name in user_table: user_table[user_name] = MySQLUser(user_name) query_type = match.group(4).lower() #get the type of query (select, insert, update, etc.) if query_type == "select": #correction from previous comment. this is not stupid. user_table[user_name].addSelect() elif query_type == "update": user_table[user_name].addUpdate() elif query_type == "insert": user_table[user_name].addInsert() elif match: #must be init db, prepare, or execute query_id = match.group(2) #get the id try: user_name = id_table[query_id] #get the user name from our lookup table (the user who started it) except KeyError: user_name = "NO_SUCH_USER" if not user_name in user_table: user_table[user_name] = MySQLUser(user_name) #print >> sys.stderr, "Init DB, Prepare, or Execute for ID", query_id, "user", user_name #if the user match is why we're here if user_match: try: query_id = match.group(2) #dirty trick. Try to get the ID, but what if the match wasn't a query and didn't match our regex? except: pass #we can re-use the last query_id, which hasn't been unset since the last matching Query! That makes the user_name likely to be the same as well, so we reuse it try: user_name = id_table[query_id] #get the user name from our lookup table (the user who started it) except KeyError: user_name = "NO_SUCH_USER" if not user_name in user_table: user_table[user_name] = MySQLUser(user_name) user_table[user_name].addRegex() if options.user and user_name == options.user: try: print >> outFile, line, except: sys.exit(0) line=queryLog.readline() #load the next line, if there is none (EOF) this returns None, which evals to false queryLog.close() #no longer require input if first_date and not options.user: #if we caught any timestamps at all and we're not in per-user mode #de-retarded this one. see function def above last_date_parsed = MySqlTimestampToDateTime(last_date) #how many seconds from beginning to end? total_time += last_date_parsed - first_date_parsed total_seconds = getTotalSeconds(total_time) #turns our dict into a list so it can be sorted by parameters in the values (in this case, selects) #see function above user_entry_list = sort_total(user_table.items()) if options.verbose: print >> sys.stderr, "UserName\tNumSel\tSelQps\tNumUpd\tUpdQps\tNumIns\tInsQps", if options.regex: print >> sys.stderr, "\tNumReg\tRegQps", print >> sys.stderr for user_entry in user_entry_list: #for all user entries user_name = user_entry[0] print >> outFile, user_name, #print user name if len(user_name) < 7: print >>outFile, "\t", print >> outFile, "\t%d\t%dqps" % (user_table[user_name].numSelect, user_table[user_name].numSelect / total_seconds), #print num queries and calculated qps for select print >> outFile, "\t%d\t%dqps" % (user_table[user_name].numUpdate, user_table[user_name].numUpdate / total_seconds), #update print >> outFile, "\t%d\t%dqps" % (user_table[user_name].numInsert, user_table[user_name].numInsert / total_seconds), #and inserts if options.regex: print >> outFile, "\t%d\t%dqps" % (user_table[user_name].numRegex, user_table[user_name].numRegex / total_seconds), print >> outFile elif not first_date: #no timestamps, must not be enough data or isn't a genqlog print >> sys.stderr, "There was not enough data to parse, please try a longer log file."