Please note that the CVS and issue trackers have moved to GitHub. These Trac pages are no longer kept up-to-date.

root/seattle/trunk/deploymentscripts/build_stats.py@5637

Revision 2520, 13.9 KB (checked in by konp, 10 years ago)

Committed the deployment scripts, just missing complete readme. Moved old files into /old before deletion.

Line 
1"""
2<Program Name>
3  build_stats.py
4
5<Started>
6  June 2009
7
8<Author>
9  n2k8000@u.washington.edu
10  Konstantin Pik
11
12<Purpose>
  The purpose of this file is to generate stats from the files that have been
  gathered from various nodes by the deploy_* scripts.  This file does fairly
  simple string parsing in order to estimate stats, and as of right now it
  is superseded by make_summary.py.  The only portion of this file that
  remains to be ported is the error-grabbing code.
18
19
20<Usage>
21
  python build_stats.py [timestamp]
23 
24  where [timestamp] is a unix time.  This'll tell the script which files to grab
25  as the fileformat is [fn].timestamp (see main()) for more info)
26 
27"""
28
29import subprocess
30import sys
31import os
32
33
34# The master controller log file
35controller_file = ''
36
37# The error file to write later
38err_file = ''
39
40# The summary file to write later
41summary_file = ''
42
43
44
45def helper_uniq(log_data, as_string = True):
46  """
47  <Purpose>
48    Helper that gets a bunch of lines from the log file, and then attemps to
49    count the unique hosts in those lines. Since failed hosts might have
50    several entries, we need to strip the date off first, and then count
51    the unique entries (done via set).
52   
53  <Arguments>
54    log_data:
55      Data from the logs - could be one line, or multiple lines.
56    as_string:
57      Default value is True.
58      True: return value as string
59      False: return value as integer
60   
61  <Exceptions>
62    None.
63
64  <Side Effects>
65    None.
66
67  <Returns>
68    String/Integer. Returns the # of uniq lines (with the date stripped).
69  """
70  log_data_nodate = strip_date(log_data)
71  # count the uniq lines in the output
72  log_data_set = set(log_data_nodate.splitlines())
73  if as_string:
74    return str(len(log_data_set)) 
75  else:
76    return len(log_data_set)
77
78   
79   
80def strip_date(log_data):
81  """
82  <Purpose>
83    Strips the date from the line so that we can count the number of 'uniq'
84    lines
85   
86    An input line would look something like this:
87    Jun 19 2009 02:18:53 | ERROR::  planet6.berkeley.intel-research.net: Trouble uploading deploy.tar
88   
89    the returned value would be:
90    ERROR::  planet6.berkeley.intel-research.net: Trouble uploading deploy.tar
91   
92  <Arguments>
93    log_data:
94      Data from the logs - could be one line, or multiple lines.
95   
96  <Exceptions>
97    None.
98
99  <Side Effects>
100    None.
101
102  <Returns>
103    Returns log_data without the date
104  """
105  # we'll build up this string here.
106  temp_to_return = []
107 
108  # for every line, find the | character and
109  for each_line in log_data.splitlines():
110    # get index of the | bar
111    bar_index = each_line.find('|')+1
112    # get everything with to the right of the bar
113    temp_to_return.append(each_line[bar_index:])
114  return "\n".join(temp_to_return)
115
116def shellexec(cmd_str):
117  """
118  <Purpose>
119    Uses subprocess to execute the command string in the shell.
120     
121  <Arguments>
122    cmd_str:  The string to be treated as a command (or set of commands,
123                separated by ;).
124   
125  <Exceptions>
126    None.
127
128  <Side Effects>
129    None.
130
131  <Returns>
132    A tuple containing (stdout, strerr, returncode)
133
134    Detailed:
135    stdout: stdout printed to console during command execution.
136    strerr: error (note: some programs print to strerr instead of stdout)
137    returncode: the return code of our call. If there are multiple commands,
138                then this is the return code of the last command executed.
139  """
140
141  # get a handle to the subprocess we're creating..
142  handle = subprocess.Popen(cmd_str, shell=True, 
143      stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
144
145  # execute and grab the stdout and err
146  stdoutdata, strerrdata = handle.communicate("")
147
148  # The return code... 
149  returncode = handle.returncode
150 
151  return stdoutdata, strerrdata, returncode
152 
153
154def get_uniq_machines():
155  # find out how many machines total we surveyed
156  # line looks like:
157  # returns an (int, HumanString)
158  # Jun 16 2009 01:56:07 | Setup:  Found 950 unique hosts to connect to.
159  global controller_file
160  out, err, retcode = shellexec("awk '/Found/ { print $8 } ' "+controller_file)
161  try:
162    out = out.strip('\n\r ')
163    print out
164    return (str(int(out)), 'There were '+out+' unique hosts surveyed\n\n')
165  except ValueError, ve:
166    print 'Unexpected number of uniq hosts returned from shell.'
167    print ve
168  except Exception, e:
169    print 'Error in get_uniq_machines()'
170    print e
171 
172
173def get_nodes_up():
174    # returns (nodes_up, HumanString)
175    # cheap way of seeing how many of the nodes our tests actually ran on..
176    # sum up the "versions", which is a unique line per host-log.  This can be slightly
177    # inaccurate
178  out, err, retcode = shellexec('grep ^version '+summary_file+\
179    ' | sort | uniq -c | awk \'{ print $1 }\'')
180  # each line starts with a number, so convert to int and give it a try
181  try:
182    # this is how many computers are 'up'
183    counter = 0
184    for line in out.splitlines():
185      counter += int(line)
186  except ValueError, e:
187    # ignore it, we don't really care
188    pass
189  except Exception, e:
190    print 'Error in get_nodes_up'
191    print e
192  finally:
193    return (counter, str(counter)+' hosts responded in a timely fashion '+\
194      'and ran our tests.\n\n')
195 
196 
197 
198def get_nodes_by_version():
199  # get the breakdown of hosts by version
200  out, err, retcode = shellexec('grep ^version '+summary_file+' | sort | uniq -c')
201  out_formatted = out.strip('\n\r ')
202 
203  # they keys are the versions, and they map to the number of that version currently running
204  node_dict = {}
205  return_string = "Breakdown of nodes by version:\n"+out+'\n'
206  # sample string that we're going to process looks like this:
207  # 2 version = "0.1e"
208  # so first, let's split by lines
209  out_array = out_formatted.splitlines()
210  for each_line in out_array:
211    if len(each_line.strip('\n\r ')) > 0:
212      # not a blank line, so split by spaces and then we care about the first
213      # and the last token (except we'll strip the double quotes off the last
214      # token so that we have just the version number
215      number_running = each_line.split(' ')[0]
216     
217      # this gives us ".01e", so let's now strip the quotes
218      node_version = each_line.split(' ')[-1]
219      node_version = each_line.strip('"')
220      try:
221        node_dict[node_version] = int(number_running)
222      except ValueError, ve:
223        print 'Invalid number of nodes, (NaN)'
224        print ve
225      except Exception, e:
226        print 'Error in get_nodes_by_version()'
227        print e
228  return (out_array, return_string)
229
230
231 
232 
233def get_unknown_files():
234  out, err, retcode = shellexec("grep Unknown file "+summary_file+" | sort | uniq -c")   
235  return_string = 'The following files are unrecognized and reside in the'+\
236    ' seattle folder\n'
237  # list of the files
238  file_list = []
239  if out.strip('\n\r '):
240    return_string += out
241    file_list = out.splitlines()
242  else:
243    return_string += '\tNone\n\n'
244  return (file_list, return_string)
245
246 
247 
248def get_num_python_errors():
249  # check to see if there were any python errors
250  out, err, retcode = shellexec("grep Traceback "+summary_file)
251  return_string = "Number of python errors (see summary.log for "+\
252    "additional details): "+str(len(out.splitlines()))+'\n'
253  return (len(out.splitlines()), return_string)
254
255 
256 
257def get_verifyprocess_summary():
258  # get all the testprocess info and dump that as well
259  out, err, retcode = shellexec("grep ^[[][^rI] "+summary_file+" | sort | uniq -c")
260  return_string = 'The following information is reported by verifyprocess.py:\n'
261  return_string += out+'\n'
262  return return_string
263
264 
265 
266def get_all_warnings():
267  # now lets check for warnings from nodemanager and software manager logs
268  out, err, retcode = shellexec("grep [[]WARN[]] "+summary_file+" | sort")
269  return_string = 'Warnings gathered from SU and NM logs:\n'
270  if out.strip('\n\r '):
271    return_string += out
272  else:
273    return_string += '\tNone\n'
274  return_string += '\n'
275  return return_string
276 
277 
278 
279def get_all_errors():
280  out, err, retcode = shellexec("grep '[[]ERROR[]]' "+summary_file+" | sort")
281  return_string = 'Errors gathered from SU and NM logs:\n'
282  if out.strip('\n\r '):
283    return_string += out
284  else:
285    return_string += '\tNone\n'
286  return_string += '\n'
287  return return_string
288 
289 
290 
291def main():
292  # sys.argv[1] holds the timestamp that we're intersted in.
293 
294  # by default it looks only ./detailed_logs directory, but otherwise it has
295  # special behavior and reprocesses old logs.
296 
297  global controller_file, err_file, summary_file
298  try:
299    timestamp = sys.argv[1]
300    if not os.path.isdir('./detailed_logs'):
301      print 'No files to look at! ./detailed_logs directory does not exist!'
302      return
303    file_handle = file('./detailed_logs/stats.'+timestamp, 'w')
304  except Exception, e:
305    print e
306    return
307  else:
308    # sys.argv[1] holds the timestamp
309    # the files we look at are
310    # ./detailed_logs/controller.[timestamp]
311    # ./detailed_logs/detailed.htmlsummary.[timestamp]
312    # ./deploy.err.[timestamp]
313   
314    # got a summary file handle. lets now dump data there!
315    controller_file = './detailed_logs/controller.'+timestamp
316    err_file = './detailed_logs/deploy.err.'+timestamp
317    summary_file = './detailed_logs/detailed.htmlsummary.'+timestamp
318   
319    # get the uniq machines list
320    num_uniq_machines, human_string =  get_uniq_machines()
321    file_handle.write(human_string)
322
323    # get the number of nodes up
324    num_nodes, human_string = get_nodes_up()
325    file_handle.write(human_string)
326   
327    # get the version breakdown of the nodes
328    version_dict, human_string = get_nodes_by_version()
329    file_handle.write(human_string)
330    file_handle.write(str(version_dict))
331   
332    # get a list of unknown files that reside in the directories
333    file_list, human_string = get_unknown_files()
334    file_handle.write(human_string)
335   
336    # get the number of python errors
337    number_of_errors, human_string = get_num_python_errors()
338    file_handle.write(human_string)
339
340    # get the summary of the verifyprocess script
341    summary_string = get_verifyprocess_summary()
342    file_handle.write(summary_string)
343   
344    # get all the warnings from the v2 logs
345    warning_strings = get_all_warnings()
346    file_handle.write(warning_strings)
347
348    # get all the errors from the v2 logs
349    error_strings = get_all_errors()
350    file_handle.write(error_strings)
351
352   
353    # find out all the machines that denied us entry and why
354    # divided by some as that failure would cause the host to be put on the
355    # 'retry' list of failed hosts, so we don't want to list duplicate hosts
356    # how many rejected our key?
357    out, err, retcode = shellexec("grep \"[^ ]Permission denied (\" "+err_file)
358    out = out.strip('\n\r ')
359    if out:
360      uniq_hostnames = helper_uniq(out)
361      file_handle.write('\nThere were '+uniq_hostnames+\
362        ' machines who rejected our key')
363    else:
364      file_handle.write('\nAll computers accepted our key')
365    file_handle.write('\n')
366   
367    # how many gave us a connection refused?
368    out, err, retcode = shellexec("grep \"Connection refused\" "+err_file)
369    out = out.strip('\n\r ')
370    if out:
371      uniq_hostnames = helper_uniq(out)
372      file_handle.write('\nThere were '+uniq_hostnames+\
373        ' machines who refused connection')
374    else:
375      file_handle.write('\nNo nodes gave us a connection refused error')
376    file_handle.write('\n')
377
378    # Connection timed out?
379    out, err, retcode = shellexec("grep \"Connection timed out\" "+err_file)
380    out = out.strip('\n\r ')
381    if out:
382      uniq_hostnames = helper_uniq(out)
383      file_handle.write('\nThere were '+uniq_hostnames+\
384        ' machines on which the connection timed out')
385    else:
386      file_handle.write('\nNo connections timed out.')
387    file_handle.write('\n')
388
389    # Name resolution error (temporary)?
390    out, err, retcode = shellexec("grep \"Temporary failure in name resolution\" "+err_file)
391    out = out.strip('\n\r ')
392    if out:
393      uniq_hostnames = helper_uniq(out)
394      file_handle.write('\nThere were '+uniq_hostnames+\
395        ' machines on which there was Temporary failure in name resolution')
396    else:
397      file_handle.write('\nNo temporary failures in name resolution.')
398    file_handle.write('\n')
399
400    # hostname could not be resolved?
401    out, err, retcode = shellexec("grep \"Could not resolve hostname\" "+err_file)
402    out = out.strip('\n\r ')
403    if out:
404      file_handle.write('\nThere were '+uniq_hostnames+\
405        ' machines on which the hostname could not be resolved')
406    else:
407      file_handle.write('\nNo name resolution problems.')
408    file_handle.write('\n')
409   
410    # counts the "WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED"
411    out, err, retcode = shellexec("grep \"REMOTE HOST IDENTIFICATION HAS CHANGED\" "+err_file)
412    out = out.strip('\n\r ')
413    if out:
414      uniq_hostnames = helper_uniq(out)
415      file_handle.write('\nThere were '+uniq_hostnames+\
416        ' machines that changed their key and ssh complained')
417    else:
418      pass
419    file_handle.write('\n')   
420   
421    # this line makes a list of all of the machines that don't have seattle installed
422    # sample line to be processed will look like this:
423    # Jun 16 2009 09:16:18 | planetlab2.s3.kth.se: Error: Did not find any seattle installs on planetlab2.s3.kth.se. Aborting.(logdir: )
424    out, err, retcode = shellexec("awk '/Did not/ { sub(\":\",\"\", $6); print $6 } ' "+summary_file)
425    # each line has an IP, so lets count
426    no_seattle = out.splitlines()
427    file_handle.write('The following machines ('+str(len(no_seattle))+\
428      ') do not have seattle installed:\n')
429    if out.strip('\n\r '):
430      file_handle.write('(also written to missing.list)\n')
431      file_handle.write(out)
432      try:
433        missing_handle = file('missing.list.'+timestamp, 'w')
434        missing_handle.write(out)
435        missing_handle.close()
436      except Exception, e:
437        print "Trouble writing missing file's list"
438    else:
439      file_handle.write('\tNone\n')
440    file_handle.write('\n')
441   
442    file_handle.write('End of log\n')
443    file_handle.close()
444    out, err, retcode = shellexec('cat ./detailed_logs/stats.'+timestamp)
445    print out
446   
447if __name__ == "__main__":
448  main()
Note: See TracBrowser for help on using the browser.