Please note that the CVS and issue trackers have moved to GitHub. These Trac pages are no longer kept up-to-date.

root/seattle/trunk/deploymentscripts/testprocess.py@5637

Revision 2635, 13.9 KB (checked in by justinc, 10 years ago)

Improved the output of testprocess

Line 
1"""
2Author: Cosmin Barsan
3The rawsay function and its dependencies have been written by Justin Cappos,
4but the functions nmclient_rawcommunicate and nmclient_rawsay have been slightly modified to take ip and port arguments
5instead of the original handle argument. This was done to keep the code simple and the file short.
6The full functionality of signed requests is not needed for this test, since we are only using rawsay.
7
8Motivation:
9The purpose of this script is to be used as a component in an automated system that detects and fixes problems with the repy installation on linux machines. The purpose of this script is to detect problems with the node manager and software updater processes and report them.
10
11Description:
12This script verifies whether the node manager process is running and verifies whether the software updater is running.
13This script also verifies whether the node manager is running on port 1224 by sending a GetVessels request and waiting for a reply.
14This script will only work on linux.
15It is necessary that this script be run from a directory that contains the repyportability module and its dependencies.
16
17expected output:
18ProcessCheckerFinished
19
20If the software updater is not running the following is printed:
21'[SoftwareUpdater] Software Updater is not running.'
22
23If there are two or more Software Updater Processes running, the following is printed:
24'[SoftwareUpdater] Multiple instances of Software Updater are running.'
25
26If exactly one software updater process is running, but its memory usage is severely higher than expected, the following is printed:
27[SoftwareUpdater] Software Updater memory usage is unusually high.
28
29If exactly one software updater process is running, but its memory usage is severely lower than expected, the following is printed:
30[SoftwareUpdater] Software Updater memory usage is unusually low.
31
32If exactly one software updater process is running, but it is Stopped (T), Dead (X), or Defunct (Z), the following is printed:
33[SoftwareUpdater] Software Updater process is Stopped, Dead, or Defunct.
34
35If exactly one software updater process is running, but its cpu usage exceeds 50% the following is printed:
36[SoftwareUpdater] Software Updater process has cpu usage above 50%.
37
38If the node manager is not running, the following is printed:
39'[NodeManager] Node Manager is not running.'
40
41If there are two or more Node Manager Processes running, the following is printed:
42'[NodeManager] Multiple instances of Node Manager are running.'
43
44If exactly one node manager process is running, but its memory usage is severely higher than expected, the following is printed:
45[NodeManager] Node Manager memory usage is unusually high.
46
47If exactly one node manager process is running, but its memory usage is severely lower than expected, the following is printed:
48[NodeManager] Node Manager memory usage is unusually low.
49
50If exactly one node manager process is running, but it is Stopped (T), Dead (X), or Defunct (Z), the following is printed:
51[NodeManager] Node Manager process is Stopped, Dead, or Defunct.
52
53If exactly one node manager process is running, but its cpu usage exceeds 50% the following is printed:
54[NodeManager] Node Manager process has cpu usage above 50%.
55
56If the node manager is running, but the GetVessels request sent to it on port 1224 fails, the following is printed:
57[NodeManager] When contacting the Node Manager, received error '<error text>'
58
59Regardless of whether any errors are detected, ProcessCheckerFinished will be printed upon script termination.
60"""
61
62from repyportability import *
63import subprocess #subprocess is used to check running processes
64
class NMClientException(Exception):
  """Raised when communicating with a node manager fails."""
68
class SessionEOF(Exception):
  """Raised by session_recvmessage when the connection has been closed."""
71
# Maximum number of one-character reads spent looking for the newline that
# terminates the message-size header.
sessionmaxdigits = 20

def session_recvmessage(socketobj):
  """
  Read one length-prefixed message from socketobj and return its payload.

  The message starts with the payload length written in decimal and
  terminated by a newline; the payload follows.  A length of -1 means the
  other side has ended the session.

  Arguments:
    socketobj: an object with a recv(n) method that returns a string of at
               most n characters ('' when nothing more can be read).

  Returns:
    The payload string ('' for a zero-length message).

  Raises:
    ValueError("Bad message size"): the header is empty, is not a decimal
        integer, is a negative number other than -1, is not terminated
        within sessionmaxdigits characters, or the data ends before the
        header is complete.
    SessionEOF("Connection Closed"): the header is -1, or recv returns ''
        before the whole payload has arrived.
  """

  messagesizestring = ''
  # first, read the number of characters...
  for junkcount in range(sessionmaxdigits):
    currentbyte = socketobj.recv(1)

    if currentbyte == '\n':
      break

    # Only digits are allowed, plus a single leading '-' (needed for the -1
    # end-of-session marker).  An empty read is rejected here as well, so a
    # closed connection does not spin through the remaining iterations.
    is_digit = currentbyte != '' and currentbyte in '0123456789'
    is_leading_minus = currentbyte == '-' and messagesizestring == ''
    if not (is_digit or is_leading_minus):
      raise ValueError("Bad message size")

    messagesizestring = messagesizestring + currentbyte

  else:
    # too large
    raise ValueError("Bad message size")

  # An empty header or a lone '-' passes the per-character check but is not
  # a number; report it with the same message as every other bad header.
  try:
    messagesize = int(messagesizestring)
  except ValueError:
    raise ValueError("Bad message size")

  # nothing to read...
  if messagesize == 0:
    return ''

  # end of messages
  if messagesize == -1:
    raise SessionEOF("Connection Closed")

  if messagesize < 0:
    raise ValueError("Bad message size")

  # recv may return fewer characters than requested, so keep reading until
  # the whole payload has arrived.
  data = ''
  while len(data) < messagesize:
    chunk = socketobj.recv(messagesize - len(data))
    if chunk == '':
      raise SessionEOF("Connection Closed")
    data = data + chunk

  return data
115 
# a private helper function
def session_sendhelper(socketobj,data):
  """
  Send all of data through socketobj.

  socketobj.send may accept only part of what it is given, so the unsent
  tail is offered again until every character has been sent (sendall is not
  used because repy does not support it).
  """
  total = len(data)
  offset = 0
  while offset < total:
    offset = offset + socketobj.send(data[offset:])
124   
# send the message
def session_sendmessage(socketobj,data):
  """
  Send data through socketobj as one length-prefixed message: first the
  length of data in decimal followed by a newline, then data itself.
  """
  for part in (str(len(data)) + '\n', data):
    session_sendhelper(socketobj, part)
131 
132
133# Sends data to a node (opens the connection, writes the
134# communication header, sends all the data, receives the result, and returns
135# the result)...
136def nmclient_rawcommunicate(nmip, nmport, *args):
137
138  try:
139    thisconnobject = openconn(nmip, nmport) 
140  except Exception, e:
141    raise NMClientException, str(e)
142
143  # always close the connobject
144  try:
145
146    # send the args separated by '|' chars (as is expected by the node manager)
147    session_sendmessage(thisconnobject, '|'.join(args))
148    return session_recvmessage(thisconnobject)
149  except Exception, e:
150    raise NMClientException, str(e)
151  finally:
152    thisconnobject.close()
153
154
# Public:  Use this for non-signed operations...
def nmclient_rawsay(nmip, nmport, *args):
  """
  Send an unsigned request to the node manager at (nmip, nmport) and return
  the body of its reply.

  The reply is split at its last newline into a response body and a status
  word.

  Arguments:
    nmip, nmport: address of the node manager.
    *args: the request fields (strings); they are forwarded unchanged to
           nmclient_rawcommunicate.

  Returns:
    The response body when the status is 'Success'.

  Raises:
    NMClientException: the exchange failed, the reply contains no newline
        ("Communication error"), or the status is 'Error', 'Warning', or
        anything other than 'Success'.
  """
  fullresponse = nmclient_rawcommunicate(nmip, nmport, *args)

  # When the reply contains no newline, rsplit returns a single item and the
  # two-name unpacking raises ValueError (not KeyError).
  try:
    (response, status) = fullresponse.rsplit('\n', 1)
  except ValueError:
    raise NMClientException("Communication error '" + fullresponse + "'")

  if status == 'Success':
    return response
  elif status == 'Error':
    raise NMClientException("Node Manager error '" + response + "'")
  elif status == 'Warning':
    raise NMClientException("Node Manager warning '" + response + "'")
  else:
    raise NMClientException("Unknown status '" + fullresponse + "'")
172
173
174####SOFTWARE UPDATER TESTS####
175
176#check if there is a software updater running
177ps = subprocess.Popen('ps -ef | grep "python softwareupdater.py" | grep -v grep', shell=True, stdout=subprocess.PIPE) 
178updater_out = ps.stdout.read()
179
180if(updater_out == ""):
181  print "[SoftwareUpdater] Software Updater is not running."
182
183
184
185#check if there are multiple software updater processes running
186updater_num = len(updater_out.splitlines())
187if (updater_num >1):
188  print "[SoftwareUpdater] Multiple instances of Software Updater are running."
189
190
191
192#check the memory usage of the software updater process. We only do this if there is a single instance of
193#software updater running.
194if (updater_num == 1):
195  #first get the process id from the output
196  updater_pid = (updater_out.split())[1]
197 
198  #get the memory usage for the process
199  for count in range(1,10):
200    # JAC: I've changed this from 'size' to 'rss' because this is more accurate
201    # for systems that don't have memory paged out (see: #468)
202    ps = subprocess.Popen('ps o pid,rss ' + str(updater_pid) + ' | grep -v PID', shell=True, stdout=subprocess.PIPE)
203    rawstring = ps.stdout.read()
204    #make sure we have at least two lines of output
205    if len(rawstring)>=2:
206      break
207     
208  if len(rawstring)<2:
209    raise Exception, "unexpected output from ps:" + str(rawstring)
210 
211  updater_mem = (rawstring.split())[1]
212
213  #check if the memory usage (in KB) is too large or to small, typical usage is about 4500KB
214  if (int(updater_mem) > 9000):
215    print "[SoftwareUpdater] Software Updater memory usage is unusually high."
216    print "[SoftwareUpdater] Software Updater memory usage is ("+str(updater_mem)+")"
217  elif (int(updater_mem) < 2000):
218    print "[SoftwareUpdater] Software Updater memory usage is unusually low."
219
220
221
222#check the state of the software updater process. We only do this if there is a single instance of
223#software updater running.
224if (updater_num == 1):
225  #first get the process id from the output
226  updater_pid = (updater_out.split())[1]
227 
228  #get the state code for the process
229  for count in range(1,10):
230    ps = subprocess.Popen('ps o pid,stat ' + str(updater_pid) + ' | grep -v PID', shell=True, stdout=subprocess.PIPE)
231    rawstring = ps.stdout.read()
232 
233    #make sure we have at least two lines of output
234    if len(rawstring)>=2:
235      break
236     
237  if len(rawstring)<2:
238    raise Exception, "unexpected output from ps:" + str(rawstring)
239   
240  rawcode = (rawstring.split())[1]
241
242  #we only care about the first character in the status code
243  updater_stat = rawcode[0]
244
245  #check if the state is Stopped (T), Dead (X), or Defunct (Z)
246  # CNB: I changed the third test from 'updater_stat == "T"' -
247  # I'm assuming this was a typo
248  if (updater_stat == "T" or updater_stat == "X" or updater_stat == "Z"):
249    print "[SoftwareUpdater] Software Updater process is Stopped, Dead, or Defunct."
250
251
252
253#check the cpu ussage of the software updater. We only do this if there is a single instance of
254#software updater running.
255if (updater_num == 1):
256  #first get the process id from the output
257  updater_pid = (updater_out.split())[1]
258 
259  #get the cpu usage for the process
260  for count in range(1,10):
261 
262    ps = subprocess.Popen('ps o pid,cp ' + str(updater_pid) + ' | grep -v PID', shell=True, stdout=subprocess.PIPE)
263    rawstring = ps.stdout.read()
264   
265    #make sure we have at least two lines of output
266    if len(rawstring)>=2:
267      break
268     
269  if len(rawstring)<2:
270    raise Exception, "unexpected output from ps:" + str(rawstring)
271   
272  updater_cpu = (rawstring.split())[1]
273
274  #check if the cpu usage exceeds 50% (if cp value is above 500)
275  if (int(updater_cpu) > 500):
276    print "[SoftwareUpdater] Software Updater process has cpu usage above 50%."
277
278
279
280####NODE MANAGER TESTS####
281
282#check if there is an instance of the node manager running
283ps = subprocess.Popen('ps -ef | grep "python nmmain.py" | grep -v grep', shell=True, stdout=subprocess.PIPE) 
284nm_out = ps.stdout.read()
285
286if(nm_out == ""):
287  print "[NodeManager] Node Manager is not running."
288
289
290
291#check if there are multiple node manager processes running
292nm_num = len(nm_out.splitlines())
293if (nm_num >1):
294  print "[NodeManager] Multiple instances of Node Manager are running."
295
296
297
298#check the memory usage of the software updater process. We only do this if there is a single instance of
299#software updater running.
300if (nm_num == 1):
301  #first get the process id from the output
302  nm_pid = (nm_out.split())[1]
303 
304  #get the memory usage for the process
305  for count in range(1,10):
306    ps = subprocess.Popen('ps o pid,size ' + str(nm_pid) + ' | grep -v PID', shell=True, stdout=subprocess.PIPE)
307    rawstring = ps.stdout.read()
308   
309    #make sure we have at least two lines of output
310    if len(rawstring)>=2:
311      break
312     
313  if len(rawstring)<2:
314    raise Exception, "unexpected output from ps:" + str(rawstring)
315   
316  nm_mem = (rawstring.split())[1]
317
318  #check if the memory usage (in KB) is too large or to small, typical usage is about 70000KB
319  if (int(nm_mem) > 130000):
320    print "[NodeManager] Node Manager memory usage is unusually high."
321  elif (int(nm_mem) < 20000):
322    print "[NodeManager] Node Manager memory usage is unusually low."
323
324
325
326#check the state of the node manager process. We only do this if there is a single instance of
327#node manager running.
328if (nm_num == 1):
329  #first get the process id from the output
330  nm_pid = (nm_out.split())[1]
331 
332  #get the state code for the process
333  for count in range(1,10):
334    ps = subprocess.Popen('ps o pid,stat ' + str(nm_pid) + ' | grep -v PID', shell=True, stdout=subprocess.PIPE)
335    rawstring = ps.stdout.read()
336   
337    #make sure we have at least two lines of output
338    if len(rawstring)>=2:
339      break
340     
341  if len(rawstring)<2:
342    raise Exception, "unexpected output from ps:" + str(rawstring)
343   
344  rawcode = (rawstring.split())[1]
345
346  #we only care about the first character in the status code
347  nm_stat = rawcode[0]
348
349  #check if the state is Stopped (T), Dead (X), or Defunct (Z)
350  # CNB: See my earlier edit, I think the original double
351  # 'nm_stat == "T"' was a typo.
352  if (nm_stat == "T" or nm_stat == "X" or nm_stat == "Z"):
353    print "[NodeManager] Node Manager process is Stopped, Dead, or Defunct."
354
355
356
357#check the cpu ussage of the node manager. We only do this if there is a single instance of
358#node manager running.
359if (nm_num == 1):
360  #first get the process id from the output
361  nm_pid = (nm_out.split())[1]
362 
363  #get the cpu usage for the process
364  for count in range(1,10):
365    ps = subprocess.Popen('ps o pid,cp ' + str(nm_pid) + ' | grep -v PID', shell=True, stdout=subprocess.PIPE)
366    rawstring = ps.stdout.read()
367
368    #make sure we have at least two lines of output
369    if len(rawstring)>=2:
370      break
371     
372  if len(rawstring)<2:
373    raise Exception, "unexpected output from ps:" + str(rawstring)
374   
375  nm_cpu = (rawstring.split())[1]
376
377  #check if the cpu usage exceeds 50% (if cp value is above 500)
378  if (int(nm_cpu) > 500):
379    print "[NodeManager] Node Manager process has cpu usage above 50%."
380
381
382
383#this is the section where we send a request to the node manager on port 1224 and see it if responds
384#we only want to do this in the case there is at least one instance of the node manager running
385if(nm_out != ""):
386
387  #try to send the request and log if there is a failure
388  try:
389    nmclient_rawsay(getmyip(), 1224, "GetVessels")
390   
391  except NMClientException, e:
392    #in the event of an exception, log the problem"
393    print "[NodeManager] When contacting the Node Manager, received error '"+str(e)+"'"
394   
395#print ProcessCheckerFinished regardless of whether there were any failures.
396print "ProcessCheckerFinished"
Note: See TracBrowser for help on using the browser.