Please note that the CVS and issue trackers have moved to GitHub. These Trac pages are no longer kept up-to-date.

root/seattle/trunk/deploymentscripts/deploy_network.py@5637

Revision 2810, 17.6 KB (checked in by konp, 10 years ago)

Fixed high cpu usage, added summary tool, a number of other fixes and improvements as well. Note: blackbox node keys have not been added yet.

Line 
1"""
2<Program Name>
3  deploy_network.py
4
5<Started>
6  May 2009
7
8<Author>
9  n2k8000@u.washington.edu
10  Konstantin Pik
11
12<Purpose>
13  This file contains network-related functions that are used by the
14  deployment script.  This file is not to be executed by itself, but is to
15  be used with deploy_*.
16
17<Usage>
18  See deploy_main.py.
19 
20"""
21import os
22import subprocess
23import thread
24import time
25
26import deploy_main
27import deploy_logging
28import deploy_threading
29import deploy_helper
30
31
# Default value for the ssh/scp "ConnectTimeout" option, in seconds.  It is
# kept as a string because it is concatenated into the ssh/scp command lines;
# the functions below convert it with int() before handing it to
# deploy_threading.monitor_timeout.
# NOTE(review): the original comment said this "will increase by 25% after
# each failed run"; nothing in this file adjusts it -- confirm whether that
# happens elsewhere.
default_connection_timeout = "60"
35
36
def remote_runscript(user, remote_host, custom_script_name = '', run_custom_only = False, tar_filename = 'deploy.tar'):
  """
  <Purpose>
    Extracts runlocaltests.py from tar_filename on the remote machine, runs
    it there through remote_shellexec, writes the result to the log and then
    fetches the remote logs with remote_get_log.

  <Arguments>
    user:
      the user to log in as on the remote machine.
    remote_host:
      the remote machine's IP/name.  It is also passed to runlocaltests.py as
      its first argument.
    custom_script_name:
      the name(s), as one string, of custom scripts to run on the remote
      machine.  Appended verbatim to the runlocaltests.py command line.
    run_custom_only:
      Should we only run the custom script?  When true, a trailing ' 1' is
      appended to the runlocaltests.py command line (see runlocaltests.py
      for the meaning of its arguments).
    tar_filename:
      Optional. Default is deploy.tar.  The tar file (already present on the
      remote host) that runlocaltests.py is extracted from.

  <Exceptions>
    None raised here directly.

  <Side Effects>
    Blocks current thread until the remote script is done executing.

  <Returns>
    No returns.
  """

  # first pull the remote setup script out of the tarball ...
  extract_cmd = 'tar -xf ' + tar_filename + ' runlocaltests.py'

  # ... then run it.  IMPORTANT - SEE runlocaltests.py for args.  The
  # verbosity of this deployment run is forwarded with -v.
  run_cmd = 'python runlocaltests.py ' + remote_host + ' -v ' + \
      str(deploy_main.verbosity) + ' ' + custom_script_name

  if run_custom_only:
    # tell the script we're ONLY running the custom files
    run_cmd = run_cmd + ' 1'

  # both commands go to the remote shell as a single line, separated by '; '
  command_string = extract_cmd + '; ' + run_cmd

  # execute on user@remote_host and record what came back
  out, err, returncode = remote_shellexec(command_string, user, remote_host)
  deploy_logging.print_to_log(remote_host, out, err, returncode)

  # now that the remote script has finished, grab its logs
  remote_get_log(user, remote_host)
98
99
100
def remote_get_log(user, remote_host):
  """
  <Purpose>
    Gets the remote logs (all tarred up as [remote_host].tgz) from
    remote_host via remote_download_file, untars them into
    ./deploy.logs/[remote_host]/ and then deletes the downloaded tar file.

  <Arguments>
    user:
      the user to log in as
    remote_host:
      the IP of the host to get the logs from

  <Exceptions>
    None propagate.  Any exception raised while downloading or unpacking
    (including a non-zero scp exit code) is caught here and is only logged
    when deploy_main.verbosity == 2.

  <Side Effects>
    Creates ./deploy.logs/[remote_host]/ (and ./deploy.logs if needed).

  <Returns>
    No returns.
  """

  log_dir = './deploy.logs/'+remote_host
  tgz_name = remote_host+'.tgz'
  tgz_path = log_dir+'/'+tgz_name

  try:
    # set up dir that we'll move the remote .tar into.  makedirs also creates
    # ./deploy.logs when it is missing; the OSError check tolerates the
    # directory appearing between the isdir test and the creation.
    if not os.path.isdir(log_dir):
      try:
        os.makedirs(log_dir)
      except OSError:
        if not os.path.isdir(log_dir):
          raise

    # download the tar file from remote host
    out, err, returncode = remote_download_file(tgz_name, tgz_path, user,
        remote_host)

    # don't claim success (or try to untar) when scp reported a failure;
    # raising routes this through the same quiet error path as other failures
    if returncode != 0:
      raise Exception('download of '+tgz_name+' failed with exit code '+\
          str(returncode))

    deploy_logging.log('Downloading logs', 'Logs downloaded from '+remote_host)

    # now try to untar the files: build up a command list to execute
    command_list = []

    # tar is picky about where it'll unzip to (CWD), so we'll just cd there
    command_list.append('cd ./deploy.logs/'+remote_host+'/')

    # now untar. if deploy_main.verbosity >= 1 then we'll be verbose
    if deploy_main.verbosity >= 1:
      command_list.append('tar -xvvf '+remote_host+'.tgz')
    else:
      command_list.append('tar -xf '+remote_host+'.tgz')

    # now make the command string by joining the list elements with '; '
    command_string = '; '.join(command_list)

    # execute string (its output is not used)
    deploy_helper.shellexec2(command_string)

    deploy_logging.log('Downloading logs', 'Logs from '+remote_host+' are ready')

    # we no longer need the tar file, just hogging up space
    os.remove(tgz_path)

  except Exception as e:
    if deploy_main.verbosity == 2:
      # Only log if we error and need to narrow this down. otherwise,
      # it gets really spammy.
      deploy_logging.logerror(remote_host+": Some kind of err in remote_get_log. ("+\
          remote_host+") , error:"+str(e)+")")
  return
165
166
167
def remote_runcleanup(user, remote_host):
  """
  <Purpose>
    Extracts cleanup_deploy.py from deploy.tar on the remote computer, runs
    it there with remote_host as its only argument, and logs the result
    under the 'Cleanup/Setup' heading.

  <Arguments>
    user:
      the user to log in as
    remote_host:
      the IP/name of the host to run the cleanup/setup script on

  <Exceptions>
    None raised here directly.

  <Side Effects>
    Blocks until remote_shellexec returns.

  <Returns>
    None.
  """

  # the two remote steps, in order: unpack the cleanup script, then run it
  remote_steps = [
    'tar -xf deploy.tar cleanup_deploy.py',
    'python cleanup_deploy.py ' + remote_host,
  ]

  # one shell line, steps separated by '; '
  cmd_str = '; '.join(remote_steps)

  # run it on user@remote_host and log whatever came back
  out, err, returncode = remote_shellexec(cmd_str, user, remote_host)
  deploy_logging.print_to_log('Cleanup/Setup', out, err, returncode)
206   
207
208
def remote_shellexec(command_string, user, remote_host, retry_on_refusal = 3, connect_timeout = default_connection_timeout):
  """
  <Purpose>
    This uses ssh to execute the command_string on user@remote_host.  The
    command string is written to the stdin of the ssh process.

  <Arguments>
    command_string:
      the command string we'll execute on the remote machine. Commands are
      executed sequentially.
    user:
      user to log in as
    remote_host:
      the ip/name of the machine we're connecting to.
    retry_on_refusal:
      Optional. Integer. Has number of times to retry the connection IF it was
      refused (built in to take care of not 'spamming' the remote server)
    connect_timeout:
      Optional. String (or integer). Time in seconds for ssh to timeout if no
      response was received.  It is converted with str() for the ssh command
      line and with int() for the timeout monitor.

  <Exceptions>
    None raised here directly.

  <Side Effects>
    Blocks until ssh exits.  May sleep (see sleep_on_conn_refused) before a
    retry.

  <Returns>
    Tuple. (out, err, returncode)
    Details:
      out: stdout from ssh
      err: err from ssh
      returncode: ssh's exit code
  """

  # -T specifies not to allocate a tty (which is fine for our purposes, and
  # avoids a weird error from ssh).  BatchMode=yes is passed so ssh does not
  # prompt.  StrictHostKeyChecking=no tells ssh to connect to the remote host
  # even if the remote host is not cached in the known_hosts file.
  ssh_command = 'ssh -T -o BatchMode=yes -o ConnectTimeout='+\
      str(connect_timeout)+' -o StrictHostKeyChecking=no '\
      ' '+user+'@'+remote_host

  # ssh_proc_handle is the handle to our ssh session process
  ssh_proc_handle = subprocess.Popen(ssh_command, shell=True,
      stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

  # start thread to monitor timeouts (on another thread)
  deploy_threading.monitor_timeout(ssh_proc_handle.pid, int(connect_timeout),
      remote_host, user)

  # execute string and block this thread until done...
  out, err = ssh_proc_handle.communicate(command_string)

  returncode = ssh_proc_handle.returncode

  # retry if conn. was refused? (if we have retries left).  If we got a
  # connection refused it could be cuz we're spamming the server, so
  # sleep_on_conn_refused sleeps and tells us to try again.
  if retry_on_refusal and sleep_on_conn_refused(out, err, retry_on_refusal,
      remote_host):
    # run again with one fewer retry left.  The retry's result has already
    # been through format_stdout_and_err, so return it as-is instead of
    # formatting it a second time.
    return remote_shellexec(command_string, user, remote_host,
        retry_on_refusal - 1, connect_timeout)

  # format the string
  out, err = deploy_logging.format_stdout_and_err(out, err)

  return out, err, returncode
283
284
285
def remote_download_dir(remote_source_dir, local_dest_dir, user, remote_host, retry_on_refusal = 3, connect_timeout = default_connection_timeout):
  """
  <Purpose>
    This uses scp -r to download a directory from a remote computer.

  <Arguments>
    remote_source_dir:
      The path to the directory to download (remote directory)
    local_dest_dir:
      Where do we put it on this computer?  Its parent directory must
      already exist.
    user:
      user to log in as
    remote_host:
      the ip/name of the machine we're connecting to.
    retry_on_refusal:
      Optional. Integer. Has number of times to retry the connection IF it was
      refused (built in to take care of not 'spamming' the remote server)
    connect_timeout:
      Optional. String (or integer). Time in seconds for ssh to timeout if no
      response was received.  The same value is reused on every retry.

  <Exceptions>
    Exception if the parent directory of local_dest_dir does not exist.

  <Side Effects>
    Blocks until scp exits.  May sleep (see sleep_on_conn_refused) before a
    retry.

  <Returns>
    Tuple. (out, err, returncode)
    Details:
      out: stdout from scp
      err: err from ssh
      returncode: scp's exit code
  """
  # the dir one level 'up' from our destination dir must exist.  Only the
  # TRAILING slash is dropped (a leading one marks an absolute path and must
  # be kept), and a bare directory name means "inside the current directory".
  local_dest_dir_parent = os.path.dirname(local_dest_dir.rstrip('/')) or '.'

  # if the parent of our local destination directory does not exist then
  # complain.
  if not os.path.isdir(local_dest_dir_parent):
    deploy_logging.logerror(local_dest_dir)
    deploy_logging.logerror(local_dest_dir_parent)
    deploy_logging.logerror('Problem with local directory: it does not exist!')
    raise Exception('Please check calling method.')

  # get the scp handle
  scp_proc_handle = subprocess.Popen('scp -r -o BatchMode=yes -o '+
      'ConnectTimeout='+str(connect_timeout)+' -o StrictHostKeyChecking=no '+\
      user+'@'+remote_host+':'+remote_source_dir+\
      ' '+local_dest_dir, shell = True, stdout = subprocess.PIPE,
      stderr = subprocess.PIPE)

  # start thread to monitor timeouts (on another thread)
  deploy_threading.monitor_timeout(scp_proc_handle.pid, int(connect_timeout),
      remote_host, user)

  # block this thread until scp is done (stdin is not piped, so there is
  # nothing to send)
  out, err = scp_proc_handle.communicate()

  returncode = scp_proc_handle.returncode

  # retry if conn. was refused?  If we got a connection refused it could be
  # cuz we're spamming the server, so sleep_on_conn_refused sleeps and tells
  # us to try again.
  if retry_on_refusal and sleep_on_conn_refused(out, err, retry_on_refusal,
      remote_host):
    # run again with one fewer retry left, keeping the caller's timeout.  The
    # retry's result is already formatted, so return it untouched.
    return remote_download_dir(remote_source_dir, local_dest_dir, user,
        remote_host, retry_on_refusal - 1, connect_timeout)

  # format the string
  out, err = deploy_logging.format_stdout_and_err(out, err)

  return out, err, returncode
365
366
367
def remote_download_file(remote_fn_path, local_fn_path, user, remote_host, retry_on_refusal = 3, connect_timeout = default_connection_timeout):
  """
  <Purpose>
    This uses scp to download a file from a remote computer.

  <Arguments>
    remote_fn_path:
      The path to the file to download (remote file)
    local_fn_path:
      Where do we put it on this computer?  Path + name of the local file;
      the directory part must already exist.
    user:
      user to log in as
    remote_host:
      the ip/name of the machine we're connecting to.
    retry_on_refusal:
      Optional. Integer. Has number of times to retry the connection IF it was
      refused (built in to take care of not 'spamming' the remote server)
    connect_timeout:
      Optional. String (or integer). Time in seconds for ssh to timeout if no
      response was received.  The same value is reused on every retry.

  <Exceptions>
    Exception if the directory part of local_fn_path does not exist.

  <Side Effects>
    Blocks until scp exits.  May sleep (see sleep_on_conn_refused) before a
    retry.

  <Returns>
    Tuple. (out, err, returncode)
    Details:
      out: stdout from scp
      err: err from ssh
      returncode: scp's exit code
  """
  # local_fn_path has the path + name of file; we only need the directory.
  # A bare filename means "the current directory", and '/file' means '/'.
  dir_to_local_file = os.path.dirname(local_fn_path) or '.'

  # is the dir real?
  if not os.path.isdir(dir_to_local_file):
    deploy_logging.logerror('Local destination directory does not exist.')
    raise Exception('Please check calling method.')

  # the SCP handle used
  scp_proc_handle = subprocess.Popen('scp -o BatchMode=yes -o '+\
      'ConnectTimeout='+str(connect_timeout)+' -o StrictHostKeyChecking=no '+\
      ' '+user+'@'+remote_host+':'+remote_fn_path+\
      ' '+local_fn_path, shell = True, stdout = subprocess.PIPE,
      stderr = subprocess.PIPE)

  # start thread to monitor timeouts (on another thread)
  deploy_threading.monitor_timeout(scp_proc_handle.pid, int(connect_timeout),
      remote_host, user)

  # block until scp is done (stdin is not piped, so there is nothing to send)
  out, err = scp_proc_handle.communicate()

  returncode = scp_proc_handle.returncode

  # retry if conn. was refused?  If we got a connection refused it could be
  # cuz we're spamming the server, so sleep_on_conn_refused sleeps and tells
  # us to try again.
  if retry_on_refusal and sleep_on_conn_refused(out, err, retry_on_refusal,
      remote_host):
    # run again with one fewer retry left, keeping the caller's timeout.  The
    # retry's result is already formatted, so return it untouched.
    return remote_download_file(remote_fn_path, local_fn_path, user,
        remote_host, retry_on_refusal - 1, connect_timeout)

  # format the string
  out, err = deploy_logging.format_stdout_and_err(out, err)

  return out, err, returncode
446
447
448
def remote_upload_file(local_fn_path, user, remote_host, retry_on_refusal = 3, connect_timeout = default_connection_timeout):
  """
  <Purpose>
    This uses scp to upload a file to a remote computer.  The destination is
    given to scp as user@remote_host: with no path after the colon.

  <Arguments>
    local_fn_path:
      Which file do we chuck to the remote computer?
    user:
      user to log in as
    remote_host:
      the ip/name of the machine we're connecting to.
    retry_on_refusal:
      Optional. Integer. Has number of times to retry the connection IF it was
      refused (built in to take care of not 'spamming' the remote server)
    connect_timeout:
      Optional. String (or integer). Time in seconds for ssh to timeout if no
      response was received.  The same value is reused on every retry.

  <Exceptions>
    Exception if local_fn_path is not an existing file.

  <Side Effects>
    Blocks until scp exits.  May sleep (see sleep_on_conn_refused) before a
    retry.

  <Returns>
    Tuple. (out, err, returncode)
    Details:
      out: stdout from scp
      err: err from ssh
      returncode: scp's exit code
  """

  # check that local file exists.
  if not os.path.isfile(local_fn_path):
    deploy_logging.logerror('Problem with local file: it does not exist!')
    raise Exception('Please check calling method.')

  scp_proc_handle = subprocess.Popen('scp -o BatchMode=yes -o '+\
      'ConnectTimeout='+str(connect_timeout)+' -o StrictHostKeyChecking=no '+\
      ' '+local_fn_path+' '+user+"@"+remote_host+":", shell = True,
      stdout = subprocess.PIPE, stderr = subprocess.PIPE)

  # start thread to monitor timeouts (on another thread)
  deploy_threading.monitor_timeout(scp_proc_handle.pid, int(connect_timeout),
      remote_host, user)

  # block until scp is done (stdin is not piped, so there is nothing to send)
  out, err = scp_proc_handle.communicate()

  returncode = scp_proc_handle.returncode

  # retry if conn. was refused?  If we got a connection refused it could be
  # cuz we're spamming the server, so sleep_on_conn_refused sleeps and tells
  # us to try again.
  if retry_on_refusal and sleep_on_conn_refused(out, err, retry_on_refusal,
      remote_host):
    # run again with one fewer retry left, keeping the caller's timeout.  The
    # retry's result is already formatted, so return it untouched.
    return remote_upload_file(local_fn_path, user, remote_host,
        retry_on_refusal - 1, connect_timeout)

  # format the string
  out, err = deploy_logging.format_stdout_and_err(out, err)

  return out, err, returncode
517
518 
519 
def sleep_on_conn_refused(out, err, timesleft, remote_host):
  """
  <Purpose>
    Given the stdout/stderr from ssh/scp, decides whether the connection was
    refused and, if so, sleeps before telling the caller to retry.

    The sleep is 60 seconds divided by the number of tries left, so with
    3 retries...
      1st run: sleep 60/3 (20s)
      2nd run: sleep 60/2 (30s)
      3rd run: sleep 60/1 (60s)

    i.e. each wait is longer than the one before.

    The sleep only happens for hosts whose name/IP contains '128.' (called
    an "instructional machine" by the original code); every other host gets
    False straight away.

  <Arguments>
    out:
      the stdout
    err:
      the stderr
    timesleft:
      how many times do we have left to try and connect.  Must not be 0
      when a sleep is needed, since it is used as a divisor.
    remote_host:
      the ip/name of the machine the output came from.

  <Exceptions>
    None raised here directly.

  <Side Effects>
    May block the calling thread for up to 60 seconds and write a WARNING
    to the log.

  <Returns>
    Boolean. True if we did a sleep, False if we didn't.
  """

  needle = 'connection refused'

  # case-insensitive search of both streams
  refused_in_out = needle in out.lower()
  refused_in_err = needle in err.lower()

  # only hosts with '128.' in their name/IP are ever slept on
  if '128.' not in remote_host:
    return False

  # nothing was refused, nothing to wait for
  if not (refused_in_out or refused_in_err):
    return False

  # sleep then try again
  deploy_logging.log('WARNING', "Connection refused, forced sleeping to overcome "+\
      "timeout ("+str(timesleft)+" timeouts left)")
  time.sleep(60/timesleft) # each time you sleep a little longer
  return True
Note: See TracBrowser for help on using the browser.