[4] | 1 | ############################################################################ |
---|
| 2 | # |
---|
| 3 | # AppLauncher - Application Launcher for SAGE |
---|
| 4 | # Copyright (C) 2006 Electronic Visualization Laboratory, |
---|
| 5 | # University of Illinois at Chicago |
---|
| 6 | # |
---|
| 7 | # All rights reserved. |
---|
| 8 | # |
---|
| 9 | # Redistribution and use in source and binary forms, with or without |
---|
| 10 | # modification, are permitted provided that the following conditions are met: |
---|
| 11 | # |
---|
| 12 | # * Redistributions of source code must retain the above copyright |
---|
| 13 | # notice, this list of conditions and the following disclaimer. |
---|
| 14 | # * Redistributions in binary form must reproduce the above |
---|
| 15 | # copyright notice, this list of conditions and the following disclaimer |
---|
| 16 | # in the documentation and/or other materials provided with the distribution. |
---|
| 17 | # * Neither the name of the University of Illinois at Chicago nor |
---|
| 18 | # the names of its contributors may be used to endorse or promote |
---|
| 19 | # products derived from this software without specific prior written permission. |
---|
| 20 | # |
---|
| 21 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
---|
| 22 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
---|
| 23 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
---|
| 24 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR |
---|
| 25 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
---|
| 26 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
---|
| 27 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
---|
| 28 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
---|
| 29 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
---|
| 30 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
---|
| 31 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
---|
| 32 | # |
---|
| 33 | # Direct questions, comments etc about AppLauncher to www.evl.uic.edu/cavern/forum |
---|
| 34 | # |
---|
| 35 | # Author: Ratko Jagodic |
---|
| 36 | # |
---|
| 37 | ############################################################################ |
---|
| 38 | |
---|
| 39 | |
---|
| 40 | import subprocess as sp |
---|
| 41 | import traceback as tb |
---|
| 42 | import os, sys, time, os.path |
---|
| 43 | from myprint import * # handles the printing or logging |
---|
| 44 | from threading import RLock, Thread |
---|
| 45 | |
---|
# Shorthand for os.path.join; used further down to assemble pid-file paths.
opj = os.path.join
---|
| 47 | |
---|
| 48 | ####################################################################### |
---|
| 49 | ##### NOT THREAD SAFE !!!!! |
---|
| 50 | ##### - needs fixing if multithreaded xmlrpc server is used |
---|
| 51 | ##### - otherwise the requests are queued and processed one by one |
---|
| 52 | ####################################################################### |
---|
| 53 | |
---|
class CurrentRequests:
    """ Keeps track of the running requests and submits the pending ones.

    New requests are queued by addRequest() and launched one at a time by a
    background thread running submitRequests().  Launched requests are kept
    in a table keyed by appId until cleanup() notices that they have died.
    All access to the queue, the request table and the node load table made
    by this class goes through one reentrant lock.
    """

    # appIds are handed out from range(0, MAX_APP_IDS)
    MAX_APP_IDS = 9999

    # seconds the submit thread sleeps between two looks at the queue
    SUBMIT_POLL_INTERVAL = 1.5


    def __init__(self, nodeHash):
        """ nodeHash: dict keyed by node IP whose value is the number of apps
            running on that node; it is updated in place by this object.

            Starts the submit thread as its last step.
        """
        self._requests = {}         # key = appId, value = Request()
        self._nodeHash = nodeHash   # key = IP, value = [0...n] --> how many apps are running on that node

        self.__submitThread = Thread(target=self.submitRequests)
        self.__requestsToSubmit = []    # requests waiting to be launched, oldest first
        self.__submitLock = RLock()
        self.__doRunSubmitThread = True
        self.__submitThread.start()


    def __getFirstAvailableId(self):
        """ Returns the lowest appId that is used neither by a running request
            nor by a request still waiting in the queue.  Dead requests are
            cleaned up first so that their ids can be recycled.

            Returns None if all MAX_APP_IDS ids are taken.
        """
        self.cleanup()   # first clean up all of the dead requests and their appIds

        with self.__submitLock:
            # collect the queued ids once instead of rescanning the queue per candidate
            queuedIds = set(r.config.getAppId() for r in self.__requestsToSubmit)
            for candidate in range(0, self.MAX_APP_IDS):
                if candidate not in self._requests and candidate not in queuedIds:
                    return candidate
        return None


    def __getNextAvailableNode(self):
        """ Picks the node that is running the fewest apps, increases its app
            count by one and returns its IP.

            Returns None (and leaves the node table untouched) if the table
            is empty.  Ties are resolved by the dict's iteration order.
        """
        with self.__submitLock:
            if not self._nodeHash:
                return None
            leastLoadedIP = min(self._nodeHash, key=self._nodeHash.get)
            self._nodeHash[leastLoadedIP] += 1   # increase the number of apps running on this node
            return leastLoadedIP


    def getRequest(self, appId):
        """ Returns the request based on its appId (port number in fact),
            or False if there is no such request.
        """
        with self.__submitLock:
            return self._requests.get(appId, False)


    def getStatus(self):
        """ Returns the current app status as a hash keyed by str(appId).
            Each value is a tuple (appName, command, targetMachine).
            Dead requests are cleaned up before the status is built.
        """
        self.cleanup()
        with self.__submitLock:
            status = {}
            for appId, request in self._requests.items():
                status[str(appId)] = (request.config.getAppName(), request.command, request.targetMachine)
            return status


    def addRequest(self, config):
        """ Modifies the appId parameter of the config, creates a new request
            from it and queues that request for the submit thread.

            If the config allows running on the nodes, the target machine is
            replaced by the least loaded node; if no node is known the
            configured target machine is kept.  Otherwise the load count of
            the configured target machine is increased when that machine is
            one of the known nodes.

            The id is allocated and the request queued while holding the
            lock, so the same id cannot be given to two requests.

            Returns the appId given to the request (None if no id was free).
        """
        with self.__submitLock:
            # modify the appId parameter before writing the config to a file
            appId = self.__getFirstAvailableId()
            config.setAppId(appId)

            if config.getRunOnNodes():
                # get the machine with the lowest load (fewest apps running)
                nodeIP = self.__getNextAvailableNode()
                if nodeIP is None:
                    WriteLog( "\nNo nodes registered... keeping the configured target machine\n\n")
                else:
                    config.setTargetMachine(nodeIP)
                    WriteLog( "\nNODE IP = "+ nodeIP + str(self._nodeHash[nodeIP])+ "\n\n")
            elif config.getTargetMachine() in self._nodeHash:
                self._nodeHash[config.getTargetMachine()] += 1

            # make the request and leave it for the submit thread
            self.__requestsToSubmit.append(SSHRequest(config))

        return appId


    def submitRequests(self):
        """ Body of the submit thread: launches at most one queued request,
            sleeps SUBMIT_POLL_INTERVAL seconds and repeats until
            stopSubmitThread() is called.
        """
        while self.__doRunSubmitThread:
            self._submitNext()
            time.sleep(self.SUBMIT_POLL_INTERVAL)


    def _submitNext(self):
        """ Launches the oldest queued request, if there is one.

            The request stays in the queue while it is being submitted so
            that its appId remains reserved; it is moved to the table of
            running requests only after submit() succeeded.  If submit()
            fails (returns -1 or raises), the request is dropped and the load
            count of its target machine is decreased again.

            Returns True if a request was processed, False if the queue was empty.
        """
        with self.__submitLock:
            if not self.__requestsToSubmit:
                return False
            request = self.__requestsToSubmit[0]   # peek only, see docstring

        # submitting is slow, so it is done without holding the lock
        try:
            res = request.submit()
        except Exception:
            # an exception must not kill the submit thread
            WriteLog( "===> ERROR submitting request ---------> :")
            WriteLog( tb.format_exc() )
            res = -1

        with self.__submitLock:
            # only this thread removes from the queue, so the request is still first
            self.__requestsToSubmit.pop(0)
            if res != -1:
                self._requests[ request.config.getAppId() ] = request
            else:
                # undo the load accounting done in addRequest()
                target = request.config.getTargetMachine()
                if target in self._nodeHash:
                    self._nodeHash[target] -= 1
        return True


    def stopSubmitThread(self):
        """ Asks the submit thread to exit; it does so after its current iteration. """
        self.__doRunSubmitThread = False


    def stopRequest(self, appId):
        """ Stops the request forcefully.

            Returns False if there is no running request with that appId,
            otherwise whatever the request's kill() returned.  After the kill
            it waits one second and then cleans up the dead requests.
        """
        with self.__submitLock:
            request = self._requests.get(appId)
            if request is None:
                return False
            ret = request.kill()

        time.sleep(1)   # wait before checking which requests have died
        self.cleanup()
        return ret


    def cleanup(self):
        """ Checks whether the requests that we started are still alive.
            Those that are not alive are removed from the table of requests,
            their pid file is deleted, the load count of their node is
            decreased and their port (appId) is recycled.
        """
        with self.__submitLock:
            # iterate over a copy since entries are deleted along the way
            for appId, request in list(self._requests.items()):
                if request.isAlive():
                    continue

                WriteLog( ">>>> Cleaning up: " + request.config.getCommand() + " appId = " + str(appId) )
                request.deletePIDFile()
                target = request.config.getTargetMachine()
                if target in self._nodeHash:
                    self._nodeHash[target] -= 1   # decrease the num of apps running on this node
                del self._requests[appId]
---|
| 212 | |
---|
| 213 | |
---|
| 214 | |
---|
class Request:
    """ Base class of a launch request: remembers the application config and
        the values read from it when the request was created.
    """

    def __init__(self, config):
        """ config must provide getTargetMachine(), getCommand() and getConfigFilename() """
        self.config = config
        self.processObj = None   # the object corresponding to the process we started

        # snapshot of the config taken at creation time
        self.targetMachine = config.getTargetMachine()
        self.command = config.getCommand()
        self.configFilename = config.getConfigFilename()
---|
| 222 | |
---|
| 223 | |
---|
| 224 | |
---|
class SSHRequest(Request):
    """ A request whose application is started, killed and cleaned up on the
        target machine through /usr/bin/ssh (the config file is copied with
        /usr/bin/scp).
    """

    def _pidFilePath(self):
        """ Returns the path, as used in the remote commands, of the file
            from which kill() reads the application's pid.
        """
        # NOTE(review): basename() of the config *file* is used as a directory
        # here; dirname() may have been intended -- confirm against where the
        # application writes its pid file before changing it.
        pidDir = opj(os.path.basename(self.configFilename), "pid")
        pidFile = self.config.getAppName() + "_" + str(self.config.getAppId()) + ".pid"
        return opj(pidDir, pidFile)


    def submit(self):
        """ Copies the config file to the target machine and launches the
            application there via ssh.

            A failure to copy the config file is only logged; the launch is
            attempted anyway.

            Returns the appId of the config, or -1 if the ssh process could
            not be started.
        """
        # copy the configuration file over
        try:
            self.config.writeToFile()
            sp.call(["chmod", "g+w", self.config.getConfigFilename()])   # change the permissions of the temp file

            # NOTE(review): scp copies to the bare file name on the target
            # while the chmod below uses the full local path -- confirm that
            # both refer to the same remote file.
            retcode = sp.call(["/usr/bin/scp", self.configFilename, self.targetMachine+":"+os.path.basename(self.configFilename)])
            if retcode != 0:
                # sp.call() reports a failed scp through its return value, not an exception
                WriteLog( "===> ERROR copying config file (scp exit code " + str(retcode) + ")... application will use the default configuration")
            sp.call(["/usr/bin/ssh", "-x", self.targetMachine, "chmod a+rw "+self.configFilename])
        except Exception:
            WriteLog( "===> ERROR copying config file... application will use the default configuration:")
            WriteLog( tb.format_exc() )

        # launch the application via ssh
        try:
            WriteLog( "\n\nRunning with command: /usr/bin/ssh -x " + self.targetMachine + " cd "+self.config.getBinDir()+" ;env DISPLAY=:0.0 "+ self.command)
            self.processObj = sp.Popen(["/usr/bin/ssh", "-x", self.targetMachine, "cd "+self.config.getBinDir(), ";env DISPLAY=:0.0 ", self.command])
            WriteLog( ">>>> EXECUTING: " + self.command + "\nPID = " + str(self.processObj.pid) + "\n")
        except Exception:
            WriteLog( "===> ERROR launching application ---------> :")
            WriteLog( tb.format_exc() )
            return -1

        return self.config.getAppId()


    def isAlive(self):
        """ Returns True if the local ssh process that launched the
            application has not exited yet; False if it has exited or was
            never started.
        """
        if self.processObj is None:
            return False
        return self.processObj.poll() is None


    def kill(self):
        """ Forcefully kills the application via ssh and deletes its pid file
            if possible.  The ssh command is started but not waited for.

            Returns True if the ssh command was started, -1 otherwise.
            Note that -1 is a true value, so compare the result explicitly.
        """
        pidFile = self._pidFilePath()
        killCmd = "/bin/kill -9 `cat "+pidFile+"`"
        delCmd = "/bin/rm -rf "+pidFile
        try:
            sp.Popen(["/usr/bin/ssh", "-x", self.targetMachine, killCmd, ";", delCmd])
            WriteLog( ">>>> KILLING: " + killCmd + "\nPID = " + str(self.processObj.pid) + "\n")
        except Exception:
            WriteLog( "===> ERROR killing application ---------> :")
            WriteLog( tb.format_exc() )
            return -1

        return True


    def deletePIDFile(self):
        """ Deletes the temp file where the app writes its pid, via ssh.
            The ssh command is started but not waited for; errors are only
            logged.  Always returns True.
        """
        delCmd = "/bin/rm -rf "+self._pidFilePath()
        try:
            # NOTE(review): env= replaces the whole environment of the local
            # ssh process instead of adding DISPLAY to it -- confirm that
            # this is intended.
            sp.Popen(["/usr/bin/ssh", "-x", self.targetMachine, delCmd], env={"DISPLAY": ":0.0"})
            WriteLog(">>>> DELETING: " + delCmd + "\nPID = " + str(self.processObj.pid) + "\n")
        except Exception:
            WriteLog("===> ERROR deleting temporary pid file ---------> :")
            WriteLog( tb.format_exc() )

        return True
---|