- Timestamp: 10/30/12 18:11:02
- Location: trunk
- Files: 3 edited
Legend:
- Unmodified (context lines, prefixed with a space)
- Added (prefixed with +)
- Removed (prefixed with -)
trunk/m4/missing-dev-prog.sh (r5 → r27)

  * tarball with generated sources included.
 MESSAGE
 
+exit 1
trunk/slurm_drmaa/job.c (r26 → r27)

 	{
 		if ( slurm_load_job( &job_info, fsd_atoi(self->job_id), SHOW_ALL) ) {
-			fsd_exc_raise_fmt( FSD_ERRNO_INTERNAL_ERROR,"slurm_load_jobs error: %s,job_id: %s",slurm_strerror(slurm_get_errno()),self->job_id);
-		}
+			int _slurm_errno = slurm_get_errno();
+
+			if (_slurm_errno == ESLURM_INVALID_JOB_ID) {
+				self->on_missing(self);
+			} else {
+				fsd_exc_raise_fmt( FSD_ERRNO_INTERNAL_ERROR,"slurm_load_jobs error: %s,job_id: %s", slurm_strerror(slurm_get_errno()), self->job_id);
+			}
+		}
 
-		self->exit_status = job_info->job_array[0].exit_code;
-		fsd_log_debug(("exit_status = %d -> %d",self->exit_status, WEXITSTATUS(self->exit_status)));
-
-		switch(job_info->job_array[0].job_state)
-		{
+		switch(job_info->job_array[0].job_state & JOB_STATE_BASE)
+		{
+			fsd_log_debug(("state = %d, state_reason = %d", job_info->job_array[0].job_state, job_info->job_array[0].state_reason));
+
 			case JOB_PENDING:
 				switch(job_info->job_array[0].state_reason)
 				{
-					case WAIT_NO_REASON:   /* not set or job not pending */
-					case WAIT_PRIORITY:    /* higher priority jobs exist */
-					case WAIT_DEPENDENCY:  /* dependent job has not completed */
-					case WAIT_RESOURCES:   /* required resources not available */
-					case WAIT_PART_NODE_LIMIT:   /* request exceeds partition node limit */
-					case WAIT_PART_TIME_LIMIT:   /* request exceeds partition time limit */
-#if SLURM_VERSION_NUMBER < SLURM_VERSION_NUM(2,2,0)
-					case WAIT_PART_STATE:
-#endif
-#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(2,2,0)
-					case WAIT_PART_DOWN:         /* requested partition is down */
-					case WAIT_PART_INACTIVE:     /* requested partition is inactive */
-#endif
-						self->state = DRMAA_PS_QUEUED_ACTIVE;
-						break;
 #if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(2,2,0)
 					case WAIT_HELD_USER:   /* job is held by user */
-
-						self->state = DRMAA_PS_USER_ON_HOLD;
-						break;
-					case WAIT_HELD:        /* job is held by administrator */
-						self->state = DRMAA_PS_SYSTEM_ON_HOLD;
-						break;
-#else
-					case WAIT_HELD:
+						fsd_log_debug(("interpreting as DRMAA_PS_USER_ON_HOLD"));
 						self->state = DRMAA_PS_USER_ON_HOLD;
 						break;
 #endif
-					case WAIT_TIME:                  /* job waiting for specific begin time */
-					case WAIT_LICENSES:              /* job is waiting for licenses */
-					case WAIT_ASSOC_JOB_LIMIT:       /* user/bank job limit reached */
-					case WAIT_ASSOC_RESOURCE_LIMIT:  /* user/bank resource limit reached */
-					case WAIT_ASSOC_TIME_LIMIT:      /* user/bank time limit reached */
-					case WAIT_RESERVATION:           /* reservation not available */
-					case WAIT_NODE_NOT_AVAIL:        /* required node is DOWN or DRAINED */
-#if SLURM_VERSION_NUMBER < SLURM_VERSION_NUM(2,2,0)
-					case WAIT_TBD1:
-#else
-					case WAIT_QOS_THRES:             /* required QOS threshold has been reached */
-#endif
-#if SLURM_VERSION_NUMBER < SLURM_VERSION_NUM(2,3,0)
-					case WAIT_TBD2:
-#else
-#ifdef WAIT_FRONT_END
-					case WAIT_FRONT_END:             /* Front end nodes are DOWN */
-#endif
-					case WAIT_QOS_JOB_LIMIT:         /* QOS job limit reached */
-					case WAIT_QOS_RESOURCE_LIMIT:    /* QOS resource limit reached */
-					case WAIT_QOS_TIME_LIMIT:        /* QOS time limit reached */
-#endif
-						self->state = DRMAA_PS_QUEUED_ACTIVE;
-						break;
-					case FAIL_DOWN_PARTITION:  /* partition for job is DOWN */
-					case FAIL_DOWN_NODE:       /* some node in the allocation failed */
-					case FAIL_BAD_CONSTRAINTS: /* constraints can not be satisfied */
-					case FAIL_SYSTEM:          /* slurm system failure */
-					case FAIL_LAUNCH:          /* unable to launch job */
-					case FAIL_EXIT_CODE:       /* exit code was non-zero */
-					case FAIL_TIMEOUT:         /* reached end of time limit */
-					case FAIL_INACTIVE_LIMIT:  /* reached slurm InactiveLimit */
-#if SLURM_VERSION_NUMBER < SLURM_VERSION_NUM(2,2,0)
-					case FAIL_BANK_ACCOUNT:
-#else
-					case FAIL_ACCOUNT:         /* invalid account */
-					case FAIL_QOS:             /* invalid QOS */
-#endif
-						self->state = DRMAA_PS_FAILED;
+					case WAIT_HELD:        /* job is held by administrator */
+						fsd_log_debug(("interpreting as DRMAA_PS_SYSTEM_ON_HOLD"));
+						self->state = DRMAA_PS_SYSTEM_ON_HOLD;
 						break;
 					default:
-						fsd_log_error(("job_state_reason = %d, assert(0)",job_info->job_array[0].state_reason));
-						fsd_assert(false);
-
+						fsd_log_debug(("interpreting as DRMAA_PS_QUEUED_ACTIVE"));
+						self->state = DRMAA_PS_QUEUED_ACTIVE;
 				}
 				break;
 			case JOB_RUNNING:
+				fsd_log_debug(("interpreting as DRMAA_PS_RUNNING"));
 				self->state = DRMAA_PS_RUNNING;
 				break;
 			case JOB_SUSPENDED:
-				if(slurm_self->user_suspended == true)
+				if(slurm_self->user_suspended == true) {
+					fsd_log_debug(("interpreting as DRMAA_PS_USER_SUSPENDED"));
 					self->state = DRMAA_PS_USER_SUSPENDED;
-				else
-					self->state = DRMAA_PS_SYSTEM_SUSPENDED; /* assume SYSTEM - suspendig jobs is administrator only */
+				} else {
+					fsd_log_debug(("interpreting as DRMAA_PS_SYSTEM_SUSPENDED"));
+					self->state = DRMAA_PS_SYSTEM_SUSPENDED;
+				}
 				break;
 			case JOB_COMPLETE:
+				fsd_log_debug(("interpreting as DRMAA_PS_DONE"));
 				self->state = DRMAA_PS_DONE;
+				self->exit_status = job_info->job_array[0].exit_code;
+				fsd_log_debug(("exit_status = %d -> %d",self->exit_status, WEXITSTATUS(self->exit_status)));
 				break;
 			case JOB_CANCELLED:
+				fsd_log_debug(("interpreting as DRMAA_PS_FAILED (aborted)"));
+				self->state = DRMAA_PS_FAILED;
 				self->exit_status = -1;
 			case JOB_FAILED:
 			case JOB_TIMEOUT:
 			case JOB_NODE_FAIL:
+#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(2,3,0)
+			case JOB_PREEMPTED:
+#endif
+				fsd_log_debug(("interpreting as DRMAA_PS_FAILED"));
 				self->state = DRMAA_PS_FAILED;
-				break;
-			default: /* transient states */
-				if(job_info->job_array[0].job_state >= 0x8000) {
-					fsd_log_debug(("state COMPLETING"));
-				}
-				else if (job_info->job_array[0].job_state >= 0x4000) {
-					fsd_log_debug(("state Allocated nodes booting"));
-				}
-				else {
-					fsd_log_error(("job_state = %d, assert(0)",job_info->job_array[0].job_state));
-					fsd_assert(false);
-				}
-		}
-
-		if(self->exit_status == -1) /* input,output,error path failure etc*/
+				self->exit_status = job_info->job_array[0].exit_code;
+				fsd_log_debug(("exit_status = %d -> %d",self->exit_status, WEXITSTATUS(self->exit_status)));
+				break;
+			default: /* unknown state */
+				fsd_log_error(("Unknown job state: %d. Please send bug report: http://apps.man.poznan.pl/trac/slurm-drmaa", job_info->job_array[0].job_state));
+		}
+
+		if (job_info->job_array[0].job_state & JOB_STATE_FLAGS & JOB_COMPLETING) {
+			fsd_log_debug(("Epilog completing"));
+		}
+
+		if (job_info->job_array[0].job_state & JOB_STATE_FLAGS & JOB_CONFIGURING) {
+			fsd_log_debug(("Nodes booting"));
+		}
+
+		if (self->exit_status == -1) /* input,output,error path failure etc */
 			self->state = DRMAA_PS_FAILED;
 
-		fsd_log_debug(("state: %d ,state_reason: %d-> %s", job_info->job_array[0].job_state, job_info->job_array[0].state_reason, drmaa_job_ps_to_str(self->state)));
-
 		self->last_update_time = time(NULL);
 
-		if( self->state >= DRMAA_PS_DONE )
+		if( self->state >= DRMAA_PS_DONE ) {
+			fsd_log_debug(("exit_status = %d, WEXITSTATUS(exit_status) = %d", self->exit_status, WEXITSTATUS(self->exit_status)));
 			fsd_cond_broadcast( &self->status_cond );
+		}
 	}
 	FINALLY
…
 
 	fsd_log_return(( "" ));
+}
+
+static void
+slurmdrmaa_job_on_missing( fsd_job_t *self )
+{
+	fsd_log_enter(( "({job_id=%s})", self->job_id ));
+	fsd_log_warning(( "Job %s missing from DRM queue", self->job_id ));
+
+	fsd_log_info(( "job_on_missing: last job_ps: %s (0x%02x)", drmaa_job_ps_to_str(self->state), self->state));
+
+	if( self->state >= DRMAA_PS_RUNNING ) { /* if the job ever entered running state assume finished */
+		self->state = DRMAA_PS_DONE;
+		self->exit_status = 0;
+	}
+	else {
+		self->state = DRMAA_PS_FAILED; /* otherwise failed */
+		self->exit_status = -1;
+	}
+
+	fsd_log_info(("job_on_missing evaluation result: state=%d exit_status=%d", self->state, self->exit_status));
+
+	fsd_cond_broadcast( &self->status_cond);
+	fsd_cond_broadcast( &self->session->wait_condition );
+
+	fsd_log_return(( "; job_ps=%s, exit_status=%d", drmaa_job_ps_to_str(self->state), self->exit_status ));
 }
 
…
 	self->super.control = slurmdrmaa_job_control;
 	self->super.update_status = slurmdrmaa_job_update_status;
+	self->super.on_missing = slurmdrmaa_job_on_missing;
 	self->old_priority = UINT32_MAX;
 	self->user_suspended = true;
…
 	}
 
 	/* propagate all environment variables from submission host */
+	{
 		extern char **environ;
+		char **i;
+		unsigned j = 0;
+
+		for ( i = environ; *i; i++) {
+			job_desc->env_size++;
+		}
 
 		fsd_log_debug(("environ env_size = %d",job_desc->env_size));
 		fsd_calloc(job_desc->environment, job_desc->env_size+1, char *);
 
-		for( i = environ; *i; i++,j++ )
-		{
-			job_desc->environment[j] = fsd_strdup(*i);
-		}
-
-	}
+		for ( i = environ; *i; i++,j++ ) {
+			job_desc->environment[j] = fsd_strdup(*i);
+		}
+	}
 
 	/* environment */
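The core of the job.c change is that update_status now masks the raw SLURM state word with JOB_STATE_BASE before switching on it and tests the flag bits (JOB_COMPLETING, JOB_CONFIGURING) separately, instead of letting transient states fall through to the old fsd_assert(false) path. The standalone sketch below illustrates only that masking pattern; it is not part of the changeset, the helper name describe_base_state is invented for the example, and it assumes a slurm.h (SLURM 2.2 or later) that defines these macros.

/* Illustrative sketch only (not from the changeset): split a raw SLURM
 * job_state word into base state and flag bits, the way the patched
 * update_status does.  Assumes <slurm/slurm.h> provides JOB_STATE_BASE,
 * JOB_STATE_FLAGS, JOB_COMPLETING and JOB_CONFIGURING (SLURM >= 2.2);
 * describe_base_state is a made-up helper name. */
#include <stdio.h>
#include <stdint.h>
#include <slurm/slurm.h>

static const char *describe_base_state(uint32_t raw_state)
{
	switch (raw_state & JOB_STATE_BASE) {  /* strip flag bits */
	case JOB_PENDING:   return "pending";
	case JOB_RUNNING:   return "running";
	case JOB_SUSPENDED: return "suspended";
	case JOB_COMPLETE:  return "complete";
	case JOB_CANCELLED: return "cancelled";
	case JOB_FAILED:    return "failed";
	case JOB_TIMEOUT:   return "timeout";
	case JOB_NODE_FAIL: return "node failure";
	default:            return "unknown";
	}
}

int main(void)
{
	/* Example: a running job whose allocated nodes are still booting. */
	uint32_t raw_state = JOB_RUNNING | JOB_CONFIGURING;

	printf("base state: %s\n", describe_base_state(raw_state));

	/* Flag bits are orthogonal to the base state and are tested
	 * separately, as in the new JOB_COMPLETING/JOB_CONFIGURING checks. */
	if (raw_state & JOB_STATE_FLAGS & JOB_COMPLETING)
		printf("epilog still completing\n");
	if (raw_state & JOB_STATE_FLAGS & JOB_CONFIGURING)
		printf("nodes still booting\n");

	return 0;
}

Keeping the switch limited to base states is what lets the new default branch report genuinely unknown states, while a completing epilog or a still-booting node allocation is only logged rather than treated as an error.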
trunk/slurm_drmaa/job.h (r13 → r27)

 };
 
-void slurmdrmaa_job_create_req(fsd_drmaa_session_t *session, const fsd_template_t *jt,fsd_environ_t **envp, job_desc_msg_t * job_desc,int n_job );
-void slurmdrmaa_job_create(fsd_drmaa_session_t *session, const fsd_template_t *jt,fsd_environ_t **envp,fsd_expand_drmaa_ph_t *expand, job_desc_msg_t * job_desc,int n_job);
+void slurmdrmaa_job_create_req(fsd_drmaa_session_t *session, const fsd_template_t *jt, fsd_environ_t **envp, job_desc_msg_t * job_desc, int n_job );
+void slurmdrmaa_job_create(fsd_drmaa_session_t *session, const fsd_template_t *jt, fsd_environ_t **envp, fsd_expand_drmaa_ph_t *expand, job_desc_msg_t * job_desc, int n_job);
 
 #endif /* __SLURM_DRMAA__JOB_H */