- Timestamp:
- 10/11/11 18:17:43 (13 years ago)
- Location:
- trunk/pbs_drmaa
- Files:
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/pbs_drmaa/job.c
r22 r25 207 207 if( status == NULL ) 208 208 { 209 if(pbsdrmaa_job_update_status_accounting(self) == false) 210 { 211 #ifndef PBS_PROFESSIONAL 212 fsd_log_error(("pbs_statjob error: %d, %s, %s", pbs_errno, pbse_to_txt(pbs_errno), pbs_strerror(pbs_errno))); 213 #else 214 # ifndef PBS_PROFESSIONAL_NO_LOG 215 fsd_log_error(("pbs_statjob error: %d, %s", pbs_errno, pbse_to_txt(pbs_errno))); 216 # else 217 fsd_log_error(("pbs_statjob error: %d", pbs_errno)); 218 # endif 219 #endif 220 221 /**/ 222 223 switch( pbs_errno ) 224 { 225 case PBSE_UNKJOBID: 226 break; 227 case PBSE_PROTOCOL: 228 case PBSE_EXPIRED: 229 if ( session->pbs_conn >= 0 ) 230 pbs_disconnect( session->pbs_conn ); 231 sleep(1); 232 session->pbs_conn = pbs_connect( session->super.contact ); 233 if( session->pbs_conn < 0 ) 234 pbsdrmaa_exc_raise_pbs( "pbs_connect" ); 235 else 236 { 237 fsd_log_error(("retry:")); 238 goto retry; 239 } 240 default: 241 pbsdrmaa_exc_raise_pbs( "pbs_statjob" ); 242 break; 243 case 0: /* ? */ 244 fsd_exc_raise_code( FSD_ERRNO_INTERNAL_ERROR ); 245 break; 246 } 209 210 #ifndef PBS_PROFESSIONAL 211 fsd_log_error(("pbs_statjob error: %d, %s, %s", pbs_errno, pbse_to_txt(pbs_errno), pbs_strerror(pbs_errno))); 212 #else 213 # ifndef PBS_PROFESSIONAL_NO_LOG 214 fsd_log_error(("pbs_statjob error: %d, %s", pbs_errno, pbse_to_txt(pbs_errno))); 215 # else 216 fsd_log_error(("pbs_statjob error: %d", pbs_errno)); 217 # endif 218 #endif 219 220 switch( pbs_errno ) 221 { 222 case PBSE_UNKJOBID: 223 break; 224 case PBSE_PROTOCOL: 225 case PBSE_EXPIRED: 226 if ( session->pbs_conn >= 0 ) 227 pbs_disconnect( session->pbs_conn ); 228 sleep(1); 229 session->pbs_conn = pbs_connect( session->super.contact ); 230 if( session->pbs_conn < 0 ) 231 pbsdrmaa_exc_raise_pbs( "pbs_connect" ); 232 else 233 { 234 fsd_log_error(("retry:")); 235 goto retry; 236 } 237 default: 238 pbsdrmaa_exc_raise_pbs( "pbs_statjob" ); 239 break; 240 case 0: /* ? 
*/ 241 fsd_exc_raise_code( FSD_ERRNO_INTERNAL_ERROR ); 242 break; 247 243 } 244 248 245 } 249 246 250 247 conn_lock = fsd_mutex_unlock( &self->session->drm_connection_mutex ); 251 248 249 if(pbsdrmaa_job_update_status_accounting(self) == false) 250 251 252 252 if( status != NULL ) 253 253 { … … 255 255 } 256 256 else if( self->state < DRMAA_PS_DONE ) 257 { 258 #ifndef PBS_PROFESSIONAL 259 /*best effort call*/ 260 if (pbsdrmaa_job_update_status_accounting(self) == false) 261 self->on_missing( self ); 262 #else 257 263 self->on_missing( self ); 264 #endif 265 } 258 266 } 259 267 FINALLY -
trunk/pbs_drmaa/log_reader.c
r24 r25 79 79 pbsdrmaa_date_compare(const void *a, const void *b) ; 80 80 81 /* 82 * Snippets from log files 83 * 84 * PBS Pro 85 * 86 10/11/2011 14:43:29;0008;Server@nova;Job;2127218.nova;Job Queued at request of mamonski@endor.wcss.wroc.pl, owner = mamonski@endor.wcss.wroc.pl, job name = STDIN, queue = normal 87 10/11/2011 14:43:31;0008;Server@nova;Job;2127218.nova;Job Modified at request of Scheduler@nova.wcss.wroc.pl 88 10/11/2011 14:43:31;0008;Server@nova;Job;2127218.nova;Job Run at request of Scheduler@nova.wcss.wroc.pl on exec_vnode (wn698:ncpus=3:mem=2048000kb)+(wn700:ncpus=3:mem=2048000kb) 89 10/11/2011 14:43:31;0008;Server@nova;Job;2127218.nova;Job Modified at request of Scheduler@nova.wcss.wroc.pl 90 10/11/2011 14:43:32;0010;Server@nova;Job;2127218.nova;Exit_status=0 resources_used.cpupercent=0 resources_used.cput=00:00:00 resources_used.mem=1768kb resources_used.ncpus=6 resources_used.vmem=19228kb resources_used.walltime=00:00:01 91 92 * 93 * Torque 94 * 95 10/11/2011 14:47:59;0008;PBS_Server;Job;15545337.batch.grid.cyf-kr.edu.pl;Job Queued at request of plgmamonski@ui.cyf-kr.edu.pl, owner = plgmamonski@ui.cyf-kr.edu.pl, job name = STDIN, queue = l_short 96 10/11/2011 14:48:23;0008;PBS_Server;Job;15545337.batch.grid.cyf-kr.edu.pl;Job Run at request of root@batch.grid.cyf-kr.edu.pl 97 10/11/2011 14:48:24;0010;PBS_Server;Job;15545337.batch.grid.cyf-kr.edu.pl;Exit_status=0 resources_used.cput=00:00:00 resources_used.mem=720kb resources_used.vmem=13308kb resources_used.walltime=00:00:00 98 99 */ 81 100 pbsdrmaa_log_reader_t * 82 101 pbsdrmaa_log_reader_new ( fsd_drmaa_session_t *session, fsd_job_t *job ) … … 263 282 { 264 283 const char *volatile ptr = line; 265 284 char field[256] = ""; 266 285 char job_id[256] = ""; 267 286 char event[256] = ""; … … 275 294 bool volatile older_job_found = false; 276 295 bool volatile job_found = false; 277 char *temp_date = NULL;296 char * temp_date = NULL; 278 297 279 298 struct batch_status status; … … 286 305 temp_date 
= fsd_strdup(field); 287 306 } 288 else if(field_n == FLD_EVENT && (strcmp(field,FLD_MSG_STATUS) == 0 || 289 strcmp(field,FLD_MSG_STATE) == 0 )) 307 else if(field_n == FLD_EVENT && (strcmp(field,FLD_MSG_STATUS) == 0 || strcmp(field,FLD_MSG_STATE) == 0 )) 290 308 { 291 309 /* event described by log line*/ 292 if(strlcpy(event, field,sizeof(event)) > sizeof(event)) { 310 if(strlcpy(event, field,sizeof(event)) > sizeof(event)) 311 { 293 312 fsd_log_error(("%s - strlcpy error",self->name)); 294 313 } 295 event_match = true; 314 event_match = true; 296 315 } 297 316 else if(event_match && field_n == FLD_ID) … … 351 370 { 352 371 /* parse msg - depends on FLD_EVENT */ 353 struct attrl struct_resource_cput,struct_resource_mem,struct_resource_vmem, 354 struct_resource_walltime, struct_status, struct_state, struct_start_time,struct_mtime, struct_queue, struct_account_name; 372 struct attrl struct_resource_cput, 373 struct_resource_mem, 374 struct_resource_vmem, 375 struct_resource_walltime, 376 struct_status, 377 struct_state, 378 struct_start_time, 379 struct_mtime, 380 struct_queue, 381 struct_account_name; 355 382 356 383 bool state_running = false; 357 384 358 memset(&struct_status,0,sizeof(struct attrl)); /**/385 memset(&struct_status,0,sizeof(struct attrl)); 359 386 memset(&struct_state,0,sizeof(struct attrl)); 360 387 memset(&struct_resource_cput,0,sizeof(struct attrl)); … … 375 402 struct_state.name = "job_state"; 376 403 if(field[0] == 'J') /* Job Queued, Job Modified, Job Run*/ 377 {404 { 378 405 n = 4; 379 406 if(older_job_found) /* job_on_missing - older job beginning - read this file and end */ 380 {407 { 381 408 self->run_flag = false; 382 409 fsd_log_debug(("Job_on_missing found older job beginning")); 383 410 fsd_free(status.name); 384 411 break; 385 }386 }387 if(field[4] == 'M') { /* modified */388 struct tm temp_time_tm;389 memset(&temp_time_tm, 0, sizeof(temp_time_tm));390 temp_time_tm.tm_isdst = -1;391 392 if (strptime(temp_date, "%m/%d/%Y %H:%M:%S", 
&temp_time_tm) == NULL)393 {394 fsd_log_error(("failed to parse mtime: %s", temp_date));395 412 } 396 else 397 { 398 time_t temp_time = mktime(&temp_time_tm); 399 status.attribs = &struct_mtime; 400 struct_mtime.name = "mtime"; 401 struct_mtime.next = NULL; 402 struct_mtime.value = fsd_asprintf("%lu",temp_time); 413 414 { /* modified */ 415 struct tm temp_time_tm; 416 memset(&temp_time_tm, 0, sizeof(temp_time_tm)); 417 temp_time_tm.tm_isdst = -1; 418 419 if (strptime(temp_date, "%m/%d/%Y %H:%M:%S", &temp_time_tm) == NULL) 420 { 421 fsd_log_error(("failed to parse mtime: %s (line = %s)", temp_date, line)); 422 } 423 else 424 { 425 time_t temp_time = mktime(&temp_time_tm); 426 status.attribs = &struct_mtime; 427 struct_mtime.name = "mtime"; 428 struct_mtime.next = NULL; 429 struct_mtime.value = fsd_asprintf("%lu",temp_time); 430 } 403 431 } 404 } 432 } 433 405 434 /* != Job deleted and Job to be deleted*/ 406 435 #ifdef PBS_PROFESSIONAL 407 436 else if (field[4] != 't' && field[10] != 'd') { 408 #else 437 #else 409 438 else if(field[4] != 'd') { 410 #endif 439 #endif 411 440 412 441 if ((struct_state.value = fsd_asprintf("%c",field[n]) ) == NULL ) { /* 4 first letter of state */ … … 422 451 struct_status.next = NULL; 423 452 struct_state.next = &struct_status; 424 struct_state.value = fsd_strdup("C"); 453 struct_state.value = fsd_strdup("C"); 425 454 } 426 } 455 } 427 456 else /*if (strcmp(event,FLD_MSG_STATUS) == 0 )*/ 428 457 { … … 476 505 break; 477 506 } 478 507 479 508 ptr2 += n2; 480 509 msg_field_n++; 481 510 if ( *ptr2 != ' ' ) 482 {483 511 break; 484 } 485 ++ptr2; 486 } 512 ++ptr2; 513 } 487 514 struct_state.value = fsd_strdup("C"); /* we got exit_status so we say that it has completed */ 488 515 fsd_log_info(("WT - job %s found as finished on %u", temp_job->job_id, (unsigned int)time(NULL))); … … 490 517 491 518 if(self->job == NULL) /* wait_thread */ 492 { 519 { 493 520 if ( state_running ) 494 521 { … … 510 537 pbsjob->update( temp_job, &status ); 511 538 
} 512 513 514 539 } 540 else if( job_found ) /* job_on_missing */ 541 { 515 542 fsd_log_debug(("Job_on_missing - updating job: %s", self->job->job_id )); 516 543 pbsjob->update( self->job, &status ); 517 544 } 518 545 519 546 if(self->job == NULL) … … 541 568 else if(field_n == FLD_EVENT && strcmp(field,FLD_MSG_LOG) == 0) 542 569 { 543 log_event = true; 570 log_event = true; 544 571 } 545 572 else if (log_event && field_n == FLD_ID && strcmp(field,"Log") == 0 ) … … 575 602 576 603 FD_ZERO(&log_fds); 577 604 FD_SET(self->fd, &log_fds); 578 605 579 606 timeout_tv.tv_sec = 1; 580 607 timeout_tv.tv_usec = 0; 581 608 582 609 /* ignore return value - the next get line call will handle IO errors */ … … 589 616 } 590 617 EXCEPT_DEFAULT 591 618 { 592 619 const fsd_exc_t *e = fsd_exc_get(); 593 620 /* Its better to exit and communicate error rather then let the application to hang */ 594 621 fsd_log_fatal(( "Exception in wait thread %s: <%d:%s>. Exiting !!!", self->name, e->code(e), e->message(e) )); 595 622 exit(1); 596 623 } 597 624 END_TRY 598 625 … … 694 721 pbsdrmaa_date_compare(const void *a, const void *b) 695 722 { 696 697 698 723 const char *ia = *(const char **) a; 724 const char *ib = *(const char **) b; 725 return strcmp(ib, ia); 699 726 } 700 727 … … 1022 1049 break; 1023 1050 } 1024 1051 1025 1052 ptr2 += n2; 1026 1053 msg_field_n++; 1027 1054 if ( *ptr2 != ' ' ) 1028 { 1029 break; 1030 } 1031 ++ptr2; 1032 } 1055 break; 1056 1057 ++ptr2; 1058 } 1033 1059 } 1034 1035 if( job_found && status.attribs != NULL)1036 1060 1061 if( job_found && status.attribs != NULL) 1062 { 1037 1063 fsd_log_debug(("Accounting file - updating job: %s", self->job->job_id )); 1038 1064 pbsjob->update( self->job, &status ); 1039 1065 res = true; 1040 1066 } 1041 1067 1042 1068 if(self->job == NULL)
Note: See TracChangeset for help on using the changeset viewer.