Changeset 45 for trunk/pbs_drmaa/session.c
- Timestamp:
- 11/28/11 15:02:58 (12 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/pbs_drmaa/session.c
r41 r45 65 65 pbsdrmaa_session_new_job( fsd_drmaa_session_t *self, const char *job_id ); 66 66 67 static bool68 pbsdrmaa_session_do_drm_keeps_completed_jobs( pbsdrmaa_session_t *self );69 70 67 static void 71 68 pbsdrmaa_session_update_all_jobs_status( fsd_drmaa_session_t *self ); … … 115 112 self->super.apply_configuration = pbsdrmaa_session_apply_configuration; 116 113 117 self->do_drm_keeps_completed_jobs =118 pbsdrmaa_session_do_drm_keeps_completed_jobs;119 120 114 self->status_attrl = pbsdrmaa_create_status_attrl(); 121 122 { /* ugly. But this is life... ;( */ 123 #define MAX_PBS_CONNECT_RETRIES (3) 124 int tries_counter = MAX_PBS_CONNECT_RETRIES; 125 retry: 126 self->pbs_conn = pbs_connect( self->super.contact ); 127 fsd_log_info(( "pbs_connect(%s) =%d", self->super.contact, 128 self->pbs_conn )); 129 if( self->pbs_conn < 0 && tries_counter-- ) 130 { 131 sleep(1); 132 goto retry; 133 } 134 } 135 if( self->pbs_conn < 0 ) 136 pbsdrmaa_exc_raise_pbs( "pbs_connect" ); 115 self->max_retries_count = 3; 116 self->wait_thread_sleep_time = 1; 137 117 138 118 self->super.load_configuration( &self->super, "pbs_drmaa" ); 139 119 140 120 self->super.missing_jobs = FSD_IGNORE_MISSING_JOBS; 141 if( self->do_drm_keeps_completed_jobs( self ) ) 142 self->super.missing_jobs = FSD_IGNORE_QUEUED_MISSING_JOBS; 121 122 { 123 int tries_left = self->max_retries_count; 124 int sleep_time = 1; 125 retry_connect: /* Life... */ 126 self->pbs_conn = pbs_connect( self->super.contact ); 127 fsd_log_info(( "pbs_connect(%s) =%d", self->super.contact, self->pbs_conn )); 128 if( self->pbs_conn < 0 && tries_left-- ) 129 { 130 sleep(sleep_time++); 131 goto retry_connect; 132 } 133 134 if( self->pbs_conn < 0 ) 135 pbsdrmaa_exc_raise_pbs( "pbs_connect" ); 136 } 143 137 } 144 138 EXCEPT_DEFAULT … … 223 217 { 224 218 pbsdrmaa_session_t *pbsself = (pbsdrmaa_session_t*)self; 225 fsd_conf_option_t *pbs_home; 219 fsd_conf_option_t *pbs_home = NULL; 220 fsd_conf_option_t *wait_thread_sleep_time = NULL; 221 fsd_conf_option_t *max_retries_count = NULL; 222 226 223 pbs_home = fsd_conf_dict_get(self->configuration, "pbs_home" ); 224 wait_thread_sleep_time = fsd_conf_dict_get(self->configuration, "wait_thread_sleep_time" ); 225 max_retries_count = fsd_conf_dict_get(self->configuration, "max_retries_count" ); 227 226 228 227 if( pbs_home && pbs_home->type == FSD_CONF_STRING ) … … 259 258 } 260 259 260 if ( max_retries_count && max_retries_count->type == FSD_CONF_INTEGER) 261 { 262 pbsself->max_retries_count = max_retries_count->val.integer; 263 fsd_log_info(("Max retries count: %d", pbsself->max_retries_count)); 264 } 265 266 if ( wait_thread_sleep_time && wait_thread_sleep_time->type == FSD_CONF_INTEGER) 267 { 268 pbsself->wait_thread_sleep_time = wait_thread_sleep_time->val.integer; 269 fsd_log_info(("Wait thread sleep time: %d", pbsself->wait_thread_sleep_time)); 270 } 271 261 272 pbsself->super_apply_configuration(self); /* call method from the superclass */ 262 273 } … … 271 282 fsd_job_set_t *jobs = self->jobs; 272 283 struct batch_status *volatile status = NULL; 284 volatile int tries_left = pbsself->max_retries_count; 285 volatile int sleep_time = 1; 273 286 274 287 fsd_log_enter(("")); … … 284 297 status = pbs_statjob( pbsself->pbs_conn, NULL, pbsself->status_attrl, NULL ); 285 298 #endif 286 fsd_log_info(( "pbs_statjob( fd=%d, job_id=NULL, attribs={...} ) =%p", 287 pbsself->pbs_conn, (void*)status )); 299 fsd_log_info(( "pbs_statjob( fd=%d, job_id=NULL, attribs={...} ) =%p", pbsself->pbs_conn, (void*)status )); 288 300 if( status == NULL && pbs_errno != 0 ) 289 301 { … … 292 304 if ( pbsself->pbs_conn >= 0) 293 305 pbs_disconnect( pbsself->pbs_conn ); 294 sleep(1); 306 retry_connect: 307 sleep(sleep_time++); 295 308 pbsself->pbs_conn = pbs_connect( pbsself->super.contact ); 296 if( pbsself->pbs_conn < 0 ) 297 pbsdrmaa_exc_raise_pbs( "pbs_connect" ); 309 if( pbsself->pbs_conn < 0) 310 { 311 if (tries_left--) 312 goto retry_connect; 313 else 314 pbsdrmaa_exc_raise_pbs( "pbs_connect" ); 315 } 298 316 else 299 317 goto retry; … … 429 447 } 430 448 431 432 bool433 pbsdrmaa_session_do_drm_keeps_completed_jobs( pbsdrmaa_session_t *self )434 {435 436 #ifndef PBS_PROFESSIONAL437 struct attrl default_queue_query;438 struct attrl keep_completed_query;439 struct batch_status *default_queue_result = NULL;440 struct batch_status *keep_completed_result = NULL;441 const char *default_queue = NULL;442 const char *keep_completed = NULL;443 volatile bool result = false;444 volatile bool conn_lock = false;445 446 TRY447 {448 default_queue_query.next = NULL;449 default_queue_query.name = "default_queue";450 default_queue_query.resource = NULL;451 default_queue_query.value = NULL;452 keep_completed_query.next = NULL;453 keep_completed_query.name = "keep_completed";454 keep_completed_query.resource = NULL;455 keep_completed_query.value = NULL;456 457 conn_lock = fsd_mutex_lock( &self->super.drm_connection_mutex );458 459 default_queue_result =460 pbs_statserver( self->pbs_conn, &default_queue_query, NULL );461 if( default_queue_result == NULL )462 pbsdrmaa_exc_raise_pbs( "pbs_statserver" );463 if( default_queue_result->attribs464 && !strcmp( default_queue_result->attribs->name,465 "default_queue" ) )466 default_queue = default_queue_result->attribs->value;467 468 fsd_log_debug(( "default_queue: %s", default_queue ));469 470 if( default_queue )471 {472 keep_completed_result = pbs_statque( self->pbs_conn,473 (char*)default_queue, &keep_completed_query, NULL );474 if( keep_completed_result == NULL )475 pbsdrmaa_exc_raise_pbs( "pbs_statque" );476 if( keep_completed_result->attribs477 && !strcmp( keep_completed_result->attribs->name,478 "keep_completed" ) )479 keep_completed = keep_completed_result->attribs->value;480 }481 482 fsd_log_debug(( "keep_completed: %s", keep_completed ));483 }484 EXCEPT_DEFAULT485 {486 const fsd_exc_t *e = fsd_exc_get();487 fsd_log_warning(( "PBS server seems not to keep completed jobs\n"488 "detail: %s", e->message(e) ));489 result = false;490 }491 ELSE492 {493 result = false;494 if( default_queue == NULL )495 fsd_log_warning(( "no default queue set on PBS server" ));496 else if( keep_completed == NULL && self->pbs_home == NULL )497 fsd_log_warning(( "Torque server is not configured to keep completed jobs\n"498 "in Torque: set keep_completed parameter of default queue\n"499 " $ qmgr -c 'set queue batch keep_completed = 60'\n"500 " or configure DRMAA to utilize log files"501 ));502 else503 result = true;504 }505 FINALLY506 {507 if( default_queue_result )508 pbs_statfree( default_queue_result );509 if( keep_completed_result )510 pbs_statfree( keep_completed_result );511 if( conn_lock )512 conn_lock = fsd_mutex_unlock( &self->super.drm_connection_mutex );513 514 }515 END_TRY516 517 return result;518 #endif519 fsd_log_warning(( "PBS Professional does not keep information about the completed jobs\n"520 " You must configure DRMAA to utilize log files in order to always get valid job exit status"521 ));522 return false;523 }524 525 449 void * 526 450 pbsdrmaa_session_wait_thread( fsd_drmaa_session_t *self )
Note: See TracChangeset
for help on using the changeset viewer.