- Timestamp:
- 11/11/12 15:06:16 (13 years ago)
- Location:
- trunk
- Files:
-
- 6 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/ll_drmaa/Makefile.am
r20 r26 33 33 libdrmaa_la_LDFLAGS = @LL_LDFLAGS@ -version-info @LL_DRMAA_VERSION_INFO@ 34 34 35 bin_PROGRAMS = monitor35 bin_PROGRAMS = lldrmaa_monitor 36 36 37 monitor_SOURCES = monitor.c37 lldrmaa_monitor_SOURCES = monitor.c 38 38 39 monitor_CPPFLAGS = -I$(top_srcdir)/drmaa_utils39 lldrmaa_monitor_CPPFLAGS = -I$(top_srcdir)/drmaa_utils 40 40 41 41 dist_sysconf_DATA = ll_drmaa.conf.example -
trunk/ll_drmaa/drmaa.c
r20 r26 17 17 */ 18 18 19 #include <sys/timers.h>20 19 #include <sys/time.h> 21 20 #include <sys/wait.h> -
trunk/ll_drmaa/job.c
r20 r26 35 35 36 36 #include <llapi.h> 37 38 #define LL_DRMAA_MAX_MISSING_TIME (60) 37 39 38 40 static void … … 287 289 } 288 290 289 data = ll_get_objs(job, LL_ SCHEDD, NULL, &obj_count, &err_code);291 data = ll_get_objs(job, LL_CM, NULL, &obj_count, &err_code); 290 292 if (data == NULL) { 291 fsd_log_debug(("Code: %d ll_get_objs() returns NULL. %s",lldrmaa_map_get_objs(err_code), lldrmaa_err_get_objs(err_code) )); 292 /* This error means that there is no info in LL_SCHEDD but this job have probably ended and will be detected by wait_thread. This won't be fsd_log_warning because displays too often during program execution */ 293 if (err_code == LL_GET_OBJS_NO_OBJECTS_ERR) 294 { 295 if (self->state != DRMAA_PS_UNDETERMINED) 296 { 297 fsd_log_info(("Job %s missing. Assuming finished", self->job_id)); 298 self->state = DRMAA_PS_DONE; 299 self->exit_status = 0; 300 } 301 else if (llself->missing_time == 0) 302 { 303 llself->missing_time = time(NULL); 304 fsd_log_debug(("Job %s missing for the first time", self->job_id)); /* Job may not yet be visible in LL */ 305 } 306 else if (time(NULL) - llself->missing_time > LL_DRMAA_MAX_MISSING_TIME) 307 { 308 fsd_log_error(("Job %s missing for more then %d seconds. Assuming failef", self->job_id, LL_DRMAA_MAX_MISSING_TIME)); 309 /* Job may not yet be visible in LL */ 310 self->state = DRMAA_PS_FAILED; 311 self->exit_status = -1; 312 } 313 else 314 { 315 fsd_log_debug(("Job %s still missing", self->job_id)); 316 } 317 } 318 else 319 { 320 fsd_log_error(("Code: %d,%d ll_get_objs() returns NULL. 
%s", err_code, lldrmaa_map_get_objs(err_code), lldrmaa_err_get_objs(err_code) )); 321 } 293 322 } else { 294 323 … … 312 341 } 313 342 314 llself->read_job_info( 315 316 fsd_log_info(("LL State: %d -> DRMAA: %s", step_state,drmaa_job_ps_to_str(self->state)));343 llself->read_job_info(self, step_state, hold_type); 344 345 fsd_log_info(("LL State: %d -> DRMAA: %s", step_state, drmaa_job_ps_to_str(self->state))); 317 346 } 318 347 } … … 352 381 self->super.control = lldrmaa_job_control; 353 382 self->super.update_status = lldrmaa_job_update_status; 383 self->missing_time = 0; 354 384 self->read_job_info_mon = lldrmaa_job_read_job_info_mon; 355 385 self->read_job_info = lldrmaa_job_read_job_info; … … 471 501 } 472 502 473 TRY474 {475 const char *command = NULL;476 char *command_expanded = NULL;477 const char *const *i;478 int j;479 480 /* remote command */481 command = jt->get_attr( jt, DRMAA_REMOTE_COMMAND );482 if( command == NULL )483 fsd_exc_raise_msg(484 FSD_DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES,485 "drmaa_remote_command not set for job template"486 );487 488 command_expanded = expand->expand( expand, fsd_strdup(command), FSD_DRMAA_PH_HD | FSD_DRMAA_PH_WD );489 490 fprintf(fd,"# @ executable = %s\n", command_expanded);491 fsd_log_debug(("# @ executable = %s\n", command_expanded));492 493 fsd_free(command_expanded);494 495 /* arguments list */496 vector = jt->get_v_attr( jt, DRMAA_V_ARGV );497 498 if( vector )499 {500 fprintf(fd,"# @ arguments =");501 fsd_log_debug(("# @ arguments ="));502 503 for( i = vector, j = 2; *i; i++, j++ )504 {505 char *arg_expanded = expand->expand( expand, fsd_strdup(*i), FSD_DRMAA_PH_HD | FSD_DRMAA_PH_WD );506 507 fprintf(fd," '%s'", arg_expanded);508 fsd_log_debug(("%s", arg_expanded));509 510 fsd_free(arg_expanded);511 }512 }513 514 fprintf(fd," \n");515 }516 END_TRY517 518 503 /* start time */ 519 504 value = jt->get_attr( jt, DRMAA_START_TIME ); … … 779 764 fprintf(fd,"# @ queue\n"); 780 765 fsd_log_debug(("# @ queue")); 766 767 
fprintf(fd,"\n"); 768 fprintf(fd,"echo >&2\n"); /* this line forces creation of stderr file */ 769 770 TRY 771 { 772 const char *command = NULL; 773 char *command_expanded = NULL; 774 const char *const *i; 775 int j; 776 777 /* remote command */ 778 command = jt->get_attr( jt, DRMAA_REMOTE_COMMAND ); 779 if( command == NULL ) 780 fsd_exc_raise_msg( 781 FSD_DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES, 782 "drmaa_remote_command not set for job template" 783 ); 784 785 command_expanded = expand->expand( expand, fsd_strdup(command), FSD_DRMAA_PH_HD | FSD_DRMAA_PH_WD ); 786 787 fprintf(fd,"%s", command_expanded); /* we put the command at the end of the script (instead of using the @executable keyword) 788 in order to avoid copying of the binary */ 789 fsd_log_debug(("command = %s\n", command_expanded)); 790 791 fsd_free(command_expanded); 792 793 /* arguments list */ 794 vector = jt->get_v_attr( jt, DRMAA_V_ARGV ); 795 796 if( vector ) 797 { 798 fsd_log_debug(("arguments = ")); 799 800 for( i = vector, j = 2; *i; i++, j++ ) 801 { 802 char *arg_expanded = expand->expand( expand, fsd_strdup(*i), FSD_DRMAA_PH_HD | FSD_DRMAA_PH_WD ); 803 804 fprintf(fd," '%s'", arg_expanded); 805 fsd_log_debug(("%s", arg_expanded)); 806 807 fsd_free(arg_expanded); 808 } 809 } 810 811 fprintf(fd," \n"); 812 } 813 END_TRY 814 815 781 816 } 782 817 } -
trunk/ll_drmaa/job.h
r20 r26 36 36 struct lldrmaa_job_s { 37 37 fsd_job_t super; 38 time_t missing_time; 38 39 39 40 void (*read_job_info_mon)( fsd_job_t *self, const char * state , unsigned status); -
trunk/ll_drmaa/session.c
r20 r26 132 132 LL_job job_info; 133 133 134 char *monitor_program = LL_DRMAA_BIN_DIR"/ monitor";134 char *monitor_program = LL_DRMAA_BIN_DIR"/lldrmaa_monitor"; 135 135 136 136 TRY … … 147 147 148 148 connection_lock = fsd_mutex_lock( &self->drm_connection_mutex ); 149 status = llsubmit(cmd_path, monitor_program, llself->unix_socket_name, &job_info, LL_JOB_VERSION); 149 if (self->wait_thread_run_flag) 150 { 151 fsd_log_info(("llsubmit(%s, %s, %s, %p, %d)",cmd_path, monitor_program, llself->unix_socket_name, (void*)&job_info, LL_JOB_VERSION)); 152 status = llsubmit(cmd_path, monitor_program, llself->unix_socket_name, &job_info, LL_JOB_VERSION); 153 } 154 else 155 { 156 fsd_log_info(("llsubmit(%s, NULL, NULL, %p, %d)",cmd_path, (void*)&job_info, LL_JOB_VERSION)); 157 status = llsubmit(cmd_path, NULL, NULL, &job_info, LL_JOB_VERSION); 158 } 150 159 connection_lock = fsd_mutex_unlock( &self->drm_connection_mutex ); 151 160 152 if( remove(cmd_path) == -1)161 if(getenv("LLDRMAA_KEEP_CMD") == NULL && remove(cmd_path) == -1) 153 162 fsd_log_warning(("Can't delete cmd file: %s", cmd_path)); 154 163 … … 161 170 } 162 171 else /* 0 */ 172 { 163 173 fsd_log_debug(("llsubmit: %s",lldrmaa_err_submit(status))); 174 } 164 175 165 176 if( start != end ) … … 170 181 { 171 182 job_ids[i] = fsd_asprintf("%s.%d.%d", job_info.step_list[i]->id.from_host, job_info.step_list[i]->id.cluster, job_info.step_list[i]->id.proc); 172 183 fsd_log_info((" new array job id: %s", job_ids[i])); 173 184 job = lldrmaa_job_new( fsd_strdup(job_ids[i]) ); 174 185 job->session = self; … … 185 196 job_ids[0] = fsd_asprintf( "%s.%d.0", job_info.step_list[0]->id.from_host, job_info.step_list[0]->id.cluster); 186 197 198 fsd_log_info((" new job id: %s", job_ids[0])); 187 199 job = lldrmaa_job_new( fsd_strdup(job_ids[0]) ); 188 200 job->session = self; … … 247 259 } 248 260 249 if ( !self->wait_thread_started )250 fsd_exc_raise_msg(FSD_ERRNO_INTERNAL_ERROR, "DRMAA for LL requires that wait thread is enable. 
Don't disable it in configuration file!" );251 261 } 252 262 -
trunk/m4/ax_ll.m4
r20 r26 45 45 LL_INCLUDES="-I${with_ll_inc}" 46 46 else 47 LLSUBMIT_PATH=` which llsubmit`47 LLSUBMIT_PATH=`readlink -f $(which llsubmit)` 48 48 if test x"$LLSUBMIT_PATH" != x; then 49 49 LLSUBMIT_DIR=`dirname $LLSUBMIT_PATH` 50 50 LL_HOME=`dirname $LLSUBMIT_DIR` 51 51 LL_INCLUDES="-I$LL_HOME/include" 52 AC_MSG_NOTICE([no --with-ll-inc given. Using llsubmit path based guess: $LL_INCLUDES]) 52 53 else 53 54 ax_ll_msg="no llsubmit in PATH" … … 61 62 if test x$with_ll_lib == x; then 62 63 63 LLSUBMIT_PATH=` which llsubmit`64 LLSUBMIT_PATH=`readlink -f $(which llsubmit)` 64 65 65 66 if test x"$LLSUBMIT_PATH" != x; then … … 67 68 LL_HOME=`dirname $LLSUBMIT_DIR` 68 69 with_ll_lib=$LL_HOME/lib 70 AC_MSG_NOTICE([no --with-ll-lib given. Using llsubmit path based guess: $with_ll_lib]) 69 71 else 70 72 ax_ll_msg="no llsubmit in PATH"
Note: See TracChangeset
for help on using the changeset viewer.