[12] | 1 | /* $Id$ */ |
---|
[7] | 2 | /* |
---|
| 3 | * FedStage DRMAA for PBS Pro |
---|
| 4 | * Copyright (C) 2006-2007 FedStage Systems |
---|
| 5 | * |
---|
| 6 | * This program is free software: you can redistribute it and/or modify |
---|
| 7 | * it under the terms of the GNU General Public License as published by |
---|
| 8 | * the Free Software Foundation, either version 3 of the License, or |
---|
| 9 | * (at your option) any later version. |
---|
| 10 | * |
---|
| 11 | * This program is distributed in the hope that it will be useful, |
---|
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
| 14 | * GNU General Public License for more details. |
---|
| 15 | * |
---|
| 16 | * You should have received a copy of the GNU General Public License |
---|
| 17 | * along with this program. If not, see <http://www.gnu.org/licenses/>. |
---|
| 18 | */ |
---|
| 19 | |
---|
| 20 | #ifdef HAVE_CONFIG_H |
---|
| 21 | # include <config.h> |
---|
| 22 | #endif |
---|
| 23 | |
---|
| 24 | #include <stdlib.h> |
---|
| 25 | #include <string.h> |
---|
| 26 | #include <unistd.h> |
---|
| 27 | #include <sys/select.h> |
---|
| 28 | #include <sys/stat.h> |
---|
| 29 | #include <sys/types.h> |
---|
| 30 | #include <dirent.h> |
---|
| 31 | #include <fcntl.h> |
---|
| 32 | |
---|
| 33 | #include <pbs_ifl.h> |
---|
| 34 | #include <pbs_error.h> |
---|
| 35 | |
---|
| 36 | #include <drmaa_utils/datetime.h> |
---|
| 37 | #include <drmaa_utils/drmaa.h> |
---|
| 38 | #include <drmaa_utils/iter.h> |
---|
| 39 | #include <drmaa_utils/conf.h> |
---|
| 40 | #include <drmaa_utils/session.h> |
---|
| 41 | #include <drmaa_utils/datetime.h> |
---|
| 42 | |
---|
| 43 | #include <pbs_drmaa/job.h> |
---|
| 44 | #include <pbs_drmaa/log_reader.h> |
---|
| 45 | #include <pbs_drmaa/session.h> |
---|
| 46 | #include <pbs_drmaa/submit.h> |
---|
| 47 | #include <pbs_drmaa/util.h> |
---|
[29] | 48 | #include <pbs_drmaa/pbs_attrib.h> |
---|
[7] | 49 | |
---|
| 50 | #include <errno.h> |
---|
| 51 | |
---|
/*
 * Zero-based positions of the ';'-separated fields of a server log line,
 * in the order they appear, e.g.:
 *   10/11/2011 14:43:29;0008;Server@nova;Job;2127218.nova;Job Queued at ...
 */
enum pbsdrmaa_field_id
{
	PBSDRMAA_FLD_ID_DATE = 0,   /* timestamp, "MM/DD/YYYY HH:MM:SS" */
	PBSDRMAA_FLD_ID_EVENT,      /* four character event code, e.g. "0008" */
	PBSDRMAA_FLD_ID_SRC,        /* emitting server (not used by the reader) */
	PBSDRMAA_FLD_ID_OBJ_TYPE,   /* object type, e.g. "Job" */
	PBSDRMAA_FLD_ID_OBJ_ID,     /* object identifier (job id for "Job") */
	PBSDRMAA_FLD_ID_MSG         /* free-form message text */
};
---|
[7] | 61 | |
---|
| 62 | |
---|
/*
 * Event codes (second field of a log line) the reader reacts to.
 * In the sample log lines quoted in this file, "0008" carries the
 * "Job Queued" / "Job Run" / "Job deleted" messages and "0010" carries the
 * "Exit_status=..." line.
 */
#define PBSDRMAA_FLD_MSG_0008 "0008"
#define PBSDRMAA_FLD_MSG_0010 "0010"

/* Numeric counterparts of the textual event codes above. */
enum pbsdrmaa_event_type
{
	pbsdrmaa_event_0008 = 8,
	pbsdrmaa_event_0010 = 10
};
---|
[7] | 71 | |
---|
[29] | 72 | static void pbsdrmaa_read_log(); |
---|
[7] | 73 | |
---|
[29] | 74 | static void pbsdrmaa_select_file_wait_thread( pbsdrmaa_log_reader_t * self); |
---|
[21] | 75 | |
---|
[29] | 76 | char *pbsdrmaa_read_line_wait_thread( pbsdrmaa_log_reader_t * self); |
---|
[21] | 77 | |
---|
[29] | 78 | static time_t pbsdrmaa_parse_log_timestamp(const char *timestamp, char *unixtime_str, size_t size); |
---|
[21] | 79 | |
---|
[29] | 80 | static char *pbsdrmaa_get_exec_host_from_accountig(pbsdrmaa_log_reader_t * log_reader, const char *job_id); |
---|
[7] | 81 | |
---|
[25] | 82 | /* |
---|
| 83 | * Snippets from log files |
---|
| 84 | * |
---|
| 85 | * PBS Pro |
---|
| 86 | * |
---|
| 87 | 10/11/2011 14:43:29;0008;Server@nova;Job;2127218.nova;Job Queued at request of mamonski@endor.wcss.wroc.pl, owner = mamonski@endor.wcss.wroc.pl, job name = STDIN, queue = normal |
---|
| 88 | 10/11/2011 14:43:31;0008;Server@nova;Job;2127218.nova;Job Modified at request of Scheduler@nova.wcss.wroc.pl |
---|
| 89 | 10/11/2011 14:43:31;0008;Server@nova;Job;2127218.nova;Job Run at request of Scheduler@nova.wcss.wroc.pl on exec_vnode (wn698:ncpus=3:mem=2048000kb)+(wn700:ncpus=3:mem=2048000kb) |
---|
| 90 | 10/11/2011 14:43:31;0008;Server@nova;Job;2127218.nova;Job Modified at request of Scheduler@nova.wcss.wroc.pl |
---|
| 91 | 10/11/2011 14:43:32;0010;Server@nova;Job;2127218.nova;Exit_status=0 resources_used.cpupercent=0 resources_used.cput=00:00:00 resources_used.mem=1768kb resources_used.ncpus=6 resources_used.vmem=19228kb resources_used.walltime=00:00:01 |
---|
| 92 | |
---|
| 93 | * |
---|
| 94 | * Torque |
---|
| 95 | * |
---|
| 96 | 10/11/2011 14:47:59;0008;PBS_Server;Job;15545337.batch.grid.cyf-kr.edu.pl;Job Queued at request of plgmamonski@ui.cyf-kr.edu.pl, owner = plgmamonski@ui.cyf-kr.edu.pl, job name = STDIN, queue = l_short |
---|
| 97 | 10/11/2011 14:48:23;0008;PBS_Server;Job;15545337.batch.grid.cyf-kr.edu.pl;Job Run at request of root@batch.grid.cyf-kr.edu.pl |
---|
| 98 | 10/11/2011 14:48:24;0010;PBS_Server;Job;15545337.batch.grid.cyf-kr.edu.pl;Exit_status=0 resources_used.cput=00:00:00 resources_used.mem=720kb resources_used.vmem=13308kb resources_used.walltime=00:00:00 |
---|
| 99 | |
---|
[29] | 100 | deleting job: |
---|
| 101 | I . PBS Pro |
---|
| 102 | a) in Q state |
---|
| 103 | 10/16/2011 09:49:25;0008;Server@grass1;Job;2178.grass1.man.poznan.pl;Job Queued at request of mmamonski@grass1.man.poznan.pl, owner = mmamonski@grass1.man.poznan.pl, job name = STDIN, queue = workq |
---|
| 104 | 10/16/2011 09:49:25;0008;Server@grass1;Job;2178.grass1.man.poznan.pl;Job Modified at request of Scheduler@grass1.man.poznan.pl |
---|
| 105 | 10/16/2011 09:49:37;0008;Server@grass1;Job;2178.grass1.man.poznan.pl;Job to be deleted at request of mmamonski@grass1.man.poznan.pl |
---|
| 106 | 10/16/2011 09:49:37;0100;Server@grass1;Job;2178.grass1.man.poznan.pl;dequeuing from workq, state 5 |
---|
| 107 | |
---|
| 108 | |
---|
| 109 | b) in R state |
---|
| 110 | 10/16/2011 09:45:12;0080;Server@grass1;Job;2177.grass1.man.poznan.pl;delete job request received |
---|
| 111 | 10/16/2011 09:45:12;0008;Server@grass1;Job;2177.grass1.man.poznan.pl;Job sent signal TermJob on delete |
---|
| 112 | 10/16/2011 09:45:12;0008;Server@grass1;Job;2177.grass1.man.poznan.pl;Job to be deleted at request of mmamonski@grass1.man.poznan.pl |
---|
| 113 | 10/16/2011 09:45:12;0010;Server@grass1;Job;2177.grass1.man.poznan.pl;Exit_status=271 resources_used.cpupercent=0 resources_used.cput=00:00:00 resources_used.mem=2772kb resources_used.ncpus=1 resources_used.vmem=199288kb resources_used.walltime=00:00:26 |
---|
| 114 | 10/16/2011 09:45:12;0100;Server@grass1;Job;2177.grass1.man.poznan.pl;dequeuing from workq, state 5 |
---|
| 115 | |
---|
| 116 | II. Torque |
---|
| 117 | a) in Q state |
---|
| 118 | 10/15/2011 21:19:25;0008;PBS_Server;Job;113045.grass1.man.poznan.pl;Job deleted at request of mmamonski@grass1.man.poznan.pl |
---|
| 119 | 10/15/2011 21:19:25;0100;PBS_Server;Job;113045.grass1.man.poznan.pl;dequeuing from batch, state EXITING |
---|
| 120 | |
---|
| 121 | b) in R state |
---|
| 122 | 10/15/2011 21:19:47;0008;PBS_Server;Job;113046.grass1.man.poznan.pl;Job deleted at request of mmamonski@grass1.man.poznan.pl |
---|
| 123 | 10/15/2011 21:19:47;0008;PBS_Server;Job;113046.grass1.man.poznan.pl;Job sent signal SIGTERM on delete |
---|
| 124 | 10/15/2011 21:19:47;0010;PBS_Server;Job;113046.grass1.man.poznan.pl;Exit_status=271 resources_used.cput=00:00:00 resources_used.mem=0kb resources_used.vmem=0kb resources_used.walltime=00:00:10 |
---|
| 125 | |
---|
| 126 | Log closed: |
---|
| 127 | 10/16/2011 00:00:17;0002;PBS_Server;Svr;Log;Log closed |
---|
| 128 | |
---|
[25] | 129 | */ |
---|
[7] | 130 | pbsdrmaa_log_reader_t * |
---|
[29] | 131 | pbsdrmaa_log_reader_new( fsd_drmaa_session_t *session ) |
---|
[7] | 132 | { |
---|
| 133 | pbsdrmaa_log_reader_t *volatile self = NULL; |
---|
| 134 | |
---|
| 135 | fsd_log_enter(("")); |
---|
[29] | 136 | |
---|
[7] | 137 | TRY |
---|
| 138 | { |
---|
| 139 | fsd_malloc(self, pbsdrmaa_log_reader_t ); |
---|
| 140 | |
---|
| 141 | self->session = session; |
---|
[29] | 142 | |
---|
| 143 | self->select_file = pbsdrmaa_select_file_wait_thread; |
---|
[7] | 144 | self->read_log = pbsdrmaa_read_log; |
---|
| 145 | |
---|
| 146 | self->run_flag = true; |
---|
[29] | 147 | self->fhandle = NULL; |
---|
[7] | 148 | self->date_changed = true; |
---|
| 149 | self->first_open = true; |
---|
| 150 | |
---|
| 151 | } |
---|
| 152 | EXCEPT_DEFAULT |
---|
| 153 | { |
---|
| 154 | if( self != NULL) |
---|
| 155 | fsd_free(self); |
---|
| 156 | |
---|
| 157 | fsd_exc_reraise(); |
---|
| 158 | } |
---|
| 159 | END_TRY |
---|
[29] | 160 | |
---|
[7] | 161 | fsd_log_return(("")); |
---|
[29] | 162 | |
---|
[7] | 163 | return self; |
---|
| 164 | } |
---|
| 165 | |
---|
[21] | 166 | |
---|
[7] | 167 | void |
---|
| 168 | pbsdrmaa_log_reader_destroy ( pbsdrmaa_log_reader_t * self ) |
---|
| 169 | { |
---|
| 170 | fsd_log_enter(("")); |
---|
| 171 | TRY |
---|
| 172 | { |
---|
| 173 | if(self != NULL) |
---|
| 174 | { |
---|
| 175 | fsd_free(self); |
---|
[29] | 176 | } |
---|
[7] | 177 | } |
---|
| 178 | EXCEPT_DEFAULT |
---|
| 179 | { |
---|
| 180 | fsd_exc_reraise(); |
---|
| 181 | } |
---|
| 182 | END_TRY |
---|
| 183 | |
---|
| 184 | fsd_log_return(("")); |
---|
| 185 | } |
---|
| 186 | |
---|
| 187 | |
---|
[29] | 188 | void |
---|
[7] | 189 | pbsdrmaa_read_log( pbsdrmaa_log_reader_t * self ) |
---|
| 190 | { |
---|
| 191 | fsd_log_enter(("")); |
---|
| 192 | |
---|
[29] | 193 | fsd_mutex_lock( &self->session->mutex ); |
---|
[8] | 194 | |
---|
[7] | 195 | TRY |
---|
[29] | 196 | { |
---|
[7] | 197 | while( self->run_flag ) |
---|
[29] | 198 | { |
---|
| 199 | TRY |
---|
[7] | 200 | { |
---|
[29] | 201 | char *line = NULL; |
---|
[8] | 202 | |
---|
[29] | 203 | self->select_file(self); |
---|
[7] | 204 | |
---|
[29] | 205 | while ((line = fsd_readline(self->fhandle)) != NULL) |
---|
| 206 | { |
---|
| 207 | int field_id = PBSDRMAA_FLD_ID_DATE; |
---|
| 208 | char *tok_ctx = NULL; |
---|
| 209 | char *field_token = NULL; |
---|
| 210 | char *event_timestamp = NULL; |
---|
| 211 | int event_type = -1; |
---|
| 212 | fsd_job_t *job = NULL; |
---|
[7] | 213 | |
---|
[29] | 214 | /* at first detect if this not the end of log file */ |
---|
| 215 | if (strstr(line, "Log;Log closed")) /*TODO try to be more effective and safe */ |
---|
| 216 | { |
---|
| 217 | fsd_log_debug(("WT - Date changed. Closing log file")); |
---|
| 218 | self->date_changed = true; |
---|
[30] | 219 | fsd_free(line); |
---|
| 220 | break; |
---|
[29] | 221 | } |
---|
[7] | 222 | |
---|
[29] | 223 | for (field_token = strtok_r(line, ";", &tok_ctx); field_token; field_token = strtok_r(NULL, ";", &tok_ctx), field_id++) |
---|
| 224 | { |
---|
| 225 | if ( field_id == PBSDRMAA_FLD_ID_DATE) |
---|
| 226 | { |
---|
| 227 | event_timestamp = field_token; |
---|
| 228 | } |
---|
| 229 | else if ( field_id == PBSDRMAA_FLD_ID_EVENT) |
---|
| 230 | { |
---|
| 231 | if (strncmp(field_token, PBSDRMAA_FLD_MSG_0008, 4) == 0) |
---|
| 232 | event_type = pbsdrmaa_event_0008; |
---|
| 233 | else if (strncmp(field_token, PBSDRMAA_FLD_MSG_0010, 4) == 0) |
---|
| 234 | event_type = pbsdrmaa_event_0010; |
---|
| 235 | else |
---|
[30] | 236 | { |
---|
| 237 | fsd_free(line); |
---|
[29] | 238 | break; /*we are interested only in the above log messages */ |
---|
[30] | 239 | } |
---|
[29] | 240 | } |
---|
| 241 | else if ( field_id == PBSDRMAA_FLD_ID_SRC) |
---|
| 242 | { |
---|
| 243 | /* not used ignore */ |
---|
| 244 | } |
---|
| 245 | else if (field_id == PBSDRMAA_FLD_ID_OBJ_TYPE) |
---|
| 246 | { |
---|
| 247 | if (strncmp(field_token, "Job", 3) != 0) |
---|
[30] | 248 | { |
---|
| 249 | fsd_free(line); |
---|
[29] | 250 | break; /* we are interested only in job events */ |
---|
[30] | 251 | } |
---|
[29] | 252 | } |
---|
| 253 | else if (field_id == PBSDRMAA_FLD_ID_OBJ_ID) |
---|
| 254 | { |
---|
| 255 | const char *event_jobid = field_token; |
---|
[26] | 256 | |
---|
[29] | 257 | TRY |
---|
[25] | 258 | { |
---|
[29] | 259 | job = self->session->get_job( self->session, event_jobid ); |
---|
| 260 | |
---|
| 261 | if( job ) |
---|
[25] | 262 | { |
---|
[29] | 263 | fsd_log_debug(("WT - Found job event: %s", event_jobid)); |
---|
| 264 | } |
---|
| 265 | else |
---|
| 266 | { |
---|
| 267 | fsd_log_debug(("WT - Unknown job: %s", event_jobid)); /* Not a DRMAA job */ |
---|
[30] | 268 | fsd_free(line); |
---|
[7] | 269 | break; |
---|
[25] | 270 | } |
---|
[29] | 271 | } |
---|
| 272 | END_TRY |
---|
| 273 | } |
---|
| 274 | else if (field_id == PBSDRMAA_FLD_ID_MSG) |
---|
| 275 | { |
---|
| 276 | char *msg = field_token; |
---|
| 277 | struct batch_status status; |
---|
| 278 | struct attrl *attribs = NULL; |
---|
| 279 | bool in_running_state = false; |
---|
[7] | 280 | |
---|
[29] | 281 | if (event_type == pbsdrmaa_event_0008 && strncmp(msg, "Job Queued", 10) == 0) |
---|
| 282 | { |
---|
| 283 | /* Queued |
---|
| 284 | * PBS Pro: 10/11/2011 14:43:29;0008;Server@nova;Job;2127218.nova;Job Queued at request of mamonski@endor.wcss.wroc.pl, owner = mamonski@endor.wcss.wroc.pl, job name = STDIN, queue = normal |
---|
| 285 | * Torque: 10/11/2011 14:47:59;0008;PBS_Server;Job;15545337.batch.grid.cyf-kr.edu.pl;Job Queued at request of plgmamonski@ui.cyf-kr.edu.pl, owner = plgmamonski@ui.cyf-kr.edu.pl, job name = STDIN, queue = l_short |
---|
| 286 | */ |
---|
| 287 | char *p_queue = NULL; |
---|
[25] | 288 | |
---|
[30] | 289 | fsd_log_info(("WT - Detected queued of job %s", job->job_id)); |
---|
| 290 | |
---|
[29] | 291 | if ((p_queue = strstr(msg,"queue =")) == NULL) |
---|
| 292 | fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,"No queue attribute found in log line = %s", line); |
---|
[25] | 293 | |
---|
[30] | 294 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_JOB_STATE, "Q"); |
---|
| 295 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_QUEUE, p_queue + 7); |
---|
| 296 | } |
---|
| 297 | else if (event_type == pbsdrmaa_event_0008 && strncmp(msg, "Job Run", 7) == 0) |
---|
| 298 | { |
---|
| 299 | /* |
---|
| 300 | * Running |
---|
| 301 | * Torque: 10/11/2011 14:48:23;0008;PBS_Server;Job;15545337.batch.grid.cyf-kr.edu.pl;Job Run at request of root@batch.grid.cyf-kr.edu.pl |
---|
| 302 | * PBS Pro: 10/11/2011 14:43:31;0008;Server@nova;Job;2127218.nova;Job Run at request of Scheduler@nova.wcss.wroc.pl on exec_vnode (wn698:ncpus=3:mem=2048000kb)+(wn700:ncpus=3:mem=2048000kb) |
---|
| 303 | */ |
---|
| 304 | char timestamp_unix[64]; |
---|
| 305 | |
---|
| 306 | fsd_log_info(("WT - Detected start of job %s", job->job_id)); |
---|
| 307 | |
---|
[29] | 308 | (void)pbsdrmaa_parse_log_timestamp(event_timestamp, timestamp_unix, sizeof(timestamp_unix)); |
---|
| 309 | |
---|
| 310 | in_running_state = true; |
---|
| 311 | |
---|
| 312 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_JOB_STATE, "R"); |
---|
| 313 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_START_TIME, timestamp_unix); |
---|
[26] | 314 | #ifdef PBS_PROFESSIONAL |
---|
| 315 | { |
---|
| 316 | char *p_vnode = NULL; |
---|
[27] | 317 | if ((p_vnode = strstr(field, "exec_vnode"))) |
---|
[26] | 318 | { |
---|
| 319 | last_attr->next = &struct_exec_vnode; |
---|
| 320 | last_attr = &struct_exec_vnode; |
---|
| 321 | struct_exec_vnode.name = "exec_vnode"; |
---|
| 322 | struct_exec_vnode.next = NULL; |
---|
[27] | 323 | struct_exec_vnode.value = fsd_strdup(p_vnode + 11); |
---|
[26] | 324 | } |
---|
| 325 | } |
---|
| 326 | #endif |
---|
| 327 | } |
---|
[29] | 328 | #ifndef PBS_PBS_PROFESSIONAL |
---|
| 329 | else if (event_type == pbsdrmaa_event_0008 && strncmp(msg, "Job deleted", 11)) |
---|
| 330 | #else |
---|
| 331 | else if (event_type == pbsdrmaa_event_0008 && strncmp(msg, "Job to be deleted", 17)) |
---|
| 332 | #endif |
---|
| 333 | { |
---|
| 334 | /* Deleted |
---|
| 335 | * PBS Pro: 10/16/2011 09:45:12;0008;Server@grass1;Job;2177.grass1.man.poznan.pl;Job to be deleted at request of mmamonski@grass1.man.poznan.pl |
---|
| 336 | * Torque: 10/15/2011 21:19:25;0008;PBS_Server;Job;113045.grass1.man.poznan.pl;Job deleted at request of mmamonski@grass1.man.poznan.pl |
---|
| 337 | */ |
---|
| 338 | char timestamp_unix[64]; |
---|
[7] | 339 | |
---|
[30] | 340 | fsd_log_info(("WT - Detected deletion of job %s", job->job_id)); |
---|
| 341 | |
---|
[29] | 342 | (void)pbsdrmaa_parse_log_timestamp(event_timestamp, timestamp_unix, sizeof(timestamp_unix)); |
---|
[7] | 343 | |
---|
[29] | 344 | if (job->state < DRMAA_PS_RUNNING) |
---|
| 345 | { |
---|
| 346 | fsd_log_info(("Job %s killed before entering running state (%d).", job->job_id, job->state)); |
---|
| 347 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_JOB_STATE, "C"); |
---|
| 348 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_MTIME, timestamp_unix); |
---|
| 349 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_EXIT_STATUS, "-2"); |
---|
| 350 | } |
---|
| 351 | else |
---|
| 352 | { |
---|
[30] | 353 | job->release( job ); |
---|
| 354 | fsd_free(line); |
---|
[29] | 355 | break; /* job was started, ignore, wait for Exit_status message */ |
---|
| 356 | } |
---|
| 357 | } |
---|
| 358 | else if (event_type == pbsdrmaa_event_0010 && strncmp(msg, "Exit_status=", 12)) |
---|
| 359 | { |
---|
| 360 | /* Completed: |
---|
| 361 | * PBS Pro: 10/11/2011 14:43:32;0010;Server@nova;Job;2127218.nova;Exit_status=0 resources_used.cpupercent=0 resources_used.cput=00:00:00 resources_used.mem=1768kb resources_used.ncpus=6 resources_used.vmem=19228kb resources_used.walltime=00:00:01 |
---|
| 362 | * Torque: 10/11/2011 14:48:24;0010;PBS_Server;Job;15545337.batch.grid.cyf-kr.edu.pl;Exit_status=0 resources_used.cput=00:00:00 resources_used.mem=720kb resources_used.vmem=13308kb resources_used.walltime=00:00:00 |
---|
| 363 | */ |
---|
| 364 | char timestamp_unix[64]; |
---|
| 365 | time_t timestamp_time_t = pbsdrmaa_parse_log_timestamp(event_timestamp, timestamp_unix, sizeof(timestamp_unix)); |
---|
| 366 | char *tok_ctx2 = NULL; |
---|
| 367 | char *token = NULL; |
---|
[7] | 368 | |
---|
[29] | 369 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_JOB_STATE, "C"); |
---|
| 370 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_MTIME, timestamp_unix); |
---|
[7] | 371 | |
---|
[29] | 372 | /* tokenize !!! */ |
---|
| 373 | for (token = strtok_r(msg, " ", &tok_ctx2); token; token = strtok_r(NULL, " ", &tok_ctx2)) |
---|
| 374 | { |
---|
| 375 | if (strncmp(token, "Exit_status=", 12) == 0) |
---|
| 376 | { |
---|
| 377 | token[12] = '\0'; |
---|
| 378 | attribs = pbsdrmaa_add_attr(attribs, token, token + 12); |
---|
| 379 | fsd_log_info(("WT - Completion of job %s (Exit_status=%s) detected after %d seconds", job->job_id, token+12, (int)(time(NULL) - timestamp_time_t) )); |
---|
| 380 | } |
---|
| 381 | else if (strncmp(token, "resources_used.cput=", 20) == 0) |
---|
| 382 | { |
---|
| 383 | token[20] = '\0'; |
---|
| 384 | attribs = pbsdrmaa_add_attr(attribs, token, token + 20); |
---|
| 385 | } |
---|
| 386 | else if (strncmp(token, "resources_used.mem=", 19) == 0) |
---|
| 387 | { |
---|
| 388 | token[19] = '\0'; |
---|
| 389 | attribs = pbsdrmaa_add_attr(attribs, token, token + 19); |
---|
| 390 | } |
---|
| 391 | else if (strncmp(token, "resources_used.vmem=", 20) == 0) |
---|
| 392 | { |
---|
| 393 | token[20] = '\0'; |
---|
| 394 | attribs = pbsdrmaa_add_attr(attribs, token, token + 20); |
---|
| 395 | } |
---|
| 396 | else if (strncmp(token, "resources_used.walltime=", 24) == 0) |
---|
| 397 | { |
---|
| 398 | token[24] = '\0'; |
---|
| 399 | attribs = pbsdrmaa_add_attr(attribs, token, token + 24); |
---|
| 400 | } |
---|
| 401 | } |
---|
[7] | 402 | |
---|
[29] | 403 | if (!job->execution_hosts) |
---|
| 404 | { |
---|
| 405 | char *exec_host = NULL; |
---|
| 406 | fsd_log_info(("WT - No execution host information for job %s. Reading accounting logs...", job->job_id)); |
---|
| 407 | exec_host = pbsdrmaa_get_exec_host_from_accountig(self, job->job_id); |
---|
| 408 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_EXECUTION_HOST, exec_host); |
---|
| 409 | fsd_free(exec_host); |
---|
| 410 | } |
---|
| 411 | } |
---|
| 412 | else |
---|
| 413 | { |
---|
[30] | 414 | job->release( job ); |
---|
| 415 | fsd_free(line); |
---|
[29] | 416 | break; /* ignore other job events*/ |
---|
| 417 | } |
---|
[25] | 418 | |
---|
[29] | 419 | if ( in_running_state ) |
---|
| 420 | { |
---|
| 421 | fsd_log_debug(("WT - forcing update of job: %s", job->job_id )); |
---|
[18] | 422 | TRY |
---|
| 423 | { |
---|
[29] | 424 | job->update_status( job ); |
---|
[18] | 425 | } |
---|
| 426 | EXCEPT_DEFAULT |
---|
| 427 | { |
---|
| 428 | /*TODO: distinguish between invalid job and internal errors */ |
---|
[29] | 429 | fsd_log_debug(("Job finished just after entering running state: %s", job->job_id)); |
---|
[18] | 430 | } |
---|
| 431 | END_TRY |
---|
[29] | 432 | } |
---|
[7] | 433 | else |
---|
[29] | 434 | { |
---|
| 435 | fsd_log_debug(("WT - updating job: %s", job->job_id )); |
---|
| 436 | status.name = job->job_id; |
---|
| 437 | status.attribs = attribs; |
---|
| 438 | |
---|
| 439 | ((pbsdrmaa_job_t *)job)->update( job, &status ); |
---|
| 440 | |
---|
| 441 | pbsdrmaa_free_attrl(attribs); /* TODO free on exception */ |
---|
| 442 | } |
---|
| 443 | |
---|
| 444 | fsd_cond_broadcast( &job->status_cond); |
---|
[7] | 445 | fsd_cond_broadcast( &self->session->wait_condition ); |
---|
| 446 | |
---|
[29] | 447 | if ( job ) |
---|
| 448 | job->release( job ); |
---|
| 449 | |
---|
| 450 | fsd_free(line); /* TODO free on exception */ |
---|
| 451 | } |
---|
| 452 | else |
---|
| 453 | { |
---|
| 454 | fsd_assert(0); /*not reached */ |
---|
| 455 | } |
---|
| 456 | } |
---|
| 457 | |
---|
| 458 | } /* end of while getline loop */ |
---|
| 459 | |
---|
[30] | 460 | |
---|
[29] | 461 | { /* poll on log file */ |
---|
| 462 | struct timeval timeout_tv; |
---|
| 463 | fd_set log_fds; |
---|
| 464 | |
---|
| 465 | fsd_mutex_unlock( &self->session->mutex ); |
---|
[7] | 466 | |
---|
[29] | 467 | FD_ZERO(&log_fds); |
---|
| 468 | FD_SET(fileno(self->fhandle), &log_fds); |
---|
[7] | 469 | |
---|
[29] | 470 | timeout_tv.tv_sec = 1; |
---|
| 471 | timeout_tv.tv_usec = 0; |
---|
[19] | 472 | |
---|
[29] | 473 | /* ignore return value - the next get line call will handle IO errors */ |
---|
| 474 | (void)select(1, &log_fds, NULL, NULL, &timeout_tv); |
---|
[19] | 475 | |
---|
[29] | 476 | fsd_mutex_lock( &self->session->mutex ); |
---|
[19] | 477 | |
---|
[29] | 478 | self->run_flag = self->session->wait_thread_run_flag; |
---|
| 479 | } |
---|
[7] | 480 | } |
---|
[29] | 481 | EXCEPT_DEFAULT |
---|
| 482 | { |
---|
| 483 | const fsd_exc_t *e = fsd_exc_get(); |
---|
| 484 | /* Its better to exit and communicate error rather then let the application to hang */ |
---|
| 485 | fsd_log_fatal(( "Exception in wait thread: <%d:%s>. Exiting !!!", e->code(e), e->message(e) )); |
---|
| 486 | exit(1); |
---|
| 487 | } |
---|
| 488 | END_TRY |
---|
| 489 | } |
---|
[7] | 490 | |
---|
[29] | 491 | if(self->fhandle) |
---|
| 492 | fclose(self->fhandle); |
---|
| 493 | |
---|
| 494 | fsd_log_debug(("WT - Log file closed")); |
---|
[7] | 495 | } |
---|
| 496 | FINALLY |
---|
| 497 | { |
---|
[29] | 498 | fsd_log_debug(("WT - Terminated.")); |
---|
| 499 | fsd_mutex_unlock( &self->session->mutex ); /**/ |
---|
[7] | 500 | } |
---|
| 501 | END_TRY |
---|
| 502 | |
---|
| 503 | fsd_log_return(("")); |
---|
| 504 | } |
---|
| 505 | |
---|
| 506 | void |
---|
[8] | 507 | pbsdrmaa_select_file_wait_thread ( pbsdrmaa_log_reader_t * self ) |
---|
[7] | 508 | { |
---|
| 509 | pbsdrmaa_session_t *pbssession = (pbsdrmaa_session_t*) self->session; |
---|
| 510 | |
---|
[29] | 511 | if (self->date_changed) |
---|
| 512 | { |
---|
[7] | 513 | char * log_path = NULL; |
---|
| 514 | int num_tries = 0; |
---|
| 515 | struct tm tm; |
---|
| 516 | |
---|
| 517 | fsd_log_enter(("")); |
---|
| 518 | |
---|
| 519 | if(!self->first_open) |
---|
| 520 | time(&self->t); |
---|
| 521 | else |
---|
| 522 | self->t = pbssession->log_file_initial_time; |
---|
| 523 | |
---|
| 524 | localtime_r(&self->t,&tm); |
---|
| 525 | |
---|
| 526 | #define DRMAA_WAIT_THREAD_MAX_TRIES (12) |
---|
| 527 | /* generate new date, close file and open new */ |
---|
[29] | 528 | log_path = fsd_asprintf("%s/server_logs/%04d%02d%02d", pbssession->pbs_home, tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday); |
---|
[7] | 529 | |
---|
[29] | 530 | if(self->fhandle) |
---|
| 531 | fclose(self->fhandle); |
---|
[7] | 532 | |
---|
[29] | 533 | fsd_log_info(("Opening log file: %s",log_path)); |
---|
[7] | 534 | |
---|
| 535 | retry: |
---|
[29] | 536 | if ((self->fhandle = fopen(log_path,"")) == NULL && (num_tries > DRMAA_WAIT_THREAD_MAX_TRIES || self->first_open)) |
---|
| 537 | { |
---|
[7] | 538 | fsd_log_error(("Can't open log file. Verify pbs_home. Running standard wait_thread.")); |
---|
[29] | 539 | fsd_log_error(("Remember that without keep_completed set the standard wait_thread won't provide information about job exit status")); |
---|
[7] | 540 | /*pbssession->super.enable_wait_thread = false;*/ /* run not wait_thread */ |
---|
| 541 | pbssession->wait_thread_log = false; |
---|
| 542 | pbssession->super.wait_thread = pbssession->super_wait_thread; |
---|
| 543 | pbssession->super.wait_thread(self->session); |
---|
[29] | 544 | } |
---|
| 545 | else if ( self->fhandle == NULL ) |
---|
| 546 | { /* Torque seems not to create a new file immediately after the old one is closed */ |
---|
[7] | 547 | fsd_log_warning(("Can't open log file: %s. Retries count: %d", log_path, num_tries)); |
---|
| 548 | num_tries++; |
---|
[29] | 549 | sleep(2 * num_tries); |
---|
[7] | 550 | goto retry; |
---|
[29] | 551 | } |
---|
[7] | 552 | |
---|
| 553 | fsd_free(log_path); |
---|
| 554 | |
---|
| 555 | fsd_log_debug(("Log file opened")); |
---|
| 556 | |
---|
[29] | 557 | if(self->first_open) |
---|
| 558 | { |
---|
[7] | 559 | fsd_log_debug(("Log file lseek")); |
---|
[29] | 560 | |
---|
| 561 | if(fseek(self->fhandle, pbssession->log_file_initial_size, SEEK_SET) == (off_t) -1) |
---|
| 562 | { |
---|
| 563 | fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,"fseek error"); |
---|
| 564 | } |
---|
[7] | 565 | self->first_open = false; |
---|
[29] | 566 | } |
---|
[7] | 567 | |
---|
| 568 | self->date_changed = false; |
---|
| 569 | |
---|
| 570 | fsd_log_return(("")); |
---|
| 571 | } |
---|
| 572 | } |
---|
| 573 | |
---|
[29] | 574 | time_t |
---|
| 575 | pbsdrmaa_parse_log_timestamp(const char *timestamp, char *unixtime_str, size_t size) |
---|
[7] | 576 | { |
---|
[29] | 577 | struct tm temp_time_tm; |
---|
| 578 | memset(&temp_time_tm, 0, sizeof(temp_time_tm)); |
---|
| 579 | temp_time_tm.tm_isdst = -1; |
---|
[7] | 580 | |
---|
[29] | 581 | if (strptime(timestamp, "%m/%d/%Y %H:%M:%S", &temp_time_tm) == NULL) |
---|
| 582 | { |
---|
| 583 | fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,"WT - failed to parse log timestamp: %s", timestamp); |
---|
| 584 | } |
---|
[7] | 585 | else |
---|
[29] | 586 | { |
---|
| 587 | time_t temp_time = mktime(&temp_time_tm); |
---|
| 588 | snprintf(unixtime_str, size, "%lu", temp_time); |
---|
| 589 | return temp_time; |
---|
| 590 | } |
---|
[7] | 591 | } |
---|
| 592 | |
---|
[29] | 593 | char * |
---|
| 594 | pbsdrmaa_get_exec_host_from_accountig(pbsdrmaa_log_reader_t * log_reader, const char *job_id) |
---|
[7] | 595 | { |
---|
[29] | 596 | /* TODO: implement */ |
---|
| 597 | return NULL; |
---|
[7] | 598 | } |
---|
| 599 | |
---|
[21] | 600 | |
---|