[12] | 1 | /* $Id$ */ |
---|
[7] | 2 | /* |
---|
| 3 | * FedStage DRMAA for PBS Pro |
---|
| 4 | * Copyright (C) 2006-2007 FedStage Systems |
---|
| 5 | * |
---|
| 6 | * This program is free software: you can redistribute it and/or modify |
---|
| 7 | * it under the terms of the GNU General Public License as published by |
---|
| 8 | * the Free Software Foundation, either version 3 of the License, or |
---|
| 9 | * (at your option) any later version. |
---|
| 10 | * |
---|
| 11 | * This program is distributed in the hope that it will be useful, |
---|
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
| 14 | * GNU General Public License for more details. |
---|
| 15 | * |
---|
| 16 | * You should have received a copy of the GNU General Public License |
---|
| 17 | * along with this program. If not, see <http://www.gnu.org/licenses/>. |
---|
| 18 | */ |
---|
| 19 | |
---|
| 20 | #ifdef HAVE_CONFIG_H |
---|
| 21 | # include <config.h> |
---|
| 22 | #endif |
---|
| 23 | |
---|
| 24 | #include <stdlib.h> |
---|
[35] | 25 | #include <ctype.h> |
---|
[7] | 26 | #include <string.h> |
---|
| 27 | #include <unistd.h> |
---|
| 28 | #include <sys/stat.h> |
---|
| 29 | #include <sys/types.h> |
---|
| 30 | #include <dirent.h> |
---|
| 31 | #include <fcntl.h> |
---|
| 32 | |
---|
| 33 | #include <pbs_ifl.h> |
---|
| 34 | #include <pbs_error.h> |
---|
| 35 | |
---|
| 36 | #include <drmaa_utils/datetime.h> |
---|
| 37 | #include <drmaa_utils/drmaa.h> |
---|
| 38 | #include <drmaa_utils/iter.h> |
---|
| 39 | #include <drmaa_utils/conf.h> |
---|
| 40 | #include <drmaa_utils/session.h> |
---|
| 41 | #include <drmaa_utils/datetime.h> |
---|
| 42 | |
---|
| 43 | #include <pbs_drmaa/job.h> |
---|
| 44 | #include <pbs_drmaa/log_reader.h> |
---|
| 45 | #include <pbs_drmaa/session.h> |
---|
| 46 | #include <pbs_drmaa/submit.h> |
---|
| 47 | #include <pbs_drmaa/util.h> |
---|
[29] | 48 | #include <pbs_drmaa/pbs_attrib.h> |
---|
[7] | 49 | |
---|
| 50 | #include <errno.h> |
---|
| 51 | |
---|
[29] | 52 | enum pbsdrmaa_field_id |
---|
| 53 | { |
---|
| 54 | PBSDRMAA_FLD_ID_DATE = 0, |
---|
| 55 | PBSDRMAA_FLD_ID_EVENT = 1, |
---|
| 56 | PBSDRMAA_FLD_ID_SRC = 2, |
---|
| 57 | PBSDRMAA_FLD_ID_OBJ_TYPE = 3, |
---|
| 58 | PBSDRMAA_FLD_ID_OBJ_ID = 4, |
---|
| 59 | PBSDRMAA_FLD_ID_MSG = 5 |
---|
| 60 | }; |
---|
[7] | 61 | |
---|
| 62 | |
---|
[29] | 63 | #define PBSDRMAA_FLD_MSG_0008 "0008" |
---|
| 64 | #define PBSDRMAA_FLD_MSG_0010 "0010" |
---|
[7] | 65 | |
---|
[29] | 66 | enum pbsdrmaa_event_type |
---|
| 67 | { |
---|
| 68 | pbsdrmaa_event_0008 = 8, |
---|
| 69 | pbsdrmaa_event_0010 = 10 |
---|
| 70 | }; |
---|
[7] | 71 | |
---|
[29] | 72 | static void pbsdrmaa_read_log(); |
---|
[7] | 73 | |
---|
[48] | 74 | static void pbsdrmaa_select_file( pbsdrmaa_log_reader_t * self); |
---|
[21] | 75 | |
---|
[48] | 76 | static void pbsdrmaa_close_log( pbsdrmaa_log_reader_t * self); |
---|
[21] | 77 | |
---|
[48] | 78 | static void pbsdrmaa_reopen_log( pbsdrmaa_log_reader_t * self); |
---|
| 79 | |
---|
[29] | 80 | static time_t pbsdrmaa_parse_log_timestamp(const char *timestamp, char *unixtime_str, size_t size); |
---|
[21] | 81 | |
---|
[29] | 82 | static char *pbsdrmaa_get_exec_host_from_accountig(pbsdrmaa_log_reader_t * log_reader, const char *job_id); |
---|
[7] | 83 | |
---|
[25] | 84 | /* |
---|
| 85 | * Snippets from log files |
---|
| 86 | * |
---|
| 87 | * PBS Pro |
---|
| 88 | * |
---|
| 89 | 10/11/2011 14:43:29;0008;Server@nova;Job;2127218.nova;Job Queued at request of mamonski@endor.wcss.wroc.pl, owner = mamonski@endor.wcss.wroc.pl, job name = STDIN, queue = normal |
---|
| 90 | 10/11/2011 14:43:31;0008;Server@nova;Job;2127218.nova;Job Modified at request of Scheduler@nova.wcss.wroc.pl |
---|
| 91 | 10/11/2011 14:43:31;0008;Server@nova;Job;2127218.nova;Job Run at request of Scheduler@nova.wcss.wroc.pl on exec_vnode (wn698:ncpus=3:mem=2048000kb)+(wn700:ncpus=3:mem=2048000kb) |
---|
| 92 | 10/11/2011 14:43:31;0008;Server@nova;Job;2127218.nova;Job Modified at request of Scheduler@nova.wcss.wroc.pl |
---|
| 93 | 10/11/2011 14:43:32;0010;Server@nova;Job;2127218.nova;Exit_status=0 resources_used.cpupercent=0 resources_used.cput=00:00:00 resources_used.mem=1768kb resources_used.ncpus=6 resources_used.vmem=19228kb resources_used.walltime=00:00:01 |
---|
| 94 | |
---|
| 95 | * |
---|
| 96 | * Torque |
---|
| 97 | * |
---|
| 98 | 10/11/2011 14:47:59;0008;PBS_Server;Job;15545337.batch.grid.cyf-kr.edu.pl;Job Queued at request of plgmamonski@ui.cyf-kr.edu.pl, owner = plgmamonski@ui.cyf-kr.edu.pl, job name = STDIN, queue = l_short |
---|
| 99 | 10/11/2011 14:48:23;0008;PBS_Server;Job;15545337.batch.grid.cyf-kr.edu.pl;Job Run at request of root@batch.grid.cyf-kr.edu.pl |
---|
| 100 | 10/11/2011 14:48:24;0010;PBS_Server;Job;15545337.batch.grid.cyf-kr.edu.pl;Exit_status=0 resources_used.cput=00:00:00 resources_used.mem=720kb resources_used.vmem=13308kb resources_used.walltime=00:00:00 |
---|
| 101 | |
---|
[29] | 102 | deleting job: |
---|
| 103 | I . PBS Pro |
---|
| 104 | a) in Q state |
---|
| 105 | 10/16/2011 09:49:25;0008;Server@grass1;Job;2178.grass1.man.poznan.pl;Job Queued at request of mmamonski@grass1.man.poznan.pl, owner = mmamonski@grass1.man.poznan.pl, job name = STDIN, queue = workq |
---|
| 106 | 10/16/2011 09:49:25;0008;Server@grass1;Job;2178.grass1.man.poznan.pl;Job Modified at request of Scheduler@grass1.man.poznan.pl |
---|
| 107 | 10/16/2011 09:49:37;0008;Server@grass1;Job;2178.grass1.man.poznan.pl;Job to be deleted at request of mmamonski@grass1.man.poznan.pl |
---|
| 108 | 10/16/2011 09:49:37;0100;Server@grass1;Job;2178.grass1.man.poznan.pl;dequeuing from workq, state 5 |
---|
| 109 | |
---|
| 110 | |
---|
| 111 | b) in R state |
---|
| 112 | 10/16/2011 09:45:12;0080;Server@grass1;Job;2177.grass1.man.poznan.pl;delete job request received |
---|
| 113 | 10/16/2011 09:45:12;0008;Server@grass1;Job;2177.grass1.man.poznan.pl;Job sent signal TermJob on delete |
---|
| 114 | 10/16/2011 09:45:12;0008;Server@grass1;Job;2177.grass1.man.poznan.pl;Job to be deleted at request of mmamonski@grass1.man.poznan.pl |
---|
| 115 | 10/16/2011 09:45:12;0010;Server@grass1;Job;2177.grass1.man.poznan.pl;Exit_status=271 resources_used.cpupercent=0 resources_used.cput=00:00:00 resources_used.mem=2772kb resources_used.ncpus=1 resources_used.vmem=199288kb resources_used.walltime=00:00:26 |
---|
| 116 | 10/16/2011 09:45:12;0100;Server@grass1;Job;2177.grass1.man.poznan.pl;dequeuing from workq, state 5 |
---|
| 117 | |
---|
| 118 | II. Torque |
---|
| 119 | a) in Q state |
---|
| 120 | 10/15/2011 21:19:25;0008;PBS_Server;Job;113045.grass1.man.poznan.pl;Job deleted at request of mmamonski@grass1.man.poznan.pl |
---|
| 121 | 10/15/2011 21:19:25;0100;PBS_Server;Job;113045.grass1.man.poznan.pl;dequeuing from batch, state EXITING |
---|
| 122 | |
---|
| 123 | b) in R state |
---|
| 124 | 10/15/2011 21:19:47;0008;PBS_Server;Job;113046.grass1.man.poznan.pl;Job deleted at request of mmamonski@grass1.man.poznan.pl |
---|
| 125 | 10/15/2011 21:19:47;0008;PBS_Server;Job;113046.grass1.man.poznan.pl;Job sent signal SIGTERM on delete |
---|
| 126 | 10/15/2011 21:19:47;0010;PBS_Server;Job;113046.grass1.man.poznan.pl;Exit_status=271 resources_used.cput=00:00:00 resources_used.mem=0kb resources_used.vmem=0kb resources_used.walltime=00:00:10 |
---|
| 127 | |
---|
| 128 | Log closed: |
---|
| 129 | 10/16/2011 00:00:17;0002;PBS_Server;Svr;Log;Log closed |
---|
| 130 | |
---|
[25] | 131 | */ |
---|
[7] | 132 | pbsdrmaa_log_reader_t * |
---|
[29] | 133 | pbsdrmaa_log_reader_new( fsd_drmaa_session_t *session ) |
---|
[7] | 134 | { |
---|
| 135 | pbsdrmaa_log_reader_t *volatile self = NULL; |
---|
| 136 | |
---|
| 137 | fsd_log_enter(("")); |
---|
[29] | 138 | |
---|
[7] | 139 | TRY |
---|
| 140 | { |
---|
| 141 | fsd_malloc(self, pbsdrmaa_log_reader_t ); |
---|
| 142 | |
---|
| 143 | self->session = session; |
---|
[29] | 144 | |
---|
[48] | 145 | self->select_file = pbsdrmaa_select_file; |
---|
[7] | 146 | self->read_log = pbsdrmaa_read_log; |
---|
[48] | 147 | self->close = pbsdrmaa_close_log; |
---|
| 148 | self->reopen = pbsdrmaa_reopen_log; |
---|
[7] | 149 | |
---|
| 150 | self->run_flag = true; |
---|
[29] | 151 | self->fhandle = NULL; |
---|
[7] | 152 | self->date_changed = true; |
---|
| 153 | self->first_open = true; |
---|
[48] | 154 | self->log_path = NULL; |
---|
| 155 | self->current_offset = 0; |
---|
[7] | 156 | |
---|
| 157 | } |
---|
| 158 | EXCEPT_DEFAULT |
---|
| 159 | { |
---|
| 160 | if( self != NULL) |
---|
| 161 | fsd_free(self); |
---|
| 162 | |
---|
| 163 | fsd_exc_reraise(); |
---|
| 164 | } |
---|
| 165 | END_TRY |
---|
[29] | 166 | |
---|
[7] | 167 | fsd_log_return(("")); |
---|
[29] | 168 | |
---|
[7] | 169 | return self; |
---|
| 170 | } |
---|
| 171 | |
---|
[21] | 172 | |
---|
[7] | 173 | void |
---|
| 174 | pbsdrmaa_log_reader_destroy ( pbsdrmaa_log_reader_t * self ) |
---|
| 175 | { |
---|
| 176 | fsd_log_enter(("")); |
---|
| 177 | TRY |
---|
| 178 | { |
---|
| 179 | if(self != NULL) |
---|
| 180 | { |
---|
| 181 | fsd_free(self); |
---|
[29] | 182 | } |
---|
[7] | 183 | } |
---|
| 184 | EXCEPT_DEFAULT |
---|
| 185 | { |
---|
| 186 | fsd_exc_reraise(); |
---|
| 187 | } |
---|
| 188 | END_TRY |
---|
| 189 | |
---|
| 190 | fsd_log_return(("")); |
---|
| 191 | } |
---|
| 192 | |
---|
| 193 | |
---|
[29] | 194 | void |
---|
[7] | 195 | pbsdrmaa_read_log( pbsdrmaa_log_reader_t * self ) |
---|
| 196 | { |
---|
| 197 | fsd_log_enter(("")); |
---|
| 198 | |
---|
[29] | 199 | fsd_mutex_lock( &self->session->mutex ); |
---|
[8] | 200 | |
---|
[7] | 201 | TRY |
---|
[29] | 202 | { |
---|
[7] | 203 | while( self->run_flag ) |
---|
[29] | 204 | { |
---|
| 205 | TRY |
---|
[7] | 206 | { |
---|
[29] | 207 | char *line = NULL; |
---|
[8] | 208 | |
---|
[29] | 209 | self->select_file(self); |
---|
[7] | 210 | |
---|
[29] | 211 | while ((line = fsd_readline(self->fhandle)) != NULL) |
---|
| 212 | { |
---|
| 213 | int field_id = PBSDRMAA_FLD_ID_DATE; |
---|
| 214 | char *tok_ctx = NULL; |
---|
| 215 | char *field_token = NULL; |
---|
| 216 | char *event_timestamp = NULL; |
---|
| 217 | int event_type = -1; |
---|
| 218 | fsd_job_t *job = NULL; |
---|
[7] | 219 | |
---|
[29] | 220 | /* at first detect if this not the end of log file */ |
---|
| 221 | if (strstr(line, "Log;Log closed")) /*TODO try to be more effective and safe */ |
---|
| 222 | { |
---|
| 223 | fsd_log_debug(("WT - Date changed. Closing log file")); |
---|
| 224 | self->date_changed = true; |
---|
[31] | 225 | goto cleanup; |
---|
[29] | 226 | } |
---|
[7] | 227 | |
---|
[29] | 228 | for (field_token = strtok_r(line, ";", &tok_ctx); field_token; field_token = strtok_r(NULL, ";", &tok_ctx), field_id++) |
---|
| 229 | { |
---|
| 230 | if ( field_id == PBSDRMAA_FLD_ID_DATE) |
---|
| 231 | { |
---|
| 232 | event_timestamp = field_token; |
---|
| 233 | } |
---|
| 234 | else if ( field_id == PBSDRMAA_FLD_ID_EVENT) |
---|
| 235 | { |
---|
| 236 | if (strncmp(field_token, PBSDRMAA_FLD_MSG_0008, 4) == 0) |
---|
| 237 | event_type = pbsdrmaa_event_0008; |
---|
| 238 | else if (strncmp(field_token, PBSDRMAA_FLD_MSG_0010, 4) == 0) |
---|
| 239 | event_type = pbsdrmaa_event_0010; |
---|
| 240 | else |
---|
[30] | 241 | { |
---|
[31] | 242 | goto cleanup; /*we are interested only in the above log messages */ |
---|
[30] | 243 | } |
---|
[29] | 244 | } |
---|
| 245 | else if ( field_id == PBSDRMAA_FLD_ID_SRC) |
---|
| 246 | { |
---|
| 247 | /* not used ignore */ |
---|
| 248 | } |
---|
| 249 | else if (field_id == PBSDRMAA_FLD_ID_OBJ_TYPE) |
---|
| 250 | { |
---|
| 251 | if (strncmp(field_token, "Job", 3) != 0) |
---|
[30] | 252 | { |
---|
[31] | 253 | goto cleanup; /* we are interested only in job events */ |
---|
[30] | 254 | } |
---|
[29] | 255 | } |
---|
| 256 | else if (field_id == PBSDRMAA_FLD_ID_OBJ_ID) |
---|
| 257 | { |
---|
| 258 | const char *event_jobid = field_token; |
---|
[31] | 259 | |
---|
| 260 | if (!isdigit(event_jobid[0])) |
---|
| 261 | { |
---|
| 262 | fsd_log_debug(("WT - Invalid job: %s", event_jobid)); |
---|
| 263 | goto cleanup; |
---|
| 264 | } |
---|
[26] | 265 | |
---|
[31] | 266 | job = self->session->get_job( self->session, event_jobid ); |
---|
| 267 | |
---|
| 268 | if( job ) |
---|
[25] | 269 | { |
---|
[31] | 270 | fsd_log_debug(("WT - Found job event: %s", event_jobid)); |
---|
[29] | 271 | } |
---|
[31] | 272 | else |
---|
| 273 | { |
---|
| 274 | fsd_log_debug(("WT - Unknown job: %s", event_jobid)); /* Not a DRMAA job */ |
---|
| 275 | goto cleanup; |
---|
| 276 | } |
---|
| 277 | } |
---|
[29] | 278 | else if (field_id == PBSDRMAA_FLD_ID_MSG) |
---|
| 279 | { |
---|
| 280 | char *msg = field_token; |
---|
| 281 | struct batch_status status; |
---|
| 282 | struct attrl *attribs = NULL; |
---|
| 283 | bool in_running_state = false; |
---|
[7] | 284 | |
---|
[29] | 285 | if (event_type == pbsdrmaa_event_0008 && strncmp(msg, "Job Queued", 10) == 0) |
---|
| 286 | { |
---|
| 287 | /* Queued |
---|
| 288 | * PBS Pro: 10/11/2011 14:43:29;0008;Server@nova;Job;2127218.nova;Job Queued at request of mamonski@endor.wcss.wroc.pl, owner = mamonski@endor.wcss.wroc.pl, job name = STDIN, queue = normal |
---|
| 289 | * Torque: 10/11/2011 14:47:59;0008;PBS_Server;Job;15545337.batch.grid.cyf-kr.edu.pl;Job Queued at request of plgmamonski@ui.cyf-kr.edu.pl, owner = plgmamonski@ui.cyf-kr.edu.pl, job name = STDIN, queue = l_short |
---|
| 290 | */ |
---|
| 291 | char *p_queue = NULL; |
---|
[25] | 292 | |
---|
[31] | 293 | fsd_log_info(("WT - Detected queuing of job %s", job->job_id)); |
---|
[30] | 294 | |
---|
[29] | 295 | if ((p_queue = strstr(msg,"queue =")) == NULL) |
---|
| 296 | fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,"No queue attribute found in log line = %s", line); |
---|
[25] | 297 | |
---|
[30] | 298 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_JOB_STATE, "Q"); |
---|
| 299 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_QUEUE, p_queue + 7); |
---|
| 300 | } |
---|
| 301 | else if (event_type == pbsdrmaa_event_0008 && strncmp(msg, "Job Run", 7) == 0) |
---|
| 302 | { |
---|
| 303 | /* |
---|
| 304 | * Running |
---|
| 305 | * Torque: 10/11/2011 14:48:23;0008;PBS_Server;Job;15545337.batch.grid.cyf-kr.edu.pl;Job Run at request of root@batch.grid.cyf-kr.edu.pl |
---|
| 306 | * PBS Pro: 10/11/2011 14:43:31;0008;Server@nova;Job;2127218.nova;Job Run at request of Scheduler@nova.wcss.wroc.pl on exec_vnode (wn698:ncpus=3:mem=2048000kb)+(wn700:ncpus=3:mem=2048000kb) |
---|
| 307 | */ |
---|
| 308 | char timestamp_unix[64]; |
---|
| 309 | |
---|
| 310 | fsd_log_info(("WT - Detected start of job %s", job->job_id)); |
---|
| 311 | |
---|
[29] | 312 | (void)pbsdrmaa_parse_log_timestamp(event_timestamp, timestamp_unix, sizeof(timestamp_unix)); |
---|
| 313 | |
---|
| 314 | in_running_state = true; |
---|
| 315 | |
---|
| 316 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_JOB_STATE, "R"); |
---|
| 317 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_START_TIME, timestamp_unix); |
---|
[26] | 318 | #ifdef PBS_PROFESSIONAL |
---|
| 319 | { |
---|
| 320 | char *p_vnode = NULL; |
---|
[32] | 321 | if ((p_vnode = strstr(msg, "exec_vnode"))) |
---|
[26] | 322 | { |
---|
[32] | 323 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_EXECUTION_VNODE, p_vnode + 11); |
---|
[26] | 324 | } |
---|
| 325 | } |
---|
| 326 | #endif |
---|
| 327 | } |
---|
[36] | 328 | #ifndef PBS_PROFESSIONAL |
---|
[33] | 329 | else if (event_type == pbsdrmaa_event_0008 && strncmp(msg, "Job deleted", 11) == 0) |
---|
[29] | 330 | #else |
---|
[33] | 331 | else if (event_type == pbsdrmaa_event_0008 && strncmp(msg, "Job to be deleted", 17) == 0) |
---|
[29] | 332 | #endif |
---|
| 333 | { |
---|
| 334 | /* Deleted |
---|
| 335 | * PBS Pro: 10/16/2011 09:45:12;0008;Server@grass1;Job;2177.grass1.man.poznan.pl;Job to be deleted at request of mmamonski@grass1.man.poznan.pl |
---|
| 336 | * Torque: 10/15/2011 21:19:25;0008;PBS_Server;Job;113045.grass1.man.poznan.pl;Job deleted at request of mmamonski@grass1.man.poznan.pl |
---|
| 337 | */ |
---|
| 338 | char timestamp_unix[64]; |
---|
[7] | 339 | |
---|
[30] | 340 | fsd_log_info(("WT - Detected deletion of job %s", job->job_id)); |
---|
| 341 | |
---|
[29] | 342 | (void)pbsdrmaa_parse_log_timestamp(event_timestamp, timestamp_unix, sizeof(timestamp_unix)); |
---|
[7] | 343 | |
---|
[29] | 344 | if (job->state < DRMAA_PS_RUNNING) |
---|
| 345 | { |
---|
[33] | 346 | fsd_log_info(("WT - Job %s killed before entering running state (%d).", job->job_id, job->state)); |
---|
[29] | 347 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_JOB_STATE, "C"); |
---|
| 348 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_MTIME, timestamp_unix); |
---|
| 349 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_EXIT_STATUS, "-2"); |
---|
| 350 | } |
---|
| 351 | else |
---|
| 352 | { |
---|
[36] | 353 | fsd_log_info(("WT - Job %s killed after entering running state (%d). Waiting for Completed event...", job->job_id, job->state)); |
---|
[31] | 354 | goto cleanup; /* job was started, ignore, wait for Exit_status message */ |
---|
[29] | 355 | } |
---|
| 356 | } |
---|
[31] | 357 | else if (event_type == pbsdrmaa_event_0010 && (strncmp(msg, "Exit_status=", 12) == 0)) |
---|
[29] | 358 | { |
---|
| 359 | /* Completed: |
---|
| 360 | * PBS Pro: 10/11/2011 14:43:32;0010;Server@nova;Job;2127218.nova;Exit_status=0 resources_used.cpupercent=0 resources_used.cput=00:00:00 resources_used.mem=1768kb resources_used.ncpus=6 resources_used.vmem=19228kb resources_used.walltime=00:00:01 |
---|
| 361 | * Torque: 10/11/2011 14:48:24;0010;PBS_Server;Job;15545337.batch.grid.cyf-kr.edu.pl;Exit_status=0 resources_used.cput=00:00:00 resources_used.mem=720kb resources_used.vmem=13308kb resources_used.walltime=00:00:00 |
---|
| 362 | */ |
---|
| 363 | char timestamp_unix[64]; |
---|
| 364 | time_t timestamp_time_t = pbsdrmaa_parse_log_timestamp(event_timestamp, timestamp_unix, sizeof(timestamp_unix)); |
---|
| 365 | char *tok_ctx2 = NULL; |
---|
| 366 | char *token = NULL; |
---|
[7] | 367 | |
---|
[29] | 368 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_JOB_STATE, "C"); |
---|
| 369 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_MTIME, timestamp_unix); |
---|
[7] | 370 | |
---|
[29] | 371 | /* tokenize !!! */ |
---|
| 372 | for (token = strtok_r(msg, " ", &tok_ctx2); token; token = strtok_r(NULL, " ", &tok_ctx2)) |
---|
| 373 | { |
---|
| 374 | if (strncmp(token, "Exit_status=", 12) == 0) |
---|
| 375 | { |
---|
[31] | 376 | token[11] = '\0'; |
---|
[33] | 377 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_EXIT_STATUS, token + 12); |
---|
[29] | 378 | fsd_log_info(("WT - Completion of job %s (Exit_status=%s) detected after %d seconds", job->job_id, token+12, (int)(time(NULL) - timestamp_time_t) )); |
---|
| 379 | } |
---|
| 380 | else if (strncmp(token, "resources_used.cput=", 20) == 0) |
---|
| 381 | { |
---|
[31] | 382 | token[19] = '\0'; |
---|
[29] | 383 | attribs = pbsdrmaa_add_attr(attribs, token, token + 20); |
---|
| 384 | } |
---|
| 385 | else if (strncmp(token, "resources_used.mem=", 19) == 0) |
---|
| 386 | { |
---|
[31] | 387 | token[18] = '\0'; |
---|
[29] | 388 | attribs = pbsdrmaa_add_attr(attribs, token, token + 19); |
---|
| 389 | } |
---|
| 390 | else if (strncmp(token, "resources_used.vmem=", 20) == 0) |
---|
| 391 | { |
---|
[31] | 392 | token[19] = '\0'; |
---|
[29] | 393 | attribs = pbsdrmaa_add_attr(attribs, token, token + 20); |
---|
| 394 | } |
---|
| 395 | else if (strncmp(token, "resources_used.walltime=", 24) == 0) |
---|
| 396 | { |
---|
[31] | 397 | token[23] = '\0'; |
---|
[29] | 398 | attribs = pbsdrmaa_add_attr(attribs, token, token + 24); |
---|
| 399 | } |
---|
| 400 | } |
---|
[7] | 401 | |
---|
[29] | 402 | if (!job->execution_hosts) |
---|
| 403 | { |
---|
| 404 | char *exec_host = NULL; |
---|
| 405 | fsd_log_info(("WT - No execution host information for job %s. Reading accounting logs...", job->job_id)); |
---|
| 406 | exec_host = pbsdrmaa_get_exec_host_from_accountig(self, job->job_id); |
---|
[34] | 407 | if (exec_host) |
---|
| 408 | { |
---|
| 409 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_EXECUTION_HOST, exec_host); |
---|
| 410 | fsd_free(exec_host); |
---|
| 411 | } |
---|
[29] | 412 | } |
---|
| 413 | } |
---|
| 414 | else |
---|
| 415 | { |
---|
[36] | 416 | fsd_log_debug(("Ignoring msg(type=%d) = %s", event_type, msg)); |
---|
[31] | 417 | goto cleanup; /* ignore other job events*/ |
---|
[29] | 418 | } |
---|
[35] | 419 | |
---|
| 420 | fsd_log_debug(("WT - updating job: %s", job->job_id )); |
---|
| 421 | status.name = job->job_id; |
---|
| 422 | status.attribs = attribs; |
---|
[25] | 423 | |
---|
[35] | 424 | ((pbsdrmaa_job_t *)job)->update( job, &status ); |
---|
| 425 | |
---|
[29] | 426 | if ( in_running_state ) |
---|
| 427 | { |
---|
| 428 | fsd_log_debug(("WT - forcing update of job: %s", job->job_id )); |
---|
[18] | 429 | TRY |
---|
| 430 | { |
---|
[29] | 431 | job->update_status( job ); |
---|
[18] | 432 | } |
---|
| 433 | EXCEPT_DEFAULT |
---|
| 434 | { |
---|
| 435 | /*TODO: distinguish between invalid job and internal errors */ |
---|
[29] | 436 | fsd_log_debug(("Job finished just after entering running state: %s", job->job_id)); |
---|
[18] | 437 | } |
---|
| 438 | END_TRY |
---|
[29] | 439 | } |
---|
| 440 | |
---|
| 441 | |
---|
[35] | 442 | pbsdrmaa_free_attrl(attribs); /* TODO free on exception */ |
---|
[29] | 443 | |
---|
| 444 | fsd_cond_broadcast( &job->status_cond); |
---|
[7] | 445 | fsd_cond_broadcast( &self->session->wait_condition ); |
---|
| 446 | |
---|
[29] | 447 | } |
---|
| 448 | else |
---|
| 449 | { |
---|
| 450 | fsd_assert(0); /*not reached */ |
---|
| 451 | } |
---|
| 452 | } |
---|
[31] | 453 | cleanup: |
---|
| 454 | fsd_free(line); /* TODO what about exceptions */ |
---|
| 455 | if ( job ) |
---|
| 456 | job->release( job ); |
---|
[29] | 457 | |
---|
[31] | 458 | |
---|
| 459 | |
---|
[29] | 460 | } /* end of while getline loop */ |
---|
| 461 | |
---|
[30] | 462 | |
---|
[29] | 463 | |
---|
[33] | 464 | fsd_mutex_unlock( &self->session->mutex ); |
---|
[7] | 465 | |
---|
[48] | 466 | /* close */ |
---|
| 467 | self->close(self); |
---|
| 468 | |
---|
[45] | 469 | sleep(((pbsdrmaa_session_t *)self->session)->wait_thread_sleep_time); |
---|
| 470 | |
---|
[48] | 471 | /* and reopen log file */ |
---|
| 472 | self->reopen(self); |
---|
| 473 | |
---|
[33] | 474 | fsd_mutex_lock( &self->session->mutex ); |
---|
[19] | 475 | |
---|
[33] | 476 | self->run_flag = self->session->wait_thread_run_flag; |
---|
[7] | 477 | } |
---|
[29] | 478 | EXCEPT_DEFAULT |
---|
| 479 | { |
---|
| 480 | const fsd_exc_t *e = fsd_exc_get(); |
---|
| 481 | /* Its better to exit and communicate error rather then let the application to hang */ |
---|
| 482 | fsd_log_fatal(( "Exception in wait thread: <%d:%s>. Exiting !!!", e->code(e), e->message(e) )); |
---|
| 483 | exit(1); |
---|
| 484 | } |
---|
| 485 | END_TRY |
---|
| 486 | } |
---|
[7] | 487 | |
---|
[29] | 488 | if(self->fhandle) |
---|
| 489 | fclose(self->fhandle); |
---|
| 490 | |
---|
| 491 | fsd_log_debug(("WT - Log file closed")); |
---|
[7] | 492 | } |
---|
| 493 | FINALLY |
---|
| 494 | { |
---|
[29] | 495 | fsd_log_debug(("WT - Terminated.")); |
---|
| 496 | fsd_mutex_unlock( &self->session->mutex ); /**/ |
---|
[7] | 497 | } |
---|
| 498 | END_TRY |
---|
| 499 | |
---|
| 500 | fsd_log_return(("")); |
---|
| 501 | } |
---|
| 502 | |
---|
| 503 | void |
---|
[48] | 504 | pbsdrmaa_select_file( pbsdrmaa_log_reader_t * self ) |
---|
[7] | 505 | { |
---|
| 506 | pbsdrmaa_session_t *pbssession = (pbsdrmaa_session_t*) self->session; |
---|
| 507 | |
---|
[29] | 508 | if (self->date_changed) |
---|
| 509 | { |
---|
[7] | 510 | int num_tries = 0; |
---|
| 511 | struct tm tm; |
---|
| 512 | |
---|
| 513 | fsd_log_enter(("")); |
---|
| 514 | |
---|
| 515 | if(!self->first_open) |
---|
| 516 | time(&self->t); |
---|
| 517 | else |
---|
| 518 | self->t = pbssession->log_file_initial_time; |
---|
| 519 | |
---|
| 520 | localtime_r(&self->t,&tm); |
---|
| 521 | |
---|
| 522 | #define DRMAA_WAIT_THREAD_MAX_TRIES (12) |
---|
| 523 | /* generate new date, close file and open new */ |
---|
[48] | 524 | if (self->log_path) |
---|
| 525 | fsd_free(self->log_path); |
---|
[7] | 526 | |
---|
[48] | 527 | self->log_path = fsd_asprintf("%s/server_logs/%04d%02d%02d", pbssession->pbs_home, tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday); |
---|
| 528 | |
---|
[29] | 529 | if(self->fhandle) |
---|
| 530 | fclose(self->fhandle); |
---|
[7] | 531 | |
---|
[48] | 532 | fsd_log_info(("Opening log file: %s",self->log_path)); |
---|
[7] | 533 | |
---|
| 534 | retry: |
---|
[48] | 535 | if ((self->fhandle = fopen(self->log_path,"r")) == NULL && (num_tries > DRMAA_WAIT_THREAD_MAX_TRIES || self->first_open)) |
---|
[29] | 536 | { |
---|
[48] | 537 | fsd_log_error(("Can't open log file: %s. Verify pbs_home. Running standard wait_thread.", self->log_path)); |
---|
[29] | 538 | fsd_log_error(("Remember that without keep_completed set the standard wait_thread won't provide information about job exit status")); |
---|
[7] | 539 | /*pbssession->super.enable_wait_thread = false;*/ /* run not wait_thread */ |
---|
| 540 | pbssession->wait_thread_log = false; |
---|
| 541 | pbssession->super.wait_thread = pbssession->super_wait_thread; |
---|
| 542 | pbssession->super.wait_thread(self->session); |
---|
[29] | 543 | } |
---|
| 544 | else if ( self->fhandle == NULL ) |
---|
| 545 | { /* Torque seems not to create a new file immediately after the old one is closed */ |
---|
[48] | 546 | fsd_log_warning(("Can't open log file: %s. Retries count: %d", self->log_path, num_tries)); |
---|
[7] | 547 | num_tries++; |
---|
[29] | 548 | sleep(2 * num_tries); |
---|
[7] | 549 | goto retry; |
---|
[29] | 550 | } |
---|
[7] | 551 | |
---|
| 552 | fsd_log_debug(("Log file opened")); |
---|
| 553 | |
---|
[29] | 554 | if(self->first_open) |
---|
| 555 | { |
---|
[7] | 556 | fsd_log_debug(("Log file lseek")); |
---|
[29] | 557 | |
---|
| 558 | if(fseek(self->fhandle, pbssession->log_file_initial_size, SEEK_SET) == (off_t) -1) |
---|
| 559 | { |
---|
| 560 | fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,"fseek error"); |
---|
| 561 | } |
---|
[7] | 562 | self->first_open = false; |
---|
[29] | 563 | } |
---|
[7] | 564 | |
---|
| 565 | self->date_changed = false; |
---|
| 566 | |
---|
| 567 | fsd_log_return(("")); |
---|
| 568 | } |
---|
| 569 | } |
---|
| 570 | |
---|
[29] | 571 | time_t |
---|
| 572 | pbsdrmaa_parse_log_timestamp(const char *timestamp, char *unixtime_str, size_t size) |
---|
[7] | 573 | { |
---|
[29] | 574 | struct tm temp_time_tm; |
---|
| 575 | memset(&temp_time_tm, 0, sizeof(temp_time_tm)); |
---|
| 576 | temp_time_tm.tm_isdst = -1; |
---|
[7] | 577 | |
---|
[29] | 578 | if (strptime(timestamp, "%m/%d/%Y %H:%M:%S", &temp_time_tm) == NULL) |
---|
| 579 | { |
---|
| 580 | fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,"WT - failed to parse log timestamp: %s", timestamp); |
---|
| 581 | } |
---|
[7] | 582 | else |
---|
[29] | 583 | { |
---|
| 584 | time_t temp_time = mktime(&temp_time_tm); |
---|
| 585 | snprintf(unixtime_str, size, "%lu", temp_time); |
---|
| 586 | return temp_time; |
---|
| 587 | } |
---|
[7] | 588 | } |
---|
| 589 | |
---|
[29] | 590 | char * |
---|
| 591 | pbsdrmaa_get_exec_host_from_accountig(pbsdrmaa_log_reader_t * log_reader, const char *job_id) |
---|
[7] | 592 | { |
---|
[34] | 593 | pbsdrmaa_session_t *pbssession = (pbsdrmaa_session_t*) log_reader->session; |
---|
| 594 | struct tm tm; |
---|
| 595 | time_t tm_t; |
---|
| 596 | char *line = NULL; |
---|
[35] | 597 | char *exec_host = NULL; |
---|
| 598 | char *log_path = NULL; |
---|
[34] | 599 | FILE *fhandle = NULL; |
---|
| 600 | |
---|
[35] | 601 | fsd_log_enter(("(job_id=%s)", job_id)); |
---|
[34] | 602 | |
---|
| 603 | tm_t = time(NULL); |
---|
[35] | 604 | localtime_r(&tm_t, &tm); |
---|
[34] | 605 | |
---|
| 606 | log_path = fsd_asprintf("%s/server_priv/accounting/%04d%02d%02d", pbssession->pbs_home, tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday); |
---|
| 607 | |
---|
| 608 | fsd_log_info(("Opening accounting log file: %s", log_path)); |
---|
| 609 | |
---|
| 610 | if ((fhandle = fopen(log_path, "r")) == NULL) |
---|
| 611 | { |
---|
[35] | 612 | fsd_log_error(("Failed to open accounting log file: %s", log_path)); |
---|
[34] | 613 | fsd_free(log_path); |
---|
| 614 | return NULL; |
---|
| 615 | } |
---|
| 616 | |
---|
| 617 | fsd_free(log_path); |
---|
| 618 | /* |
---|
| 619 | 10/27/2011 14:09:32;E;114249.grass1.man.poznan.pl;user=drmaa group=drmaa jobname=none queue=shortq ctime=1319717371 qtime=1319717371 etime=1319717371 start=1319717372 owner=drmaa@grass1.man.poznan.pl exec_host=grass4.man.poznan.pl/0 Resource_List.neednodes=1 Resource_List.nodect=1 Resource_List.nodes=1 Resource_List.walltime=02:00:00 session=28561 end=1319717372 Exit_status=0 resources_used.cput=00:00:00 resources_used.mem=0kb resources_used.vmem=0kb resources_used.walltime=00:00:00 |
---|
| 620 | */ |
---|
| 621 | while ((line = fsd_readline(fhandle)) != NULL) |
---|
| 622 | { |
---|
[35] | 623 | |
---|
| 624 | if (line[20] == 'E' && strncmp(line + 22, job_id, strlen(job_id)) == 0 ) |
---|
[34] | 625 | { |
---|
| 626 | char *p = NULL; |
---|
| 627 | |
---|
| 628 | fsd_log_debug(("Matched accounting log record = %s", line)); |
---|
| 629 | |
---|
| 630 | if (!(exec_host = strstr(line, "exec_host"))) |
---|
| 631 | { |
---|
| 632 | fsd_log_error(("Invalid accounting record: %s", exec_host)); |
---|
| 633 | break; |
---|
| 634 | } |
---|
| 635 | |
---|
| 636 | exec_host += 10; |
---|
| 637 | |
---|
| 638 | p = exec_host; |
---|
[35] | 639 | while (*p != ' ' && *p != '\0') |
---|
[34] | 640 | p++; |
---|
[35] | 641 | *p = '\0'; |
---|
[34] | 642 | |
---|
| 643 | break; |
---|
| 644 | } |
---|
| 645 | |
---|
| 646 | fsd_free(line); |
---|
| 647 | } |
---|
| 648 | |
---|
| 649 | if (exec_host) |
---|
| 650 | { |
---|
| 651 | fsd_log_info(("Job %s was executing on hosts %s.", job_id, exec_host)); |
---|
| 652 | exec_host = fsd_strdup(exec_host); |
---|
| 653 | } |
---|
| 654 | else |
---|
| 655 | { |
---|
[35] | 656 | fsd_log_error(("Could not find executions hosts for %s.", job_id)); |
---|
[34] | 657 | } |
---|
| 658 | |
---|
| 659 | if (line) |
---|
| 660 | fsd_free(line); |
---|
| 661 | |
---|
| 662 | fclose(fhandle); |
---|
| 663 | |
---|
| 664 | return exec_host; |
---|
[7] | 665 | } |
---|
| 666 | |
---|
[48] | 667 | void |
---|
| 668 | pbsdrmaa_close_log( pbsdrmaa_log_reader_t * self ) |
---|
| 669 | { |
---|
[21] | 670 | |
---|
[48] | 671 | self->current_offset = ftello(self->fhandle); |
---|
[49] | 672 | |
---|
| 673 | fsd_log_debug(("Closing log file (offset=%d)", self->current_offset)); |
---|
[48] | 674 | |
---|
| 675 | fclose(self->fhandle); |
---|
| 676 | |
---|
| 677 | self->fhandle = NULL; |
---|
| 678 | } |
---|
| 679 | |
---|
| 680 | void |
---|
| 681 | pbsdrmaa_reopen_log( pbsdrmaa_log_reader_t * self ) |
---|
| 682 | { |
---|
[49] | 683 | fsd_log_debug(("Reopening log file: %s (offset=%d)", self->log_path, self->current_offset)); |
---|
| 684 | |
---|
[48] | 685 | if ((self->fhandle = fopen(self->log_path,"r")) == NULL) |
---|
| 686 | { |
---|
| 687 | fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,"Failed to reopen log file"); |
---|
| 688 | } |
---|
| 689 | |
---|
| 690 | if(fseek(self->fhandle, self->current_offset, SEEK_SET) == (off_t) -1) |
---|
| 691 | { |
---|
| 692 | fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,"fseek error"); |
---|
| 693 | } |
---|
| 694 | } |
---|
| 695 | |
---|