[12] | 1 | /* $Id$ */ |
---|
[7] | 2 | /* |
---|
| 3 | * FedStage DRMAA for PBS Pro |
---|
| 4 | * Copyright (C) 2006-2007 FedStage Systems |
---|
| 5 | * |
---|
| 6 | * This program is free software: you can redistribute it and/or modify |
---|
| 7 | * it under the terms of the GNU General Public License as published by |
---|
| 8 | * the Free Software Foundation, either version 3 of the License, or |
---|
| 9 | * (at your option) any later version. |
---|
| 10 | * |
---|
| 11 | * This program is distributed in the hope that it will be useful, |
---|
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
| 14 | * GNU General Public License for more details. |
---|
| 15 | * |
---|
| 16 | * You should have received a copy of the GNU General Public License |
---|
| 17 | * along with this program. If not, see <http://www.gnu.org/licenses/>. |
---|
| 18 | */ |
---|
| 19 | |
---|
| 20 | #ifdef HAVE_CONFIG_H |
---|
| 21 | # include <config.h> |
---|
| 22 | #endif |
---|
| 23 | |
---|
| 24 | #include <stdlib.h> |
---|
[35] | 25 | #include <ctype.h> |
---|
[7] | 26 | #include <string.h> |
---|
| 27 | #include <unistd.h> |
---|
| 28 | #include <sys/stat.h> |
---|
| 29 | #include <sys/types.h> |
---|
| 30 | #include <dirent.h> |
---|
| 31 | #include <fcntl.h> |
---|
| 32 | |
---|
| 33 | #include <pbs_ifl.h> |
---|
| 34 | #include <pbs_error.h> |
---|
| 35 | |
---|
| 36 | #include <drmaa_utils/datetime.h> |
---|
| 37 | #include <drmaa_utils/drmaa.h> |
---|
| 38 | #include <drmaa_utils/iter.h> |
---|
| 39 | #include <drmaa_utils/conf.h> |
---|
| 40 | #include <drmaa_utils/session.h> |
---|
| 41 | #include <drmaa_utils/datetime.h> |
---|
| 42 | |
---|
| 43 | #include <pbs_drmaa/job.h> |
---|
| 44 | #include <pbs_drmaa/log_reader.h> |
---|
| 45 | #include <pbs_drmaa/session.h> |
---|
| 46 | #include <pbs_drmaa/submit.h> |
---|
| 47 | #include <pbs_drmaa/util.h> |
---|
[29] | 48 | #include <pbs_drmaa/pbs_attrib.h> |
---|
[7] | 49 | |
---|
| 50 | #include <errno.h> |
---|
| 51 | |
---|
[29] | 52 | enum pbsdrmaa_field_id |
---|
| 53 | { |
---|
| 54 | PBSDRMAA_FLD_ID_DATE = 0, |
---|
| 55 | PBSDRMAA_FLD_ID_EVENT = 1, |
---|
| 56 | PBSDRMAA_FLD_ID_SRC = 2, |
---|
| 57 | PBSDRMAA_FLD_ID_OBJ_TYPE = 3, |
---|
| 58 | PBSDRMAA_FLD_ID_OBJ_ID = 4, |
---|
| 59 | PBSDRMAA_FLD_ID_MSG = 5 |
---|
| 60 | }; |
---|
[7] | 61 | |
---|
| 62 | |
---|
[29] | 63 | #define PBSDRMAA_FLD_MSG_0008 "0008" |
---|
| 64 | #define PBSDRMAA_FLD_MSG_0010 "0010" |
---|
[7] | 65 | |
---|
[29] | 66 | enum pbsdrmaa_event_type |
---|
| 67 | { |
---|
| 68 | pbsdrmaa_event_0008 = 8, |
---|
| 69 | pbsdrmaa_event_0010 = 10 |
---|
| 70 | }; |
---|
[7] | 71 | |
---|
[29] | 72 | static void pbsdrmaa_read_log(); |
---|
[7] | 73 | |
---|
[29] | 74 | static void pbsdrmaa_select_file_wait_thread( pbsdrmaa_log_reader_t * self); |
---|
[21] | 75 | |
---|
[29] | 76 | char *pbsdrmaa_read_line_wait_thread( pbsdrmaa_log_reader_t * self); |
---|
[21] | 77 | |
---|
[29] | 78 | static time_t pbsdrmaa_parse_log_timestamp(const char *timestamp, char *unixtime_str, size_t size); |
---|
[21] | 79 | |
---|
[29] | 80 | static char *pbsdrmaa_get_exec_host_from_accountig(pbsdrmaa_log_reader_t * log_reader, const char *job_id); |
---|
[7] | 81 | |
---|
[25] | 82 | /* |
---|
| 83 | * Snippets from log files |
---|
| 84 | * |
---|
| 85 | * PBS Pro |
---|
| 86 | * |
---|
| 87 | 10/11/2011 14:43:29;0008;Server@nova;Job;2127218.nova;Job Queued at request of mamonski@endor.wcss.wroc.pl, owner = mamonski@endor.wcss.wroc.pl, job name = STDIN, queue = normal |
---|
| 88 | 10/11/2011 14:43:31;0008;Server@nova;Job;2127218.nova;Job Modified at request of Scheduler@nova.wcss.wroc.pl |
---|
| 89 | 10/11/2011 14:43:31;0008;Server@nova;Job;2127218.nova;Job Run at request of Scheduler@nova.wcss.wroc.pl on exec_vnode (wn698:ncpus=3:mem=2048000kb)+(wn700:ncpus=3:mem=2048000kb) |
---|
| 90 | 10/11/2011 14:43:31;0008;Server@nova;Job;2127218.nova;Job Modified at request of Scheduler@nova.wcss.wroc.pl |
---|
| 91 | 10/11/2011 14:43:32;0010;Server@nova;Job;2127218.nova;Exit_status=0 resources_used.cpupercent=0 resources_used.cput=00:00:00 resources_used.mem=1768kb resources_used.ncpus=6 resources_used.vmem=19228kb resources_used.walltime=00:00:01 |
---|
| 92 | |
---|
| 93 | * |
---|
| 94 | * Torque |
---|
| 95 | * |
---|
| 96 | 10/11/2011 14:47:59;0008;PBS_Server;Job;15545337.batch.grid.cyf-kr.edu.pl;Job Queued at request of plgmamonski@ui.cyf-kr.edu.pl, owner = plgmamonski@ui.cyf-kr.edu.pl, job name = STDIN, queue = l_short |
---|
| 97 | 10/11/2011 14:48:23;0008;PBS_Server;Job;15545337.batch.grid.cyf-kr.edu.pl;Job Run at request of root@batch.grid.cyf-kr.edu.pl |
---|
| 98 | 10/11/2011 14:48:24;0010;PBS_Server;Job;15545337.batch.grid.cyf-kr.edu.pl;Exit_status=0 resources_used.cput=00:00:00 resources_used.mem=720kb resources_used.vmem=13308kb resources_used.walltime=00:00:00 |
---|
| 99 | |
---|
[29] | 100 | deleting job: |
---|
| 101 | I . PBS Pro |
---|
| 102 | a) in Q state |
---|
| 103 | 10/16/2011 09:49:25;0008;Server@grass1;Job;2178.grass1.man.poznan.pl;Job Queued at request of mmamonski@grass1.man.poznan.pl, owner = mmamonski@grass1.man.poznan.pl, job name = STDIN, queue = workq |
---|
| 104 | 10/16/2011 09:49:25;0008;Server@grass1;Job;2178.grass1.man.poznan.pl;Job Modified at request of Scheduler@grass1.man.poznan.pl |
---|
| 105 | 10/16/2011 09:49:37;0008;Server@grass1;Job;2178.grass1.man.poznan.pl;Job to be deleted at request of mmamonski@grass1.man.poznan.pl |
---|
| 106 | 10/16/2011 09:49:37;0100;Server@grass1;Job;2178.grass1.man.poznan.pl;dequeuing from workq, state 5 |
---|
| 107 | |
---|
| 108 | |
---|
| 109 | b) in R state |
---|
| 110 | 10/16/2011 09:45:12;0080;Server@grass1;Job;2177.grass1.man.poznan.pl;delete job request received |
---|
| 111 | 10/16/2011 09:45:12;0008;Server@grass1;Job;2177.grass1.man.poznan.pl;Job sent signal TermJob on delete |
---|
| 112 | 10/16/2011 09:45:12;0008;Server@grass1;Job;2177.grass1.man.poznan.pl;Job to be deleted at request of mmamonski@grass1.man.poznan.pl |
---|
| 113 | 10/16/2011 09:45:12;0010;Server@grass1;Job;2177.grass1.man.poznan.pl;Exit_status=271 resources_used.cpupercent=0 resources_used.cput=00:00:00 resources_used.mem=2772kb resources_used.ncpus=1 resources_used.vmem=199288kb resources_used.walltime=00:00:26 |
---|
| 114 | 10/16/2011 09:45:12;0100;Server@grass1;Job;2177.grass1.man.poznan.pl;dequeuing from workq, state 5 |
---|
| 115 | |
---|
| 116 | II. Torque |
---|
| 117 | a) in Q state |
---|
| 118 | 10/15/2011 21:19:25;0008;PBS_Server;Job;113045.grass1.man.poznan.pl;Job deleted at request of mmamonski@grass1.man.poznan.pl |
---|
| 119 | 10/15/2011 21:19:25;0100;PBS_Server;Job;113045.grass1.man.poznan.pl;dequeuing from batch, state EXITING |
---|
| 120 | |
---|
| 121 | b) in R state |
---|
| 122 | 10/15/2011 21:19:47;0008;PBS_Server;Job;113046.grass1.man.poznan.pl;Job deleted at request of mmamonski@grass1.man.poznan.pl |
---|
| 123 | 10/15/2011 21:19:47;0008;PBS_Server;Job;113046.grass1.man.poznan.pl;Job sent signal SIGTERM on delete |
---|
| 124 | 10/15/2011 21:19:47;0010;PBS_Server;Job;113046.grass1.man.poznan.pl;Exit_status=271 resources_used.cput=00:00:00 resources_used.mem=0kb resources_used.vmem=0kb resources_used.walltime=00:00:10 |
---|
| 125 | |
---|
| 126 | Log closed: |
---|
| 127 | 10/16/2011 00:00:17;0002;PBS_Server;Svr;Log;Log closed |
---|
| 128 | |
---|
[25] | 129 | */ |
---|
[7] | 130 | pbsdrmaa_log_reader_t * |
---|
[29] | 131 | pbsdrmaa_log_reader_new( fsd_drmaa_session_t *session ) |
---|
[7] | 132 | { |
---|
| 133 | pbsdrmaa_log_reader_t *volatile self = NULL; |
---|
| 134 | |
---|
| 135 | fsd_log_enter(("")); |
---|
[29] | 136 | |
---|
[7] | 137 | TRY |
---|
| 138 | { |
---|
| 139 | fsd_malloc(self, pbsdrmaa_log_reader_t ); |
---|
| 140 | |
---|
| 141 | self->session = session; |
---|
[29] | 142 | |
---|
| 143 | self->select_file = pbsdrmaa_select_file_wait_thread; |
---|
[7] | 144 | self->read_log = pbsdrmaa_read_log; |
---|
| 145 | |
---|
| 146 | self->run_flag = true; |
---|
[29] | 147 | self->fhandle = NULL; |
---|
[7] | 148 | self->date_changed = true; |
---|
| 149 | self->first_open = true; |
---|
| 150 | |
---|
| 151 | } |
---|
| 152 | EXCEPT_DEFAULT |
---|
| 153 | { |
---|
| 154 | if( self != NULL) |
---|
| 155 | fsd_free(self); |
---|
| 156 | |
---|
| 157 | fsd_exc_reraise(); |
---|
| 158 | } |
---|
| 159 | END_TRY |
---|
[29] | 160 | |
---|
[7] | 161 | fsd_log_return(("")); |
---|
[29] | 162 | |
---|
[7] | 163 | return self; |
---|
| 164 | } |
---|
| 165 | |
---|
[21] | 166 | |
---|
[7] | 167 | void |
---|
| 168 | pbsdrmaa_log_reader_destroy ( pbsdrmaa_log_reader_t * self ) |
---|
| 169 | { |
---|
| 170 | fsd_log_enter(("")); |
---|
| 171 | TRY |
---|
| 172 | { |
---|
| 173 | if(self != NULL) |
---|
| 174 | { |
---|
| 175 | fsd_free(self); |
---|
[29] | 176 | } |
---|
[7] | 177 | } |
---|
| 178 | EXCEPT_DEFAULT |
---|
| 179 | { |
---|
| 180 | fsd_exc_reraise(); |
---|
| 181 | } |
---|
| 182 | END_TRY |
---|
| 183 | |
---|
| 184 | fsd_log_return(("")); |
---|
| 185 | } |
---|
| 186 | |
---|
| 187 | |
---|
[29] | 188 | void |
---|
[7] | 189 | pbsdrmaa_read_log( pbsdrmaa_log_reader_t * self ) |
---|
| 190 | { |
---|
| 191 | fsd_log_enter(("")); |
---|
| 192 | |
---|
[29] | 193 | fsd_mutex_lock( &self->session->mutex ); |
---|
[8] | 194 | |
---|
[7] | 195 | TRY |
---|
[29] | 196 | { |
---|
[7] | 197 | while( self->run_flag ) |
---|
[29] | 198 | { |
---|
| 199 | TRY |
---|
[7] | 200 | { |
---|
[29] | 201 | char *line = NULL; |
---|
[8] | 202 | |
---|
[29] | 203 | self->select_file(self); |
---|
[7] | 204 | |
---|
[29] | 205 | while ((line = fsd_readline(self->fhandle)) != NULL) |
---|
| 206 | { |
---|
| 207 | int field_id = PBSDRMAA_FLD_ID_DATE; |
---|
| 208 | char *tok_ctx = NULL; |
---|
| 209 | char *field_token = NULL; |
---|
| 210 | char *event_timestamp = NULL; |
---|
| 211 | int event_type = -1; |
---|
| 212 | fsd_job_t *job = NULL; |
---|
[7] | 213 | |
---|
[29] | 214 | /* at first detect if this not the end of log file */ |
---|
| 215 | if (strstr(line, "Log;Log closed")) /*TODO try to be more effective and safe */ |
---|
| 216 | { |
---|
| 217 | fsd_log_debug(("WT - Date changed. Closing log file")); |
---|
| 218 | self->date_changed = true; |
---|
[31] | 219 | goto cleanup; |
---|
[29] | 220 | } |
---|
[7] | 221 | |
---|
[29] | 222 | for (field_token = strtok_r(line, ";", &tok_ctx); field_token; field_token = strtok_r(NULL, ";", &tok_ctx), field_id++) |
---|
| 223 | { |
---|
| 224 | if ( field_id == PBSDRMAA_FLD_ID_DATE) |
---|
| 225 | { |
---|
| 226 | event_timestamp = field_token; |
---|
| 227 | } |
---|
| 228 | else if ( field_id == PBSDRMAA_FLD_ID_EVENT) |
---|
| 229 | { |
---|
| 230 | if (strncmp(field_token, PBSDRMAA_FLD_MSG_0008, 4) == 0) |
---|
| 231 | event_type = pbsdrmaa_event_0008; |
---|
| 232 | else if (strncmp(field_token, PBSDRMAA_FLD_MSG_0010, 4) == 0) |
---|
| 233 | event_type = pbsdrmaa_event_0010; |
---|
| 234 | else |
---|
[30] | 235 | { |
---|
[31] | 236 | goto cleanup; /*we are interested only in the above log messages */ |
---|
[30] | 237 | } |
---|
[29] | 238 | } |
---|
| 239 | else if ( field_id == PBSDRMAA_FLD_ID_SRC) |
---|
| 240 | { |
---|
| 241 | /* not used ignore */ |
---|
| 242 | } |
---|
| 243 | else if (field_id == PBSDRMAA_FLD_ID_OBJ_TYPE) |
---|
| 244 | { |
---|
| 245 | if (strncmp(field_token, "Job", 3) != 0) |
---|
[30] | 246 | { |
---|
[31] | 247 | goto cleanup; /* we are interested only in job events */ |
---|
[30] | 248 | } |
---|
[29] | 249 | } |
---|
| 250 | else if (field_id == PBSDRMAA_FLD_ID_OBJ_ID) |
---|
| 251 | { |
---|
| 252 | const char *event_jobid = field_token; |
---|
[31] | 253 | |
---|
| 254 | if (!isdigit(event_jobid[0])) |
---|
| 255 | { |
---|
| 256 | fsd_log_debug(("WT - Invalid job: %s", event_jobid)); |
---|
| 257 | goto cleanup; |
---|
| 258 | } |
---|
[26] | 259 | |
---|
[31] | 260 | job = self->session->get_job( self->session, event_jobid ); |
---|
| 261 | |
---|
| 262 | if( job ) |
---|
[25] | 263 | { |
---|
[31] | 264 | fsd_log_debug(("WT - Found job event: %s", event_jobid)); |
---|
[29] | 265 | } |
---|
[31] | 266 | else |
---|
| 267 | { |
---|
| 268 | fsd_log_debug(("WT - Unknown job: %s", event_jobid)); /* Not a DRMAA job */ |
---|
| 269 | goto cleanup; |
---|
| 270 | } |
---|
| 271 | } |
---|
[29] | 272 | else if (field_id == PBSDRMAA_FLD_ID_MSG) |
---|
| 273 | { |
---|
| 274 | char *msg = field_token; |
---|
| 275 | struct batch_status status; |
---|
| 276 | struct attrl *attribs = NULL; |
---|
| 277 | bool in_running_state = false; |
---|
[7] | 278 | |
---|
[29] | 279 | if (event_type == pbsdrmaa_event_0008 && strncmp(msg, "Job Queued", 10) == 0) |
---|
| 280 | { |
---|
| 281 | /* Queued |
---|
| 282 | * PBS Pro: 10/11/2011 14:43:29;0008;Server@nova;Job;2127218.nova;Job Queued at request of mamonski@endor.wcss.wroc.pl, owner = mamonski@endor.wcss.wroc.pl, job name = STDIN, queue = normal |
---|
| 283 | * Torque: 10/11/2011 14:47:59;0008;PBS_Server;Job;15545337.batch.grid.cyf-kr.edu.pl;Job Queued at request of plgmamonski@ui.cyf-kr.edu.pl, owner = plgmamonski@ui.cyf-kr.edu.pl, job name = STDIN, queue = l_short |
---|
| 284 | */ |
---|
| 285 | char *p_queue = NULL; |
---|
[25] | 286 | |
---|
[31] | 287 | fsd_log_info(("WT - Detected queuing of job %s", job->job_id)); |
---|
[30] | 288 | |
---|
[29] | 289 | if ((p_queue = strstr(msg,"queue =")) == NULL) |
---|
| 290 | fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,"No queue attribute found in log line = %s", line); |
---|
[25] | 291 | |
---|
[30] | 292 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_JOB_STATE, "Q"); |
---|
| 293 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_QUEUE, p_queue + 7); |
---|
| 294 | } |
---|
| 295 | else if (event_type == pbsdrmaa_event_0008 && strncmp(msg, "Job Run", 7) == 0) |
---|
| 296 | { |
---|
| 297 | /* |
---|
| 298 | * Running |
---|
| 299 | * Torque: 10/11/2011 14:48:23;0008;PBS_Server;Job;15545337.batch.grid.cyf-kr.edu.pl;Job Run at request of root@batch.grid.cyf-kr.edu.pl |
---|
| 300 | * PBS Pro: 10/11/2011 14:43:31;0008;Server@nova;Job;2127218.nova;Job Run at request of Scheduler@nova.wcss.wroc.pl on exec_vnode (wn698:ncpus=3:mem=2048000kb)+(wn700:ncpus=3:mem=2048000kb) |
---|
| 301 | */ |
---|
| 302 | char timestamp_unix[64]; |
---|
| 303 | |
---|
| 304 | fsd_log_info(("WT - Detected start of job %s", job->job_id)); |
---|
| 305 | |
---|
[29] | 306 | (void)pbsdrmaa_parse_log_timestamp(event_timestamp, timestamp_unix, sizeof(timestamp_unix)); |
---|
| 307 | |
---|
| 308 | in_running_state = true; |
---|
| 309 | |
---|
| 310 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_JOB_STATE, "R"); |
---|
| 311 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_START_TIME, timestamp_unix); |
---|
[26] | 312 | #ifdef PBS_PROFESSIONAL |
---|
| 313 | { |
---|
| 314 | char *p_vnode = NULL; |
---|
[32] | 315 | if ((p_vnode = strstr(msg, "exec_vnode"))) |
---|
[26] | 316 | { |
---|
[32] | 317 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_EXECUTION_VNODE, p_vnode + 11); |
---|
[26] | 318 | } |
---|
| 319 | } |
---|
| 320 | #endif |
---|
| 321 | } |
---|
[36] | 322 | #ifndef PBS_PROFESSIONAL |
---|
[33] | 323 | else if (event_type == pbsdrmaa_event_0008 && strncmp(msg, "Job deleted", 11) == 0) |
---|
[29] | 324 | #else |
---|
[33] | 325 | else if (event_type == pbsdrmaa_event_0008 && strncmp(msg, "Job to be deleted", 17) == 0) |
---|
[29] | 326 | #endif |
---|
| 327 | { |
---|
| 328 | /* Deleted |
---|
| 329 | * PBS Pro: 10/16/2011 09:45:12;0008;Server@grass1;Job;2177.grass1.man.poznan.pl;Job to be deleted at request of mmamonski@grass1.man.poznan.pl |
---|
| 330 | * Torque: 10/15/2011 21:19:25;0008;PBS_Server;Job;113045.grass1.man.poznan.pl;Job deleted at request of mmamonski@grass1.man.poznan.pl |
---|
| 331 | */ |
---|
| 332 | char timestamp_unix[64]; |
---|
[7] | 333 | |
---|
[30] | 334 | fsd_log_info(("WT - Detected deletion of job %s", job->job_id)); |
---|
| 335 | |
---|
[29] | 336 | (void)pbsdrmaa_parse_log_timestamp(event_timestamp, timestamp_unix, sizeof(timestamp_unix)); |
---|
[7] | 337 | |
---|
[29] | 338 | if (job->state < DRMAA_PS_RUNNING) |
---|
| 339 | { |
---|
[33] | 340 | fsd_log_info(("WT - Job %s killed before entering running state (%d).", job->job_id, job->state)); |
---|
[29] | 341 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_JOB_STATE, "C"); |
---|
| 342 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_MTIME, timestamp_unix); |
---|
| 343 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_EXIT_STATUS, "-2"); |
---|
| 344 | } |
---|
| 345 | else |
---|
| 346 | { |
---|
[36] | 347 | fsd_log_info(("WT - Job %s killed after entering running state (%d). Waiting for Completed event...", job->job_id, job->state)); |
---|
[31] | 348 | goto cleanup; /* job was started, ignore, wait for Exit_status message */ |
---|
[29] | 349 | } |
---|
| 350 | } |
---|
[31] | 351 | else if (event_type == pbsdrmaa_event_0010 && (strncmp(msg, "Exit_status=", 12) == 0)) |
---|
[29] | 352 | { |
---|
| 353 | /* Completed: |
---|
| 354 | * PBS Pro: 10/11/2011 14:43:32;0010;Server@nova;Job;2127218.nova;Exit_status=0 resources_used.cpupercent=0 resources_used.cput=00:00:00 resources_used.mem=1768kb resources_used.ncpus=6 resources_used.vmem=19228kb resources_used.walltime=00:00:01 |
---|
| 355 | * Torque: 10/11/2011 14:48:24;0010;PBS_Server;Job;15545337.batch.grid.cyf-kr.edu.pl;Exit_status=0 resources_used.cput=00:00:00 resources_used.mem=720kb resources_used.vmem=13308kb resources_used.walltime=00:00:00 |
---|
| 356 | */ |
---|
| 357 | char timestamp_unix[64]; |
---|
| 358 | time_t timestamp_time_t = pbsdrmaa_parse_log_timestamp(event_timestamp, timestamp_unix, sizeof(timestamp_unix)); |
---|
| 359 | char *tok_ctx2 = NULL; |
---|
| 360 | char *token = NULL; |
---|
[7] | 361 | |
---|
[29] | 362 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_JOB_STATE, "C"); |
---|
| 363 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_MTIME, timestamp_unix); |
---|
[7] | 364 | |
---|
[29] | 365 | /* tokenize !!! */ |
---|
| 366 | for (token = strtok_r(msg, " ", &tok_ctx2); token; token = strtok_r(NULL, " ", &tok_ctx2)) |
---|
| 367 | { |
---|
| 368 | if (strncmp(token, "Exit_status=", 12) == 0) |
---|
| 369 | { |
---|
[31] | 370 | token[11] = '\0'; |
---|
[33] | 371 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_EXIT_STATUS, token + 12); |
---|
[29] | 372 | fsd_log_info(("WT - Completion of job %s (Exit_status=%s) detected after %d seconds", job->job_id, token+12, (int)(time(NULL) - timestamp_time_t) )); |
---|
| 373 | } |
---|
| 374 | else if (strncmp(token, "resources_used.cput=", 20) == 0) |
---|
| 375 | { |
---|
[31] | 376 | token[19] = '\0'; |
---|
[29] | 377 | attribs = pbsdrmaa_add_attr(attribs, token, token + 20); |
---|
| 378 | } |
---|
| 379 | else if (strncmp(token, "resources_used.mem=", 19) == 0) |
---|
| 380 | { |
---|
[31] | 381 | token[18] = '\0'; |
---|
[29] | 382 | attribs = pbsdrmaa_add_attr(attribs, token, token + 19); |
---|
| 383 | } |
---|
| 384 | else if (strncmp(token, "resources_used.vmem=", 20) == 0) |
---|
| 385 | { |
---|
[31] | 386 | token[19] = '\0'; |
---|
[29] | 387 | attribs = pbsdrmaa_add_attr(attribs, token, token + 20); |
---|
| 388 | } |
---|
| 389 | else if (strncmp(token, "resources_used.walltime=", 24) == 0) |
---|
| 390 | { |
---|
[31] | 391 | token[23] = '\0'; |
---|
[29] | 392 | attribs = pbsdrmaa_add_attr(attribs, token, token + 24); |
---|
| 393 | } |
---|
| 394 | } |
---|
[7] | 395 | |
---|
[29] | 396 | if (!job->execution_hosts) |
---|
| 397 | { |
---|
| 398 | char *exec_host = NULL; |
---|
| 399 | fsd_log_info(("WT - No execution host information for job %s. Reading accounting logs...", job->job_id)); |
---|
| 400 | exec_host = pbsdrmaa_get_exec_host_from_accountig(self, job->job_id); |
---|
[34] | 401 | if (exec_host) |
---|
| 402 | { |
---|
| 403 | attribs = pbsdrmaa_add_attr(attribs, PBSDRMAA_EXECUTION_HOST, exec_host); |
---|
| 404 | fsd_free(exec_host); |
---|
| 405 | } |
---|
[29] | 406 | } |
---|
| 407 | } |
---|
| 408 | else |
---|
| 409 | { |
---|
[36] | 410 | fsd_log_debug(("Ignoring msg(type=%d) = %s", event_type, msg)); |
---|
[31] | 411 | goto cleanup; /* ignore other job events*/ |
---|
[29] | 412 | } |
---|
[35] | 413 | |
---|
| 414 | fsd_log_debug(("WT - updating job: %s", job->job_id )); |
---|
| 415 | status.name = job->job_id; |
---|
| 416 | status.attribs = attribs; |
---|
[25] | 417 | |
---|
[35] | 418 | ((pbsdrmaa_job_t *)job)->update( job, &status ); |
---|
| 419 | |
---|
[29] | 420 | if ( in_running_state ) |
---|
| 421 | { |
---|
| 422 | fsd_log_debug(("WT - forcing update of job: %s", job->job_id )); |
---|
[18] | 423 | TRY |
---|
| 424 | { |
---|
[29] | 425 | job->update_status( job ); |
---|
[18] | 426 | } |
---|
| 427 | EXCEPT_DEFAULT |
---|
| 428 | { |
---|
| 429 | /*TODO: distinguish between invalid job and internal errors */ |
---|
[29] | 430 | fsd_log_debug(("Job finished just after entering running state: %s", job->job_id)); |
---|
[18] | 431 | } |
---|
| 432 | END_TRY |
---|
[29] | 433 | } |
---|
| 434 | |
---|
| 435 | |
---|
[35] | 436 | pbsdrmaa_free_attrl(attribs); /* TODO free on exception */ |
---|
[29] | 437 | |
---|
| 438 | fsd_cond_broadcast( &job->status_cond); |
---|
[7] | 439 | fsd_cond_broadcast( &self->session->wait_condition ); |
---|
| 440 | |
---|
[29] | 441 | } |
---|
| 442 | else |
---|
| 443 | { |
---|
| 444 | fsd_assert(0); /*not reached */ |
---|
| 445 | } |
---|
| 446 | } |
---|
[31] | 447 | cleanup: |
---|
| 448 | fsd_free(line); /* TODO what about exceptions */ |
---|
| 449 | if ( job ) |
---|
| 450 | job->release( job ); |
---|
[29] | 451 | |
---|
[31] | 452 | |
---|
| 453 | |
---|
[29] | 454 | } /* end of while getline loop */ |
---|
| 455 | |
---|
[30] | 456 | |
---|
[29] | 457 | |
---|
[33] | 458 | fsd_mutex_unlock( &self->session->mutex ); |
---|
| 459 | |
---|
[38] | 460 | usleep(300000); /* 300 ms - consider using inotify - but this would not work with NFS */ |
---|
[7] | 461 | |
---|
[33] | 462 | fsd_mutex_lock( &self->session->mutex ); |
---|
[19] | 463 | |
---|
[33] | 464 | self->run_flag = self->session->wait_thread_run_flag; |
---|
[7] | 465 | } |
---|
[29] | 466 | EXCEPT_DEFAULT |
---|
| 467 | { |
---|
| 468 | const fsd_exc_t *e = fsd_exc_get(); |
---|
| 469 | /* Its better to exit and communicate error rather then let the application to hang */ |
---|
| 470 | fsd_log_fatal(( "Exception in wait thread: <%d:%s>. Exiting !!!", e->code(e), e->message(e) )); |
---|
| 471 | exit(1); |
---|
| 472 | } |
---|
| 473 | END_TRY |
---|
| 474 | } |
---|
[7] | 475 | |
---|
[29] | 476 | if(self->fhandle) |
---|
| 477 | fclose(self->fhandle); |
---|
| 478 | |
---|
| 479 | fsd_log_debug(("WT - Log file closed")); |
---|
[7] | 480 | } |
---|
| 481 | FINALLY |
---|
| 482 | { |
---|
[29] | 483 | fsd_log_debug(("WT - Terminated.")); |
---|
| 484 | fsd_mutex_unlock( &self->session->mutex ); /**/ |
---|
[7] | 485 | } |
---|
| 486 | END_TRY |
---|
| 487 | |
---|
| 488 | fsd_log_return(("")); |
---|
| 489 | } |
---|
| 490 | |
---|
| 491 | void |
---|
[8] | 492 | pbsdrmaa_select_file_wait_thread ( pbsdrmaa_log_reader_t * self ) |
---|
[7] | 493 | { |
---|
| 494 | pbsdrmaa_session_t *pbssession = (pbsdrmaa_session_t*) self->session; |
---|
| 495 | |
---|
[29] | 496 | if (self->date_changed) |
---|
| 497 | { |
---|
[7] | 498 | char * log_path = NULL; |
---|
| 499 | int num_tries = 0; |
---|
| 500 | struct tm tm; |
---|
| 501 | |
---|
| 502 | fsd_log_enter(("")); |
---|
| 503 | |
---|
| 504 | if(!self->first_open) |
---|
| 505 | time(&self->t); |
---|
| 506 | else |
---|
| 507 | self->t = pbssession->log_file_initial_time; |
---|
| 508 | |
---|
| 509 | localtime_r(&self->t,&tm); |
---|
| 510 | |
---|
| 511 | #define DRMAA_WAIT_THREAD_MAX_TRIES (12) |
---|
| 512 | /* generate new date, close file and open new */ |
---|
[29] | 513 | log_path = fsd_asprintf("%s/server_logs/%04d%02d%02d", pbssession->pbs_home, tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday); |
---|
[7] | 514 | |
---|
[29] | 515 | if(self->fhandle) |
---|
| 516 | fclose(self->fhandle); |
---|
[7] | 517 | |
---|
[29] | 518 | fsd_log_info(("Opening log file: %s",log_path)); |
---|
[7] | 519 | |
---|
| 520 | retry: |
---|
[31] | 521 | if ((self->fhandle = fopen(log_path,"r")) == NULL && (num_tries > DRMAA_WAIT_THREAD_MAX_TRIES || self->first_open)) |
---|
[29] | 522 | { |
---|
[7] | 523 | fsd_log_error(("Can't open log file. Verify pbs_home. Running standard wait_thread.")); |
---|
[29] | 524 | fsd_log_error(("Remember that without keep_completed set the standard wait_thread won't provide information about job exit status")); |
---|
[7] | 525 | /*pbssession->super.enable_wait_thread = false;*/ /* run not wait_thread */ |
---|
| 526 | pbssession->wait_thread_log = false; |
---|
| 527 | pbssession->super.wait_thread = pbssession->super_wait_thread; |
---|
| 528 | pbssession->super.wait_thread(self->session); |
---|
[29] | 529 | } |
---|
| 530 | else if ( self->fhandle == NULL ) |
---|
| 531 | { /* Torque seems not to create a new file immediately after the old one is closed */ |
---|
[7] | 532 | fsd_log_warning(("Can't open log file: %s. Retries count: %d", log_path, num_tries)); |
---|
| 533 | num_tries++; |
---|
[29] | 534 | sleep(2 * num_tries); |
---|
[7] | 535 | goto retry; |
---|
[29] | 536 | } |
---|
[7] | 537 | |
---|
| 538 | fsd_free(log_path); |
---|
| 539 | |
---|
| 540 | fsd_log_debug(("Log file opened")); |
---|
| 541 | |
---|
[29] | 542 | if(self->first_open) |
---|
| 543 | { |
---|
[7] | 544 | fsd_log_debug(("Log file lseek")); |
---|
[29] | 545 | |
---|
| 546 | if(fseek(self->fhandle, pbssession->log_file_initial_size, SEEK_SET) == (off_t) -1) |
---|
| 547 | { |
---|
| 548 | fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,"fseek error"); |
---|
| 549 | } |
---|
[7] | 550 | self->first_open = false; |
---|
[29] | 551 | } |
---|
[7] | 552 | |
---|
| 553 | self->date_changed = false; |
---|
| 554 | |
---|
| 555 | fsd_log_return(("")); |
---|
| 556 | } |
---|
| 557 | } |
---|
| 558 | |
---|
[29] | 559 | time_t |
---|
| 560 | pbsdrmaa_parse_log_timestamp(const char *timestamp, char *unixtime_str, size_t size) |
---|
[7] | 561 | { |
---|
[29] | 562 | struct tm temp_time_tm; |
---|
| 563 | memset(&temp_time_tm, 0, sizeof(temp_time_tm)); |
---|
| 564 | temp_time_tm.tm_isdst = -1; |
---|
[7] | 565 | |
---|
[29] | 566 | if (strptime(timestamp, "%m/%d/%Y %H:%M:%S", &temp_time_tm) == NULL) |
---|
| 567 | { |
---|
| 568 | fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,"WT - failed to parse log timestamp: %s", timestamp); |
---|
| 569 | } |
---|
[7] | 570 | else |
---|
[29] | 571 | { |
---|
| 572 | time_t temp_time = mktime(&temp_time_tm); |
---|
| 573 | snprintf(unixtime_str, size, "%lu", temp_time); |
---|
| 574 | return temp_time; |
---|
| 575 | } |
---|
[7] | 576 | } |
---|
| 577 | |
---|
[29] | 578 | char * |
---|
| 579 | pbsdrmaa_get_exec_host_from_accountig(pbsdrmaa_log_reader_t * log_reader, const char *job_id) |
---|
[7] | 580 | { |
---|
[34] | 581 | pbsdrmaa_session_t *pbssession = (pbsdrmaa_session_t*) log_reader->session; |
---|
| 582 | struct tm tm; |
---|
| 583 | time_t tm_t; |
---|
| 584 | char *line = NULL; |
---|
[35] | 585 | char *exec_host = NULL; |
---|
| 586 | char *log_path = NULL; |
---|
[34] | 587 | FILE *fhandle = NULL; |
---|
| 588 | |
---|
[35] | 589 | fsd_log_enter(("(job_id=%s)", job_id)); |
---|
[34] | 590 | |
---|
| 591 | tm_t = time(NULL); |
---|
[35] | 592 | localtime_r(&tm_t, &tm); |
---|
[34] | 593 | |
---|
| 594 | log_path = fsd_asprintf("%s/server_priv/accounting/%04d%02d%02d", pbssession->pbs_home, tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday); |
---|
| 595 | |
---|
| 596 | fsd_log_info(("Opening accounting log file: %s", log_path)); |
---|
| 597 | |
---|
| 598 | if ((fhandle = fopen(log_path, "r")) == NULL) |
---|
| 599 | { |
---|
[35] | 600 | fsd_log_error(("Failed to open accounting log file: %s", log_path)); |
---|
[34] | 601 | fsd_free(log_path); |
---|
| 602 | return NULL; |
---|
| 603 | } |
---|
| 604 | |
---|
| 605 | fsd_free(log_path); |
---|
| 606 | /* |
---|
| 607 | 10/27/2011 14:09:32;E;114249.grass1.man.poznan.pl;user=drmaa group=drmaa jobname=none queue=shortq ctime=1319717371 qtime=1319717371 etime=1319717371 start=1319717372 owner=drmaa@grass1.man.poznan.pl exec_host=grass4.man.poznan.pl/0 Resource_List.neednodes=1 Resource_List.nodect=1 Resource_List.nodes=1 Resource_List.walltime=02:00:00 session=28561 end=1319717372 Exit_status=0 resources_used.cput=00:00:00 resources_used.mem=0kb resources_used.vmem=0kb resources_used.walltime=00:00:00 |
---|
| 608 | */ |
---|
| 609 | while ((line = fsd_readline(fhandle)) != NULL) |
---|
| 610 | { |
---|
[35] | 611 | |
---|
| 612 | if (line[20] == 'E' && strncmp(line + 22, job_id, strlen(job_id)) == 0 ) |
---|
[34] | 613 | { |
---|
| 614 | char *p = NULL; |
---|
| 615 | |
---|
| 616 | fsd_log_debug(("Matched accounting log record = %s", line)); |
---|
| 617 | |
---|
| 618 | if (!(exec_host = strstr(line, "exec_host"))) |
---|
| 619 | { |
---|
| 620 | fsd_log_error(("Invalid accounting record: %s", exec_host)); |
---|
| 621 | break; |
---|
| 622 | } |
---|
| 623 | |
---|
| 624 | exec_host += 10; |
---|
| 625 | |
---|
| 626 | p = exec_host; |
---|
[35] | 627 | while (*p != ' ' && *p != '\0') |
---|
[34] | 628 | p++; |
---|
[35] | 629 | *p = '\0'; |
---|
[34] | 630 | |
---|
| 631 | break; |
---|
| 632 | } |
---|
| 633 | |
---|
| 634 | fsd_free(line); |
---|
| 635 | } |
---|
| 636 | |
---|
| 637 | if (exec_host) |
---|
| 638 | { |
---|
| 639 | fsd_log_info(("Job %s was executing on hosts %s.", job_id, exec_host)); |
---|
| 640 | exec_host = fsd_strdup(exec_host); |
---|
| 641 | } |
---|
| 642 | else |
---|
| 643 | { |
---|
[35] | 644 | fsd_log_error(("Could not find executions hosts for %s.", job_id)); |
---|
[34] | 645 | } |
---|
| 646 | |
---|
| 647 | if (line) |
---|
| 648 | fsd_free(line); |
---|
| 649 | |
---|
| 650 | fclose(fhandle); |
---|
| 651 | |
---|
| 652 | return exec_host; |
---|
[7] | 653 | } |
---|
| 654 | |
---|
[21] | 655 | |
---|