#include #include #include #include #include #include #include #include #define LL_CKPT_UNCLEAR 1 #define LL_CKPT_ERROR -1 #define LL_CKPT_JOB_ID_ERR -2 #define LL_CKPT_MALLOC_ERR -3 #define LL_CKPT_SOCKET_ERR -4 #define LL_CKPT_CONF_ERR -6 const char * lldrmaa_err_ckpt(int _llerrno) { switch(_llerrno) { case API_OK: return "Checkpoint completed successfully."; case LL_CKPT_UNCLEAR: return "Checkpoint event did not receive status and the success or failure of the checkpoint is unclear."; case LL_CKPT_ERROR: return "Error occurred attempting to checkpoint."; case LL_CKPT_JOB_ID_ERR: return "Format not valid for job step, not in the form host.jobid.stepid."; case LL_CKPT_MALLOC_ERR: return "Cannot allocate memory."; case LL_CKPT_SOCKET_ERR: return "API cannot create listen socket."; case LL_CKPT_CONF_ERR: return "Configuration file errors."; default: assert(0); return "Invalid error code"; } } /* int lldrmaa_map_ckpt(int _llerrno) { switch(_llerrno) { case API_OK: return FSD_ERRNO_SUCCESS; case LL_CKPT_UNCLEAR: return FSD_ERRNO_INTERNAL_ERROR; case LL_CKPT_ERROR: return FSD_DRMAA_ERRNO_SUSPEND_INCONSISTENT_STATE; case LL_CKPT_JOB_ID_ERR: return FSD_ERRNO_INTERNAL_ERROR; case LL_CKPT_MALLOC_ERR: return FSD_ERRNO_NO_MEMORY; case LL_CKPT_SOCKET_ERR: return FSD_ERRNO_INTERNAL_ERROR; case LL_CKPT_CONF_ERR: return FSD_ERRNO_DENIED_BY_DRM; default: fsd_assert(false); return FSD_ERRNO_INTERNAL_ERROR; } } */ void ckpt_job(char * job_id) { int rc = 0; char buf[256]; cr_error_t cp; LL_ckpt_info ckpt_info; ckpt_info.version=LL_API_VERSION; ckpt_info.step_id=strdup(job_id); ckpt_info.ckptType=CKPT_AND_TERMINATE; ckpt_info.waitType=CKPT_NO_WAIT; ckpt_info.abort_sig=SIGINT; ckpt_info.cp_error_data=NULL; ckpt_info.ckpt_rc = 0; ckpt_info.soft_limit = 0; ckpt_info.hard_limit = 0; rc = ll_ckpt(&ckpt_info); /* TODO: segfault */ if(rc) { printf("job::control: could to job %s. Error code: %d,%s\n", job_id, rc, lldrmaa_err_ckpt(rc) ); printf("Return code from checkpoint: %d\n",ckpt_info.ckpt_rc); printf("Error data: %s\n",cp.error_data); strerror_r(cp.Py_error,buf,256); printf("cp_error_data %d: %s\n",cp.Py_error,buf); strerror_r(cp.Sy_error,buf,256); printf("cp_error_data2 %d: %s\n",cp.Sy_error,buf); } } int main(int argc, char *argv[]) { if(argc<2) { printf("./a \n"); exit(1); } printf("Trying to checkpoint %s\n",argv[1]); ckpt_job(argv[1]); llsubmit(NULL,NULL,NULL,NULL,0); printf("After ckpt\n"); return 0; }