1 | #include <stdio.h> |
---|
2 | #include <stdlib.h> |
---|
3 | |
---|
4 | #include <llapi.h> |
---|
5 | #include <unistd.h> |
---|
6 | #include <fcntl.h> |
---|
7 | #include <string.h> |
---|
8 | #include <errno.h> |
---|
9 | #include <assert.h> |
---|
10 | |
---|
/*
 * Checkpoint return codes recognised by lldrmaa_err_ckpt().
 * The comments repeat the meaning that function reports for each code.
 * Negative values are parenthesised so every macro expands to a single
 * primary expression.  Note that -5 is not defined here.
 */
#define LL_CKPT_UNCLEAR     1    /* no status received; outcome unclear */
#define LL_CKPT_ERROR       (-1) /* error while attempting to checkpoint */
#define LL_CKPT_JOB_ID_ERR  (-2) /* job step not in host.jobid.stepid form */
#define LL_CKPT_MALLOC_ERR  (-3) /* cannot allocate memory */
#define LL_CKPT_SOCKET_ERR  (-4) /* API cannot create listen socket */
#define LL_CKPT_CONF_ERR    (-6) /* configuration file errors */
---|
17 | |
---|
18 | const char * |
---|
19 | lldrmaa_err_ckpt(int _llerrno) |
---|
20 | { |
---|
21 | switch(_llerrno) |
---|
22 | { |
---|
23 | case API_OK: |
---|
24 | return "Checkpoint completed successfully."; |
---|
25 | case LL_CKPT_UNCLEAR: |
---|
26 | return "Checkpoint event did not receive status and the success or failure of the checkpoint is unclear."; |
---|
27 | case LL_CKPT_ERROR: |
---|
28 | return "Error occurred attempting to checkpoint."; |
---|
29 | case LL_CKPT_JOB_ID_ERR: |
---|
30 | return "Format not valid for job step, not in the form host.jobid.stepid."; |
---|
31 | case LL_CKPT_MALLOC_ERR: |
---|
32 | return "Cannot allocate memory."; |
---|
33 | case LL_CKPT_SOCKET_ERR: |
---|
34 | return "API cannot create listen socket."; |
---|
35 | case LL_CKPT_CONF_ERR: |
---|
36 | return "Configuration file errors."; |
---|
37 | default: |
---|
38 | assert(0); |
---|
39 | return "Invalid error code"; |
---|
40 | } |
---|
41 | } |
---|
42 | /* |
---|
43 | int |
---|
44 | lldrmaa_map_ckpt(int _llerrno) |
---|
45 | { |
---|
46 | switch(_llerrno) |
---|
47 | { |
---|
48 | case API_OK: |
---|
49 | return FSD_ERRNO_SUCCESS; |
---|
50 | case LL_CKPT_UNCLEAR: |
---|
51 | return FSD_ERRNO_INTERNAL_ERROR; |
---|
52 | case LL_CKPT_ERROR: |
---|
53 | return FSD_DRMAA_ERRNO_SUSPEND_INCONSISTENT_STATE; |
---|
54 | case LL_CKPT_JOB_ID_ERR: |
---|
55 | return FSD_ERRNO_INTERNAL_ERROR; |
---|
56 | case LL_CKPT_MALLOC_ERR: |
---|
57 | return FSD_ERRNO_NO_MEMORY; |
---|
58 | case LL_CKPT_SOCKET_ERR: |
---|
59 | return FSD_ERRNO_INTERNAL_ERROR; |
---|
60 | case LL_CKPT_CONF_ERR: |
---|
61 | return FSD_ERRNO_DENIED_BY_DRM; |
---|
62 | default: |
---|
63 | fsd_assert(false); |
---|
64 | return FSD_ERRNO_INTERNAL_ERROR; |
---|
65 | } |
---|
66 | } |
---|
67 | */ |
---|
68 | |
---|
69 | void ckpt_job(char * job_id) |
---|
70 | { |
---|
71 | int rc = 0; |
---|
72 | char buf[256]; |
---|
73 | cr_error_t cp; |
---|
74 | LL_ckpt_info ckpt_info; |
---|
75 | ckpt_info.version=LL_API_VERSION; |
---|
76 | ckpt_info.step_id=strdup(job_id); |
---|
77 | ckpt_info.ckptType=CKPT_AND_TERMINATE; |
---|
78 | ckpt_info.waitType=CKPT_NO_WAIT; |
---|
79 | ckpt_info.abort_sig=SIGINT; |
---|
80 | ckpt_info.cp_error_data=NULL; |
---|
81 | ckpt_info.ckpt_rc = 0; |
---|
82 | ckpt_info.soft_limit = 0; |
---|
83 | ckpt_info.hard_limit = 0; |
---|
84 | |
---|
85 | rc = ll_ckpt(&ckpt_info); /* TODO: segfault */ |
---|
86 | |
---|
87 | if(rc) |
---|
88 | { |
---|
89 | printf("job::control: could to job %s. Error code: %d,%s\n", job_id, rc, lldrmaa_err_ckpt(rc) ); |
---|
90 | |
---|
91 | printf("Return code from checkpoint: %d\n",ckpt_info.ckpt_rc); |
---|
92 | printf("Error data: %s\n",cp.error_data); |
---|
93 | strerror_r(cp.Py_error,buf,256); |
---|
94 | printf("cp_error_data %d: %s\n",cp.Py_error,buf); |
---|
95 | strerror_r(cp.Sy_error,buf,256); |
---|
96 | printf("cp_error_data2 %d: %s\n",cp.Sy_error,buf); |
---|
97 | } |
---|
98 | } |
---|
99 | |
---|
/*
 * Command-line entry point.
 *
 * Expects the job step identifier as the first argument.  Without it a
 * usage line is printed and the process exits with status 1.  Otherwise
 * the identifier is handed to ckpt_job() and 0 is returned.
 */
int main(int argc, char *argv[])
{
    char *step;

    if (argc < 2) {
        printf("./a <job_id>\n");
        exit(1);
    }

    step = argv[1];
    printf("Trying to checkpoint %s\n", step);

    ckpt_job(step);

    /*
     * NOTE(review): llsubmit() is called with all-NULL arguments and its
     * result is ignored; the purpose is not evident from this file --
     * confirm whether the call is still needed.
     */
    llsubmit(NULL, NULL, NULL, NULL, 0);
    printf("After ckpt\n");

    return 0;
}
---|