PageRenderTime 59ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 0ms

/src/plugins/jobcomp/pgsql/jobcomp_pgsql.c

https://github.com/cfenoy/slurm
C | 498 lines | 343 code | 56 blank | 99 comment | 77 complexity | 7c3bcffb719732b9c6b60b4875e26186 MD5 | raw file
Possible License(s): GPL-2.0, AGPL-1.0
  1. /*****************************************************************************\
  2. * jobcomp_pgsql.c - Store/Get all information in a postgresql storage.
  3. *
  4. * $Id: storage_pgsql.c 10893 2007-01-29 21:53:48Z da $
  5. *****************************************************************************
  6. * Copyright (C) 2004-2007 The Regents of the University of California.
  7. * Copyright (C) 2008-2009 Lawrence Livermore National Security.
  8. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  9. * Written by Danny Auble <da@llnl.gov>
  10. *
  11. * This file is part of SLURM, a resource management program.
  12. * For details, see <http://www.schedmd.com/slurmdocs/>.
  13. * Please also read the included file: DISCLAIMER.
  14. *
  15. * SLURM is free software; you can redistribute it and/or modify it under
  16. * the terms of the GNU General Public License as published by the Free
  17. * Software Foundation; either version 2 of the License, or (at your option)
  18. * any later version.
  19. *
  20. * In addition, as a special exception, the copyright holders give permission
  21. * to link the code of portions of this program with the OpenSSL library under
  22. * certain conditions as described in each individual source file, and
  23. * distribute linked combinations including the two. You must obey the GNU
  24. * General Public License in all respects for all of the code used other than
  25. * OpenSSL. If you modify file(s) with this exception, you may extend this
  26. * exception to your version of the file(s), but you are not obligated to do
  27. * so. If you do not wish to do so, delete this exception statement from your
  28. * version. If you delete this exception statement from all source files in
  29. * the program, then also delete it here.
  30. *
  31. * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
  32. * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  33. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  34. * details.
  35. *
  36. * You should have received a copy of the GNU General Public License along
  37. * with SLURM; if not, write to the Free Software Foundation, Inc.,
  38. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  39. \*****************************************************************************/
  40. #include "pgsql_jobcomp_process.h"
  41. #include <pwd.h>
  42. #include <grp.h>
  43. #include <sys/types.h>
  44. #include "src/common/parse_time.h"
  45. #include "src/common/node_select.h"
  46. #include "src/common/uid.h"
  47. /*
  48. * These variables are required by the generic plugin interface. If they
  49. * are not found in the plugin, the plugin loader will ignore it.
  50. *
  51. * plugin_name - a string giving a human-readable description of the
  52. * plugin. There is no maximum length, but the symbol must refer to
  53. * a valid string.
  54. *
  55. * plugin_type - a string suggesting the type of the plugin or its
  56. * applicability to a particular form of data or method of data handling.
  57. * If the low-level plugin API is used, the contents of this string are
  58. * unimportant and may be anything. SLURM uses the higher-level plugin
  59. * interface which requires this string to be of the form
  60. *
  61. * <application>/<method>
  62. *
  63. * where <application> is a description of the intended application of
  64. * the plugin (e.g., "jobcomp" for SLURM job completion logging) and <method>
  65. * is a description of how this plugin satisfies that application. SLURM will
  66. * only load job completion logging plugins if the plugin_type string has a
  67. * prefix of "jobcomp/".
  68. *
  69. * plugin_version - an unsigned 32-bit integer giving the version number
  70. * of the plugin. If major and minor revisions are desired, the major
  71. * version number may be multiplied by a suitable magnitude constant such
  72. * as 100 or 1000. Various SLURM versions will likely require a certain
  73. * minimum version for their plugins as the job completion API
  74. * matures.
  75. */
  76. const char plugin_name[] = "Job completion POSTGRESQL plugin";
  77. const char plugin_type[] = "jobcomp/pgsql";
  78. const uint32_t plugin_version = 100;
  79. PGconn *jobcomp_pgsql_db = NULL;
  80. char *jobcomp_table = "jobcomp_table";
  81. storage_field_t jobcomp_table_fields[] = {
  82. { "jobid", "integer not null" },
  83. { "uid", "smallint not null" },
  84. { "user_name", "text not null" },
  85. { "gid", "smallint not null" },
  86. { "group_name", "text not null" },
  87. { "name", "text not null" },
  88. { "state", "smallint not null" },
  89. { "partition", "text not null" },
  90. { "timelimit", "text not null" },
  91. { "starttime", "bigint default 0 not null" },
  92. { "endtime", "bigint default 0 not null" },
  93. { "nodelist", "text" },
  94. { "nodecnt", "integer not null" },
  95. { "proc_cnt", "integer not null" },
  96. { "connect_type", "text" },
  97. { "reboot", "text" },
  98. { "rotate", "text" },
  99. { "maxprocs", "integer default 0 not null" },
  100. { "geometry", "text" },
  101. { "start", "text" },
  102. { "blockid", "text" },
  103. { NULL, NULL}
  104. };
  105. /* Type for error string table entries */
  106. typedef struct {
  107. int xe_number;
  108. char *xe_message;
  109. } slurm_errtab_t;
  110. static slurm_errtab_t slurm_errtab[] = {
  111. {0, "No error"},
  112. {-1, "Unspecified error"}
  113. };
  114. /* A plugin-global errno. */
  115. static int plugin_errno = SLURM_SUCCESS;
  116. /* File descriptor used for logging */
  117. static pthread_mutex_t jobcomp_lock = PTHREAD_MUTEX_INITIALIZER;
  118. static pgsql_db_info_t *_pgsql_jobcomp_create_db_info()
  119. {
  120. pgsql_db_info_t *db_info = xmalloc(sizeof(pgsql_db_info_t));
  121. db_info->port = slurm_get_jobcomp_port();
  122. /* it turns out it is better if using defaults to let postgres
  123. handle them on it's own terms */
  124. if(!db_info->port) {
  125. db_info->port = DEFAULT_PGSQL_PORT;
  126. slurm_set_jobcomp_port(db_info->port);
  127. }
  128. db_info->host = slurm_get_jobcomp_host();
  129. db_info->user = slurm_get_jobcomp_user();
  130. db_info->pass = slurm_get_jobcomp_pass();
  131. return db_info;
  132. }
  133. static int _pgsql_jobcomp_check_tables(char *user)
  134. {
  135. int i = 0, job_found = 0;
  136. PGresult *result = NULL;
  137. char *query = xstrdup_printf("select tablename from pg_tables "
  138. "where tableowner='%s' "
  139. "and tablename !~ '^pg_+'", user);
  140. if(!(result =
  141. pgsql_db_query_ret(jobcomp_pgsql_db, query))) {
  142. xfree(query);
  143. return SLURM_ERROR;
  144. }
  145. xfree(query);
  146. for (i = 0; i < PQntuples(result); i++) {
  147. if(!job_found
  148. && !strcmp(jobcomp_table, PQgetvalue(result, i, 0)))
  149. job_found = 1;
  150. }
  151. PQclear(result);
  152. if(!job_found)
  153. if(pgsql_db_create_table(jobcomp_pgsql_db, "public", jobcomp_table,
  154. jobcomp_table_fields,
  155. ")") == SLURM_ERROR)
  156. return SLURM_ERROR;
  157. return SLURM_SUCCESS;
  158. }
  159. /* get the user name for the give user_id */
  160. static char *_get_user_name(uint32_t user_id)
  161. {
  162. static uint32_t cache_uid = 0;
  163. static char cache_name[32] = "root", *uname;
  164. char *ret_name = NULL;
  165. slurm_mutex_lock(&jobcomp_lock);
  166. if (user_id != cache_uid) {
  167. uname = uid_to_string((uid_t) user_id);
  168. snprintf(cache_name, sizeof(cache_name), "%s", uname);
  169. xfree(uname);
  170. cache_uid = user_id;
  171. }
  172. ret_name = xstrdup(cache_name);
  173. slurm_mutex_unlock(&jobcomp_lock);
  174. return ret_name;
  175. }
  176. /* get the group name for the give group_id */
  177. static char *_get_group_name(uint32_t group_id)
  178. {
  179. static uint32_t cache_gid = 0;
  180. static char cache_name[32] = "root", *gname;
  181. char *ret_name = NULL;
  182. slurm_mutex_lock(&jobcomp_lock);
  183. if (group_id != cache_gid) {
  184. gname = gid_to_string((gid_t) group_id);
  185. snprintf(cache_name, sizeof(cache_name), "%s", gname);
  186. xfree(gname);
  187. cache_gid = group_id;
  188. }
  189. ret_name = xstrdup(cache_name);
  190. slurm_mutex_unlock(&jobcomp_lock);
  191. return ret_name;
  192. }
  193. /*
  194. * Linear search through table of errno values and strings,
  195. * returns NULL on error, string on success.
  196. */
  197. static char *_lookup_slurm_api_errtab(int errnum)
  198. {
  199. char *res = NULL;
  200. int i;
  201. for (i = 0; i < sizeof(slurm_errtab) / sizeof(slurm_errtab_t); i++) {
  202. if (slurm_errtab[i].xe_number == errnum) {
  203. res = slurm_errtab[i].xe_message;
  204. break;
  205. }
  206. }
  207. return res;
  208. }
  209. /*
  210. * init() is called when the plugin is loaded, before any other functions
  211. * are called. Put global initialization here.
  212. */
  213. extern int init ( void )
  214. {
  215. static int first = 1;
  216. if(first) {
  217. /* since this can be loaded from many different places
  218. only tell us once. */
  219. verbose("%s loaded", plugin_name);
  220. first = 0;
  221. } else {
  222. debug4("%s loaded", plugin_name);
  223. }
  224. return SLURM_SUCCESS;
  225. }
  226. extern int fini ( void )
  227. {
  228. if (jobcomp_pgsql_db) {
  229. PQfinish(jobcomp_pgsql_db);
  230. jobcomp_pgsql_db = NULL;
  231. }
  232. return SLURM_SUCCESS;
  233. }
  234. extern int slurm_jobcomp_set_location(char *location)
  235. {
  236. pgsql_db_info_t *db_info = _pgsql_jobcomp_create_db_info();
  237. int rc = SLURM_SUCCESS;
  238. char *db_name = NULL;
  239. int i = 0;
  240. if(jobcomp_pgsql_db && PQstatus(jobcomp_pgsql_db) == CONNECTION_OK)
  241. return SLURM_SUCCESS;
  242. if(!location)
  243. db_name = slurm_get_jobcomp_loc();
  244. else {
  245. while(location[i]) {
  246. if(location[i] == '.' || location[i] == '/') {
  247. debug("%s doesn't look like a database "
  248. "name using %s",
  249. location, DEFAULT_JOB_COMP_DB);
  250. break;
  251. }
  252. i++;
  253. }
  254. if(location[i])
  255. db_name = xstrdup(DEFAULT_JOB_COMP_DB);
  256. else
  257. db_name = xstrdup(location);
  258. }
  259. debug2("pgsql_connect() called for db %s", db_name);
  260. pgsql_get_db_connection(&jobcomp_pgsql_db, db_name, db_info);
  261. xfree(db_name);
  262. rc = _pgsql_jobcomp_check_tables(db_info->user);
  263. destroy_pgsql_db_info(db_info);
  264. if(rc == SLURM_SUCCESS)
  265. debug("Jobcomp database init finished");
  266. else
  267. debug("Jobcomp database init failed");
  268. return rc;
  269. }
  270. extern int slurm_jobcomp_log_record(struct job_record *job_ptr)
  271. {
  272. int rc = SLURM_SUCCESS;
  273. char *usr_str = NULL, *grp_str = NULL, lim_str[32];
  274. char *connect_type = NULL, *reboot = NULL, *rotate = NULL,
  275. *geometry = NULL, *start = NULL,
  276. *blockid = NULL;
  277. enum job_states job_state;
  278. char *query = NULL;
  279. uint32_t time_limit, start_time, end_time;
  280. if(!jobcomp_pgsql_db || PQstatus(jobcomp_pgsql_db) != CONNECTION_OK) {
  281. char *loc = slurm_get_jobcomp_loc();
  282. if(slurm_jobcomp_set_location(loc) == SLURM_ERROR) {
  283. xfree(loc);
  284. return SLURM_ERROR;
  285. }
  286. xfree(loc);
  287. }
  288. usr_str = _get_user_name(job_ptr->user_id);
  289. grp_str = _get_group_name(job_ptr->group_id);
  290. if ((job_ptr->time_limit == NO_VAL) && job_ptr->part_ptr)
  291. time_limit = job_ptr->part_ptr->max_time;
  292. else
  293. time_limit = job_ptr->time_limit;
  294. if (time_limit == INFINITE)
  295. strcpy(lim_str, "UNLIMITED");
  296. else {
  297. snprintf(lim_str, sizeof(lim_str), "%lu",
  298. (unsigned long) time_limit);
  299. }
  300. /* Job will typically be COMPLETING when this is called.
  301. * We remove the flags to get the eventual completion state:
  302. * JOB_FAILED, JOB_TIMEOUT, etc. */
  303. if (IS_JOB_RESIZING(job_ptr)) {
  304. job_state = JOB_RESIZING;
  305. if (job_ptr->resize_time)
  306. start_time = job_ptr->resize_time;
  307. else
  308. start_time = job_ptr->start_time;
  309. end_time = time(NULL);
  310. } else {
  311. job_state = job_ptr->job_state & JOB_STATE_BASE;
  312. if (job_ptr->resize_time)
  313. start_time = job_ptr->resize_time;
  314. else if (job_ptr->start_time > job_ptr->end_time) {
  315. /* Job cancelled while pending and
  316. * expected start time is in the future. */
  317. start_time = 0;
  318. } else
  319. start_time = job_ptr->start_time;
  320. end_time = job_ptr->end_time;
  321. }
  322. connect_type = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo,
  323. SELECT_PRINT_CONNECTION);
  324. reboot = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo,
  325. SELECT_PRINT_REBOOT);
  326. rotate = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo,
  327. SELECT_PRINT_ROTATE);
  328. geometry = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo,
  329. SELECT_PRINT_GEOMETRY);
  330. start = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo,
  331. SELECT_PRINT_START);
  332. #ifdef HAVE_BG
  333. blockid = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo,
  334. SELECT_PRINT_BG_ID);
  335. #else
  336. blockid = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo,
  337. SELECT_PRINT_RESV_ID);
  338. #endif
  339. query = xstrdup_printf(
  340. "insert into %s (jobid, uid, user_name, gid, group_name, "
  341. "name, state, proc_cnt, partition, timelimit, "
  342. "starttime, endtime, nodecnt",
  343. jobcomp_table);
  344. if(job_ptr->nodes)
  345. xstrcat(query, ", nodelist");
  346. if(connect_type)
  347. xstrcat(query, ", connect_type");
  348. if(reboot)
  349. xstrcat(query, ", reboot");
  350. if(rotate)
  351. xstrcat(query, ", rotate");
  352. if(job_ptr->details && (job_ptr->details->max_cpus != NO_VAL))
  353. xstrcat(query, ", maxprocs");
  354. if(geometry)
  355. xstrcat(query, ", geometry");
  356. if(start)
  357. xstrcat(query, ", start");
  358. if(blockid)
  359. xstrcat(query, ", blockid");
  360. xstrfmtcat(query, ") values (%u, %u, '%s', %u, '%s', \"%s\", %d, %u, "
  361. "'%s', \"%s\", %u, %u, %u",
  362. job_ptr->job_id, job_ptr->user_id, usr_str,
  363. job_ptr->group_id, grp_str, job_ptr->name,
  364. job_state, job_ptr->total_cpus, job_ptr->partition, lim_str,
  365. start_time, end_time, job_ptr->node_cnt);
  366. if(job_ptr->nodes)
  367. xstrfmtcat(query, ", '%s'", job_ptr->nodes);
  368. if(connect_type) {
  369. xstrfmtcat(query, ", '%s'", connect_type);
  370. xfree(connect_type);
  371. }
  372. if(reboot) {
  373. xstrfmtcat(query, ", '%s'", reboot);
  374. xfree(reboot);
  375. }
  376. if(rotate) {
  377. xstrfmtcat(query, ", '%s'", rotate);
  378. xfree(rotate);
  379. }
  380. if(job_ptr->details && (job_ptr->details->max_cpus != NO_VAL))
  381. xstrfmtcat(query, ", '%u'", job_ptr->details->max_cpus);
  382. if(geometry) {
  383. xstrfmtcat(query, ", '%s'", geometry);
  384. xfree(geometry);
  385. }
  386. if(start) {
  387. xstrfmtcat(query, ", '%s'", start);
  388. xfree(start);
  389. }
  390. if(blockid) {
  391. xstrfmtcat(query, ", '%s'", blockid);
  392. xfree(blockid);
  393. }
  394. xstrcat(query, ")");
  395. //info("here is the query %s", query);
  396. rc = pgsql_db_query(jobcomp_pgsql_db, query);
  397. xfree(usr_str);
  398. return rc;
  399. }
  400. extern int slurm_jobcomp_get_errno()
  401. {
  402. return plugin_errno;
  403. }
  /*
   * Translate errnum into a message: the plugin's own table is consulted
   * first, and strerror() is the fallback for numbers it does not list.
   */
  extern char *slurm_jobcomp_strerror(int errnum)
  {
      char *msg = _lookup_slurm_api_errtab(errnum);

      if (msg == NULL)
          msg = strerror(errnum);

      return msg;
  }
  409. /*
  410. * get info from the storage
  411. * in/out job_list List of job_rec_t *
  412. * note List needs to be freed when called
  413. */
  414. extern List slurm_jobcomp_get_jobs(slurmdb_job_cond_t *job_cond)
  415. {
  416. List job_list = NULL;
  417. if(!jobcomp_pgsql_db || PQstatus(jobcomp_pgsql_db) != CONNECTION_OK) {
  418. char *loc = slurm_get_jobcomp_loc();
  419. if(slurm_jobcomp_set_location(loc) == SLURM_ERROR) {
  420. xfree(loc);
  421. return NULL;
  422. }
  423. xfree(loc);
  424. }
  425. job_list = pgsql_jobcomp_process_get_jobs(job_cond);
  426. return job_list;
  427. }
  428. /*
  429. * expire old info from the storage
  430. */
  431. extern int slurm_jobcomp_archive(slurmdb_archive_cond_t *arch_cond)
  432. {
  433. if(!jobcomp_pgsql_db || PQstatus(jobcomp_pgsql_db) != CONNECTION_OK) {
  434. char *loc = slurm_get_jobcomp_loc();
  435. if(slurm_jobcomp_set_location(loc) == SLURM_ERROR) {
  436. xfree(loc);
  437. return SLURM_ERROR;
  438. }
  439. xfree(loc);
  440. }
  441. return pgsql_jobcomp_process_archive(arch_cond);
  442. }