/* $Id: dcm.pc 3970 2010-01-31 05:52:04Z zacheiss $
 *
 * The Data Control Manager for Moira.
 *
 * Copyright (C) 1987-1998 by the Massachusetts Institute of Technology.
 * For copying and distribution information, see the file
 * <mit-copyright.h>.
 */

#include <mit-copyright.h>
#include <moira.h>
#include <moira_site.h>
#include <moira_schema.h>
#include "update.h"

#include <sys/param.h>
#include <sys/stat.h>
#include <sys/wait.h>

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

EXEC SQL INCLUDE sqlca;
void sqlglm(char *, unsigned int *, unsigned int *);

RCSID("$HeadURL: svn+ssh://svn.mit.edu/moira/trunk/moira/dcm/dcm.pc $ $Id: dcm.pc 3970 2010-01-31 05:52:04Z zacheiss $");

/* Forward declarations for the routines defined later in this file. */
int generate_service(char *name, int force);
void do_hosts(char *service);
int dcm_send_file(char *service, int type, char *host, char *target);
int dcm_execute(char *service, char *host, char *script);
void dbmserr(void);

/* sqlca.sqlcode value this file compares against to detect "no row
 * returned" from a SELECT or FETCH.
 */
#define SQL_NO_MATCH 1403
/* True for the status codes this file treats as "soft" failures: the
 * error message is recorded, but the harderror/hosterror column is not set.
 */
#define SOFT_FAIL(x) (((x) == MR_NO_MEM) || ((x) == MR_CANT_CONNECT) || ((x) == MR_CCONFIG) || ((x) == MR_DEADLOCK) || ((x) == MR_BUSY) || ((x) == MR_ABORT))

/* argument parsing macro: compares the current argument, minus its
 * leading character, against a short and a long option name.  Relies on
 * a local variable `arg' (char **) being in scope at the point of use.
 */
#define argis(a, b) (!strcmp(*arg + 1, a) || !strcmp(*arg + 1, b))

/* whoami is the tag passed to com_err(); it points into whobuf.
 * db is used as both the connect name and the password in every
 * EXEC SQL CONNECT in this file.
 */
char whobuf[256], *whoami = whobuf, *db = "moira";

/* Service distribution types, derived from servers.type in do_hosts(). */
enum { UNIQUE, DISTRIBUTED, REPLICATED };

int main(int argc, char **argv)
{
  int i, force = 0;
  EXEC SQL BEGIN DECLARE SECTION;
  char buf[SERVERS_NAME_SIZE], *name;
  int enable;
  EXEC SQL END DECLARE SECTION;
  struct save_queue *sq;
  int status, srvcnt = 0;
  char **arg = argv, *services[BUFSIZ];

  if (strchr(argv[0], '/'))
    strcpy(whoami, strrchr(argv[0], '/') + 1);
  else strcpy(whoami, argv[0]);
  umask(7);

  setvbuf(stderr, NULL, _IOLBF, BUFSIZ);
  setvbuf(stdout, NULL, _IOLBF, BUFSIZ);

  initialize_sms_error_table();
  initialize_krb_error_table();

  while (++arg - argv < argc)
    {
      if (**arg == '-')
	{
	  if (argis("f", "force"))
	    force++;
	  else
	    {
	      com_err(whoami, 0, "Usage: %s [-f] servicename", argv[0]);
	      exit(1);
	    }
	}
      else
	/* Doesn't begin with a dash, is a service name.
	 * Build an array of them we can iterate through later.
	 */
	{
	  services[srvcnt] = malloc(SERVERS_NAME_SIZE);
	  if (!services[srvcnt])
	    {
	      com_err(whoami, 0, "Out of memory!");
	      exit(1);
	    }
	  strncpy(services[srvcnt], *arg, SERVERS_NAME_SIZE);
	  srvcnt++;
	}
    }

  /* Iterate through services specified on the command line, if any. */
  if (srvcnt > 0)
    {
      for (i = 0; i < srvcnt; i++)
	{
	  if (generate_service(services[i], force))
	    {
	      do_hosts(services[i]);
	      free(services[i]);
	    }
	}
      exit(0);
    }

  /* if DCM is not enabled, exit after logging */
  if (!access(NODCMFILE, F_OK))
    {
      printf("/etc/nodcm exists -- exiting\n");
      exit(1);
    }

  EXEC SQL WHENEVER SQLERROR DO dbmserr();

  EXEC SQL CONNECT :db IDENTIFIED BY :db;

  EXEC SQL SELECT value INTO :enable FROM numvalues WHERE name = 'dcm_enable';
  if (enable == 0)
    {
      printf("dcm_enable not set -- exiting\n");
      exit(1);
    }

  /* fetch list of services */
  EXEC SQL DECLARE csr_svc CURSOR FOR SELECT LOWER(name) FROM servers
    WHERE enable = 1 AND harderror = 0 AND update_int > 0;
  EXEC SQL OPEN csr_svc;
  sq = sq_create();
  while (1)
    {
      EXEC SQL FETCH csr_svc INTO :buf;
      if (sqlca.sqlcode)
	break;

      sq_save_data(sq, strdup(strtrim(buf)));
    }
  EXEC SQL CLOSE csr_svc;
  /* we will repeatedly open and close the db since it seems to get
     upset if you keep it open across a fork */
  EXEC SQL COMMIT RELEASE;

  /* Now run through list */
  while (sq_get_data(sq, &name))
    {
      if (generate_service(name, force))
	{
	  switch (fork())
	    {
	    case -1:
	      com_err(whoami, errno, "forking for service %s -- exiting",
		      name);
	      exit(1);
	    case 0:
	      sprintf(strchr(whoami, '\0'), " (%s:%ld)", name, (long)getpid());
	      do_hosts(name);
	      com_err(whoami, 0, "exiting");
	      exit(0);
	    default:
	      break;
	    }
	}
    }

  com_err(whoami, 0, "All files generated. Waiting for children to exit");
  while (waitpid(0, &status, 0) > 0)
    ;
  com_err(whoami, 0, "exiting");
  exit(0);
}

/*
 * Run the data file generator for one service if it is due.
 *
 * name:  service name; matched against servers.name via UPPER().
 * force: non-zero to run the generator even if the update interval
 *        has not yet elapsed.
 *
 * Connects to the database, and unless the service is unknown or already
 * marked inprogress, runs BIN_DIR/<name>.gen with DCM_DIR/<name>.out as
 * its argument, then records the outcome in the servers table.
 *
 * Returns 1 only when the generator ran and exited with status 0 (new
 * data files exist and should be pushed to hosts); returns 0 in every
 * other case.  A database error after the inprogress flag has been set
 * is handled at gen_cleanup, which does not return (dbmserr() exits).
 */
int generate_service(char *name, int force)
{
  EXEC SQL BEGIN DECLARE SECTION;
  int interval, dfcheck, status, inprogress;
  time_t now;
  const char *errmsg;
  EXEC SQL END DECLARE SECTION;
  char dfgen_prog[MAXPATHLEN], dfgen_cmd[2 * MAXPATHLEN];
  struct sigaction action, prevaction;
  int waits, len;

  EXEC SQL CONNECT :db IDENTIFIED BY :db;

  EXEC SQL SELECT update_int, dfcheck, inprogress INTO :interval, :dfcheck,
    :inprogress FROM servers WHERE name = UPPER(:name);
  if (sqlca.sqlcode == SQL_NO_MATCH)
    {
      com_err(whoami, 0, "No such service `%s'", name);
      EXEC SQL COMMIT RELEASE;
      return 0;
    }

  /* Someone might try to run a DCM from the command line while the
   * regular one is running, which will bypass the "interval" test.
   * Check inprogress to make sure they don't stomp on themselves.
   *
   * Note that there is still a race condition here, and this doesn't
   * absolutely prevent 2 DCMs from stepping on one another, but it
   * does reduce the window of vulnerability greatly.
   */
  if (inprogress == 1)
    {
      com_err(whoami, 0, "DCM for service `%s' already in progress", name);
      EXEC SQL COMMIT RELEASE;
      return 0;
    }

  time(&now);

  /* update_int is in minutes; dfcheck is the time of the last check. */
  if ((interval * 60 + dfcheck < now) || force)
    {
      len = snprintf(dfgen_prog, sizeof(dfgen_prog), "%s/%s.gen",
		     BIN_DIR, name);
      if (len < 0 || (size_t)len >= sizeof(dfgen_prog))
	{
	  com_err(whoami, 0, "generator path for `%s' is too long", name);
	  EXEC SQL COMMIT RELEASE;
	  return 0;
	}
      if (access(dfgen_prog, F_OK) != 0)
	{
	  com_err(whoami, 0, "prog %s doesn't exist", dfgen_prog);
	  EXEC SQL COMMIT RELEASE;
	  return 0;
	}
      len = snprintf(dfgen_cmd, sizeof(dfgen_cmd), "exec %s %s/%s.out",
		     dfgen_prog, DCM_DIR, name);
      if (len < 0 || (size_t)len >= sizeof(dfgen_cmd))
	{
	  com_err(whoami, 0, "generator command for `%s' is too long", name);
	  EXEC SQL COMMIT RELEASE;
	  return 0;
	}
      com_err(whoami, 0, "running %s", dfgen_prog);

      /* From here on inprogress is (about to be) set, so a database
       * error must go through gen_cleanup to clear it again.
       */
      EXEC SQL WHENEVER SQLERROR GOTO gen_cleanup;

      EXEC SQL UPDATE servers SET inprogress = 1
	WHERE name = UPPER(:name);
      EXEC SQL COMMIT;

      /* Use the default SIGCHLD disposition while system() runs, then
       * put back whatever was installed before.
       */
      action.sa_flags = 0;
      sigemptyset(&action.sa_mask);
      action.sa_handler = SIG_DFL;
      sigaction(SIGCHLD, &action, &prevaction);
      waits = system(dfgen_cmd);
      sigaction(SIGCHLD, &prevaction, NULL);
      if (WIFSIGNALED(waits))
	{
	  status = MR_COREDUMP;
	  com_err(whoami, status, " %s exited on signal %d",
		  dfgen_prog, WTERMSIG(waits));
	}
      else if (WEXITSTATUS(waits))
	{
	  /* extract the process's exit value */
	  status = WEXITSTATUS(waits) + ERROR_TABLE_BASE_sms;
	  if (status != MR_NO_CHANGE)
	    com_err(whoami, status, "in %s", dfgen_prog);
	}
      else
	status = MR_SUCCESS;

      if (status == MR_SUCCESS)
	{
	  EXEC SQL UPDATE servers SET dfgen = :now, dfcheck = :now,
	    inprogress = 0 WHERE name = UPPER(:name);
	  EXEC SQL COMMIT RELEASE;
	  return 1;
	}
      else if (status == MR_NO_CHANGE)
	{
	  EXEC SQL UPDATE servers SET dfcheck = :now, inprogress = 0
	    WHERE name = UPPER(:name);
	}
      else if (SOFT_FAIL(status))
	{
	  errmsg = error_message(status);
	  EXEC SQL UPDATE servers SET errmsg = :errmsg, inprogress = 0
	    WHERE name = UPPER(:name);
	}
      else /* HARD_FAIL(status) */
	{
	  errmsg = error_message(status);
	  EXEC SQL UPDATE servers SET harderror = :status, errmsg = :errmsg,
	    inprogress = 0 WHERE name = UPPER(:name);
	  critical_alert(whoami, "DCM", "DCM building config files for %s: %s",
			 name, errmsg);
	}
    }
  else
    {
      com_err(whoami, 0, "DCM for service `%s' has run too recently.", name);
      com_err(whoami, 0, "Use the -force flag to force a DCM.");
    }

  EXEC SQL COMMIT RELEASE;
  return 0;

gen_cleanup:
  /* WHENEVER is scoped by position in the file, so switch back to
   * dbmserr() here; otherwise a failure below would jump to this label
   * again.
   */
  EXEC SQL WHENEVER SQLERROR DO dbmserr();
  /* Pass the error code through a host variable: the precompiler does
   * not expand C macros inside SQL statement text.
   */
  status = MR_INTERNAL;
  EXEC SQL UPDATE servers SET inprogress = 0, harderror = :status,
    errmsg = 'DBMS Internal Error' WHERE name = UPPER(:name);
  /* Commit so the cleared inprogress flag survives the exit below. */
  EXEC SQL COMMIT;
  dbmserr();
  return 0; /* not reached: dbmserr() exits */
}

/*
 * Propagate a service's generated data to its hosts.
 *
 * service: service name; matched against the database via UPPER().
 *
 * Works in two passes over serverhosts:
 *   1. every enabled, error-free host whose last successful update
 *      (lts) is older than the service's dfgen is marked inprogress
 *      and sent the data file with dcm_send_file();
 *   2. every host still marked inprogress is sent the install script
 *      and told to run it with dcm_execute(), and the result recorded.
 *
 * For REPLICATED services the first failure in either pass stops that
 * pass, pass 2 is skipped if pass 1 failed, all inprogress flags for the
 * service are cleared afterwards, and a hard failure is also recorded on
 * the servers row.
 *
 * Database errors raised during the two passes jump to host_cleanup,
 * which records an internal error for the current host and returns.
 */
void do_hosts(char *service)
{
  EXEC SQL BEGIN DECLARE SECTION;
  char server_type[SERVERS_TYPE_SIZE], host[MACHINE_NAME_SIZE], *name;
  char target[SERVERS_TARGET_FILE_SIZE], script[SERVERS_SCRIPT_SIZE];
  const char *errmsg;
  int status = 0, dfgen, type, mid = 0, inprogress;
  time_t now;
  EXEC SQL END DECLARE SECTION;
  struct save_queue *sq;
  void *mid_data;

  time(&now);
  mr_init();

  EXEC SQL CONNECT :db IDENTIFIED BY :db;

  EXEC SQL SELECT dfgen, type, target_file, script, inprogress
    INTO :dfgen, :server_type, :target, :script, :inprogress
    FROM servers WHERE name = UPPER(:service);
  if (!strncmp(strtrim(server_type), "REPLICAT", 8))
    type = REPLICATED;
  /* "DISTRIB" is 7 characters; comparing 8 would include the
   * terminating NUL and so never match "DISTRIBUTED".
   */
  else if (!strncmp(server_type, "DISTRIB", 7))
    type = DISTRIBUTED;
  else
    type = UNIQUE;
  strtrim(target);
  strtrim(script);

  /* Rudimentary locking.  Doesn't eliminate the possibility of 2 DCMs
   * stepping on one another, but makes it harder.
   */
  if (inprogress == 1)
    {
      com_err(whoami, 0, "DCM for service `%s' already in progress", service);
      EXEC SQL COMMIT RELEASE;
      return;
    }

  /* Pass 1: collect the hosts that are out of date... */
  EXEC SQL DECLARE csr_hst1 CURSOR FOR
    SELECT m.name, m.mach_id FROM machine m, serverhosts sh
    WHERE sh.service = UPPER(:service)
    AND sh.enable = 1 AND sh.hosterror = 0
    AND sh.lts < :dfgen AND sh.mach_id = m.mach_id;
  EXEC SQL OPEN csr_hst1;
  sq = sq_create();
  while (1)
    {
      EXEC SQL FETCH csr_hst1 INTO :host, :mid;
      if (sqlca.sqlcode == SQL_NO_MATCH)
	break;

      /* Queue entries come in (name, mach_id) pairs. */
      sq_save_data(sq, strdup(strtrim(host)));
      sq_save_data(sq, (void *)(long)mid);
    }
  EXEC SQL CLOSE csr_hst1;

  /* ...and send each of them the data file. */
  EXEC SQL WHENEVER SQLERROR GOTO host_cleanup;
  while (sq_get_data(sq, &name))
    {
      /* The queue stores pointer-sized items (the name above is read
       * back the same way), so fetch into a pointer and narrow it,
       * rather than letting sq_get_data() write through an int.
       */
      sq_get_data(sq, &mid_data);
      mid = (int)(long)mid_data;

      EXEC SQL SELECT inprogress INTO :inprogress FROM serverhosts
	WHERE service = UPPER(:service) AND mach_id = :mid;
      /* Check if someone got here before we did.
       * There's still a race condition here, but it's a small one. */
      if (inprogress == 1)
	{
	  com_err(whoami, 0, "DCM for service `%s' to host `%s' already in progress", service, name);
	  EXEC SQL COMMIT RELEASE;
	  return;
	}

      com_err(whoami, 0, "sending %s data to %s", service, name);
      EXEC SQL UPDATE serverhosts SET inprogress = 1
	WHERE service = UPPER(:service) AND mach_id = :mid;
      EXEC SQL COMMIT;
      status = dcm_send_file(service, type, name, target);
      if (status)
	{
	  errmsg = error_message(status);
	  EXEC SQL UPDATE serverhosts SET hosterrmsg = :errmsg,
	    inprogress = 0, success = 0, ltt = :now
	    WHERE service = UPPER(:service) AND mach_id = :mid;
	  if (!SOFT_FAIL(status))
	    {
	      EXEC SQL UPDATE serverhosts SET hosterror = :status
		WHERE service = UPPER(:service) AND mach_id = :mid;
	      critical_alert(whoami, "DCM", "DCM updating %s:%s: %s",
			     service, name, errmsg);
	    }
	  EXEC SQL COMMIT;

	  if (type == REPLICATED)
	    break;
	}
    }
  sq_destroy(sq);

  /* Pass 2: run the install script on every host that received data.
   * Skipped for a replicated service whose send pass failed.
   */
  if (status == MR_SUCCESS || type != REPLICATED)
    {
      EXEC SQL DECLARE csr_hst2 CURSOR FOR
	SELECT m.name, m.mach_id FROM machine m, serverhosts sh
	WHERE sh.service = UPPER(:service) AND sh.inprogress = 1
	AND sh.enable = 1 AND sh.hosterror = 0 AND sh.mach_id = m.mach_id;
      EXEC SQL OPEN csr_hst2;
      sq = sq_create();

      while (1)
	{
	  EXEC SQL FETCH csr_hst2 INTO :host, :mid;
	  if (sqlca.sqlcode == SQL_NO_MATCH)
	    break;

	  sq_save_data(sq, strdup(strtrim(host)));
	  sq_save_data(sq, (void *)(long)mid);
	}
      /* The cursor is closed once here; it is not reopened below. */
      EXEC SQL CLOSE csr_hst2;

      while (sq_get_data(sq, &name))
	{
	  sq_get_data(sq, &mid_data);
	  mid = (int)(long)mid_data;

	  com_err(whoami, 0, "executing instructions on %s", name);
	  status = dcm_execute(service, name, script);
	  if (status)
	    {
	      errmsg = error_message(status);
	      EXEC SQL UPDATE serverhosts SET hosterrmsg = :errmsg,
		inprogress = 0, success = 0, ltt = :now
		WHERE service = UPPER(:service) AND mach_id = :mid;
	      if (!SOFT_FAIL(status))
		{
		  EXEC SQL UPDATE serverhosts SET hosterror = :status
		    WHERE service = UPPER(:service) AND mach_id = :mid;
		  critical_alert(whoami, "DCM", "DCM updating %s:%s: %s",
				 service, name, errmsg);
		}

	      /* The pending updates are committed by the COMMIT RELEASE
	       * at the end of the function.
	       */
	      if (type == REPLICATED)
		break;
	    }
	  else
	    {
	      EXEC SQL UPDATE serverhosts SET inprogress = 0, ltt = :now,
		lts = :now, success = 1 WHERE service = UPPER(:service)
		AND mach_id = :mid;
	    }
	  EXEC SQL COMMIT;
	}
    }

  if (type == REPLICATED)
    {
      /* Clear inprogress flag on any hosts we started but didn't
       * finish.
       */
      EXEC SQL UPDATE serverhosts SET inprogress = 0
	WHERE service = UPPER(:service);
    }

  EXEC SQL WHENEVER SQLERROR DO dbmserr();
  if (status && !SOFT_FAIL(status) && type == REPLICATED)
    {
      EXEC SQL UPDATE servers SET harderror = :status, errmsg = :errmsg
	WHERE name = UPPER(:service);
    }

  EXEC SQL COMMIT RELEASE;
  return;

host_cleanup:
  /* WHENEVER is scoped by position in the file, so the statements below
   * fall under the "DO dbmserr()" directive just above: a failure here
   * terminates instead of jumping back to this label.
   *
   * The error code goes through a host variable because the precompiler
   * does not expand C macros inside SQL statement text.
   */
  status = MR_INTERNAL;
  EXEC SQL UPDATE serverhosts SET inprogress = 0, success = 0, ltt = :now,
    hosterror = :status, hosterrmsg = 'DBMS Internal Error'
    WHERE service = UPPER(:service) AND mach_id = :mid;
  if (type == REPLICATED)
    {
      EXEC SQL UPDATE servers SET harderror = :status,
	errmsg = 'DBMS Internal Error' WHERE name = UPPER(:service);
    }
  /* Make the recorded error durable and disconnect, as every other
   * exit path of this function does.
   */
  EXEC SQL COMMIT RELEASE;
}

/*
 * Send a service's generated data file to one host.
 *
 * service: service name, used to build the local data file path.
 * type:    UNIQUE, DISTRIBUTED or REPLICATED.  For DISTRIBUTED the file
 *          sent is DCM_DIR/<service>/<host>; otherwise it is
 *          DCM_DIR/<service>.out.
 * host:    host to connect to (on the "moira_update" port).
 * target:  destination path handed to mr_send_file().
 *
 * Returns 0 on success, MR_CANT_CONNECT if the connection could not be
 * made, ENAMETOOLONG if the local path does not fit in MAXPATHLEN, or
 * the error code from authentication or mr_send_file().
 */
int dcm_send_file(char *service, int type, char *host, char *target)
{
  char data[MAXPATHLEN];
  int code, conn, len;

  conn = mr_connect_internal(host, "moira_update");
  if (!conn)
    {
      com_err(whoami, errno, "can't connect to %s", host);
      return MR_CANT_CONNECT;
    }

  /* Try krb5 authentication first, then fall back to mr_send_auth(). */
  code = mr_send_krb5_auth(conn, host);
  if (code)
    code = mr_send_auth(conn, host);
  if (code)
    {
      com_err(whoami, code, "authenticating to %s", host);
      goto done;
    }

  if (type == DISTRIBUTED)
    len = snprintf(data, sizeof(data), "%s/%s/%s", DCM_DIR, service, host);
  else
    len = snprintf(data, sizeof(data), "%s/%s.out", DCM_DIR, service);
  /* Refuse to send from a truncated path rather than the wrong file. */
  if (len < 0 || (size_t)len >= sizeof(data))
    {
      code = ENAMETOOLONG;
      com_err(whoami, code, "building data file path for %s", host);
      goto done;
    }

  code = mr_send_file(conn, data, target, 0);
  if (code)
    com_err(whoami, code, "sending data to %s", host);

done:
  mr_send_quit(conn);
  close(conn);
  return code;
}

/*
 * Send a service's install script to one host and have it executed.
 *
 * service: service name (not used by this routine).
 * host:    host to connect to (on the "moira_update" port).
 * script:  local path of the script to send.
 *
 * The script is sent under a freshly generated /tmp/moira-update.XXXXXX
 * name, which is then passed to mr_execute().
 *
 * Returns 0 on success, MR_CANT_CONNECT if the connection could not be
 * made, or the error code from authentication, mr_send_file() or
 * mr_execute().
 */
int dcm_execute(char *service, char *host, char *script)
{
  char inst[MAXPATHLEN];
  int code, conn, fd;

  conn = mr_connect_internal(host, "moira_update");
  if (!conn)
    {
      com_err(whoami, errno, "can't connect to %s", host);
      return MR_CANT_CONNECT;
    }

  /* Try krb5 authentication first, then fall back to mr_send_auth(). */
  code = mr_send_krb5_auth(conn, host);
  if (code)
    code = mr_send_auth(conn, host);
  if (code)
    {
      com_err(whoami, code, "authenticating to %s", host);
      goto done;
    }

  /* mkstemp() is used only to pick a unique name.  It also creates and
   * opens a local file, so close the descriptor and remove the file
   * instead of leaking one of each per host.
   */
  strcpy(inst, "/tmp/moira-update.XXXXXX");
  fd = mkstemp(inst);
  if (fd < 0)
    {
      /* The buffer contents are unspecified after a failure; fall back
       * to the literal template name so the update can still proceed.
       */
      com_err(whoami, errno, "generating temporary name for %s", host);
      strcpy(inst, "/tmp/moira-update.XXXXXX");
    }
  else
    {
      close(fd);
      unlink(inst);
    }

  code = mr_send_file(conn, script, inst, 0);
  if (code)
    {
      com_err(whoami, code, "sending instructions to %s", host);
      goto done;
    }

  code = mr_execute(conn, inst);
  if (code)
    com_err(whoami, code, "executing instructions on %s", host);

done:
  mr_send_quit(conn);
  close(conn);
  return code;
}

/*
 * Fatal database error handler.
 *
 * Fetches the current error message text with sqlglm(), logs it via
 * com_err(), and exits with status 1.  Does not return.  Installed with
 * EXEC SQL WHENEVER SQLERROR DO dbmserr() and also called directly.
 */
void dbmserr(void)
{
  EXEC SQL BEGIN DECLARE SECTION;
  char err_msg[256];
  EXEC SQL END DECLARE SECTION;
  /* Types match the sqlglm() prototype declared in this file.  One byte
   * is held back so there is always room for the terminator.
   */
  unsigned int bufsize = sizeof(err_msg) - 1, msglength = 0;

  sqlglm(err_msg, &bufsize, &msglength);
  if (msglength > sizeof(err_msg) - 1)
    msglength = sizeof(err_msg) - 1;
  err_msg[msglength] = '\0';
  com_err(whoami, 0, "Encountered SQL error:\n%s", err_msg);
  com_err(whoami, 0, "exiting");
  exit(1);
}
