#ifdef PETSC_RCS_HEADER
static char vcid[] = "$Id: mlSerial.c,v 1.12 2000/10/22 20:27:52 knepley Exp $";
#endif
/*
   Defines the serial multilevel preconditioner routines
*/
#include "src/sles/pc/pcimpl.h"    /*I "pc.h" I*/
#include "ml.h"

#undef  __FUNCT__
#define __FUNCT__ "PCMultiLevelDoQR_Private"
PetscTruth PCMultiLevelDoQR_Private(PC pc, int numRows, int numCols)
{
  /* This functions decides whether to QR factorize the matrix U of an SVD.
     This is a memory saving device when B = UDV is long and skinny.
  */
  PC_Multilevel *ml = (PC_Multilevel *) pc->data;

  PetscFunctionBegin;
  if (numRows > (int) numCols*ml->DoQRFactor) {
    PetscFunctionReturn(PETSC_TRUE);
  } else {
    PetscFunctionReturn(PETSC_FALSE);
  }
}

#undef  __FUNCT__
#define __FUNCT__ "PCMultiLevelPartitionMesh_Private"
int PCMultiLevelPartitionMesh_Private(PC pc, int level, int **mesh, int *numParts, int *colPartition)
{
  PC_Multilevel *ml = (PC_Multilevel *) pc->data;
  PetscTruth     change;
  int            numCols, freeCols;
  int            numAdj, newNumAdj, numFound;
  int           *offsets;
  int           *adj;
  int           *sizes;
  int            part, col, newCol, adjCol, newAdjCol;
  int            ierr;

  PetscFunctionBegin;
  ierr = PC_MLLogEventBegin(PC_ML_ReducePartitionMesh, pc, 0, 0, 0);                                      CHKERRQ(ierr);
  if (ml->useMath == PETSC_FALSE) {
    offsets = mesh[MESH_OFFSETS];
    adj     = mesh[MESH_ADJ];
    /* Set initial number of partitions */
    numCols   = ml->numVertices[level];
    *numParts = numCols/ml->partSize;
    /* If the graph is small enough, just return the whole thing */
    if (*numParts == 0) {
      *numParts = 1;
      ierr = PetscMemzero(colPartition, numCols * sizeof(int));                                           CHKERRQ(ierr);
      ierr = PC_MLLogEventEnd(PC_ML_ReducePartitionMesh, pc, 0, 0, 0);                                    CHKERRQ(ierr);
      PetscFunctionReturn(0);
    }
    /* Get connectivity information */
    /* Initialize partition and partition sizes */
    ierr = PetscMalloc((*numParts) * sizeof(int), &sizes);                                                CHKERRQ(ierr);
    for(col = 0; col < numCols; col++) colPartition[col] = -1;
    /* Pick numParts starting points -- Conforms to the Mathematica */
    for(part = 1; part <= *numParts; part++) {
      colPartition[ml->partSize*part-1] = part-1;
      sizes[part-1] = 1;
    }
    /* Start enlarging partitions */
    freeCols = numCols - (*numParts);
    change   = PETSC_TRUE;
    while(freeCols > 0)
    {
      /* Quit if nothing more can be done */
      if (change == PETSC_FALSE)
        break;
      change = PETSC_FALSE;
      /* Add a node to each partition if possible */
      for(part = 0; part < (*numParts); part++)
      {
        /* Look for the column adjacent to the most columns in this partition */
        newCol = -1;
        numAdj = 0;
        for(col = 0, numFound = 0; (col < numCols) && (numFound < sizes[part]); col++)
        {
          if (colPartition[col] == part)
          {
            numFound++;
            /* Look for adjacent columns ignoring self edge */
            for(adjCol = offsets[col]+1; adjCol < offsets[col+1]; adjCol++)
            {
              /* Ignore columns already in a partition */
              if (colPartition[adj[adjCol]] >= 0)
                continue;
              newNumAdj = 0;
              /* Count the number of columns adjacent to this one that are already in the partition */
              for(newAdjCol = offsets[adj[adjCol]]+1; newAdjCol < offsets[adj[adjCol]+1]; newAdjCol++)
                if (colPartition[adj[newAdjCol]] == part)
                  newNumAdj++;
              if ((newNumAdj > numAdj) || ((newNumAdj == numAdj) && (adj[adjCol] < newCol)))
              {
                newCol = adj[adjCol];
                numAdj = newNumAdj;
              }
            }
          }
        }
        if (newCol >= 0)
        {
          colPartition[newCol] = part;
          sizes[part]++;
          freeCols--;
          change = PETSC_TRUE;
        }
      }
    }
    /* Take care of extra columns */
    if (change == PETSC_FALSE) {
      for(col = 0; col < numCols; col++)
        if (colPartition[col] < 0)
          colPartition[col] = (*numParts)++;
    }
    ierr = PetscFree(sizes);                                                                              CHKERRQ(ierr);
  } else {
#ifdef PETSC_HAVE_MATHEMATICA
    ierr = ViewerMathematicaPartitionMesh(ml->mathViewer, ml->numElements[level], ml->numVertices[level],
                                          ml->vertices, mesh, numParts, colPartition);
                                                                                                          CHKERRQ(ierr);
#else
    SETERRQ(PETSC_ERR_SUP, " ");
#endif
  }
  ierr = PC_MLLogEventEnd(PC_ML_ReducePartitionMesh, pc, 0, 0, 0);                                        CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#undef  __FUNCT__
#define __FUNCT__ "PCMultiLevelPartitionCols_Private"
int PCMultiLevelPartitionCols_Private(PC pc, int level, int numParts, int numCols, int *colPartition, int *globalCols)
{
  PC_Multilevel *ml = (PC_Multilevel *) pc->data;
  int            numLocCols;
  int            part, col;
  int            ierr;

  PetscFunctionBegin;
  ierr = PetscMalloc(numParts * sizeof(int), &ml->numPartitionCols[level]);                               CHKERRQ(ierr);
  PetscLogObjectMemory(pc, numParts * sizeof(int));
  ierr = PetscMemzero(ml->numPartitionCols[level], numParts * sizeof(int));                               CHKERRQ(ierr);
  for(col = 0; col < numCols; col++) {
    ml->numPartitionCols[level][colPartition[col]]++;
  }
  ierr = PetscMalloc(numParts * sizeof(int *), &ml->colPartition[level]);                                 CHKERRQ(ierr);
  PetscLogObjectMemory(pc, numParts * sizeof(int *));
  for(part = 0; part < numParts; part++) {
    numLocCols = ml->numPartitionCols[level][part];
    ierr = PetscMalloc(numLocCols * sizeof(int), &ml->colPartition[level][part]);                         CHKERRQ(ierr);
    PetscLogObjectMemory(pc, numLocCols * sizeof(int));
    ml->numPartitionCols[level][part] = 0;
  }
  for(col = 0; col < numCols; col++) {
    ml->colPartition[level][colPartition[col]][ml->numPartitionCols[level][colPartition[col]]++] = globalCols[col];
  }
  for(part = 0, col = 0; part < numParts; part++) {
    col += ml->numPartitionCols[level][part];
  }
  if (col != numCols) {
    int rank;

    ierr = MPI_Comm_rank(pc->comm, &rank);                                                                CHKERRQ(ierr);
    PetscPrintf(PETSC_COMM_SELF, "[%d] %d does not equal the number of columns %d\n", rank, col, numCols);
    for(col = 0; col < numCols; col++) {
      PetscPrintf(PETSC_COMM_SELF, "[%d] part: %d col: %d\n", rank, part, colPartition[col]);
    }
    SETERRQ(PETSC_ERR_PLIB, "Invalid column partition");
  }

  PetscFunctionReturn(0);
}

#undef  __FUNCT__
#define __FUNCT__ "PCMultiLevelPartitionRows_Private"
int PCMultiLevelPartitionRows_Private(PC pc, int level, int numParts, int numCols, int numRows, int *colPartition,
                                      int *rowPartition, int *globalCols, Mat grad)
{
  PC_Multilevel *ml = (PC_Multilevel *) pc->data;
  int            numLocRows, numLocCols, ncols, numSlots;
  int           *cols;
  PetscScalar   *vals;
  int            part, col, locCol, row;
  int            ierr;

  PetscFunctionBegin;
  /* Allocation */
  numSlots = numParts + NUM_PART_ROW_DIV - 1;
  ierr = PetscMalloc(numSlots         * sizeof(int),    &ml->numPartitionRows[level]);                    CHKERRQ(ierr);
  ierr = PetscMalloc(NUM_PART_ROW_DIV * sizeof(int **), &ml->rowPartition[level]);                        CHKERRQ(ierr);
  ierr = PetscMalloc(numParts         * sizeof(int *),  &ml->rowPartition[level][PART_ROW_INT]);          CHKERRQ(ierr);
  ierr = PetscMalloc(1                * sizeof(int *),  &ml->rowPartition[level][PART_ROW_BD]);           CHKERRQ(ierr);
  ierr = PetscMalloc(1                * sizeof(int *),  &ml->rowPartition[level][PART_ROW_RES]);          CHKERRQ(ierr);
  PetscLogObjectMemory(pc, numSlots*sizeof(int) + NUM_PART_ROW_DIV*sizeof(int **) + numSlots*sizeof(int *));

  /* Create row partition */
  for(row = 0; row < numRows; row++) {
    /* Find partitions adjacent to this row */
    ierr = MatGetRow(grad, row, &ncols, &cols, &vals);                                                     CHKERRQ(ierr);
    ierr = PetscMemzero(ml->numPartitionRows[level], numParts * sizeof(int));                              CHKERRQ(ierr);
    for(col = 0; col < ncols; col++) {
      /* Could invert this mapping explicitly or do bisection */
      for(locCol = 0; locCol < numCols; locCol++)
        if (globalCols[locCol] == cols[col])
          break;
      if (locCol == numCols) SETERRQ(PETSC_ERR_ARG_CORRUPT, "Invalid boundary gradient");
      ml->numPartitionRows[level][colPartition[locCol]] = 1;
    }
    ierr = MatRestoreRow(grad, row, &ncols, &cols, &vals);                                                 CHKERRQ(ierr);

    /* Classify row */
    if (ncols == 0)
    {
      /* This is caused by rows with nonzeros only in full rank partitions, so that
         B_\Gamma (I - D^{-1} D) has a null row. */
      rowPartition[row] = 0;
      continue;
    }
    for(part = 0, numLocCols = 0; part < numParts; part++) {
      if (ml->numPartitionRows[level][part] > 0) {
        numLocCols++;
        rowPartition[row] = part;
      }
    }
    if (numLocCols == 2) {
      rowPartition[row] = numParts + PART_ROW_BD  - 1;
    } else if (numLocCols > 2) {
      rowPartition[row] = numParts + PART_ROW_RES - 1;
    } else if (numLocCols < 1) {
      SETERRQ(PETSC_ERR_ARG_CORRUPT, "Invalid boundary gradient");
    }
  }

  /* Copy rows */
  ierr = PetscMemzero(ml->numPartitionRows[level], (numParts + NUM_PART_ROW_DIV - 1) * sizeof(int));      CHKERRQ(ierr);
  for(row = 0; row < numRows; row++) {
    ml->numPartitionRows[level][rowPartition[row]]++;
  }
  for(part = 0; part < numParts; part++) {
    numLocRows = ml->numPartitionRows[level][part];
    if (numLocRows) {
      ierr = PetscMalloc(numLocRows * sizeof(int), &ml->rowPartition[level][PART_ROW_INT][part]);         CHKERRQ(ierr);
      PetscLogObjectMemory(pc, numLocRows * sizeof(int));
    }
    ml->numPartitionRows[level][part] = 0;
  }
  numLocRows = ml->numPartitionRows[level][numParts];
  if (numLocRows) {
    ierr = PetscMalloc(numLocRows * sizeof(int), &ml->rowPartition[level][PART_ROW_BD][0]);               CHKERRQ(ierr);
    PetscLogObjectMemory(pc, numLocRows * sizeof(int));
  }
  ml->numPartitionRows[level][numParts]   = 0;
  numLocRows = ml->numPartitionRows[level][numParts+1];
  if (numLocRows) {
    ierr = PetscMalloc(numLocRows * sizeof(int), &ml->rowPartition[level][PART_ROW_RES][0]);              CHKERRQ(ierr);
    PetscLogObjectMemory(pc, numLocRows * sizeof(int));
  }
  ml->numPartitionRows[level][numParts+1] = 0;
  for(row = 0; row < numRows; row++) {
    if (rowPartition[row] < numParts) {
      ml->rowPartition[level][PART_ROW_INT][rowPartition[row]][ml->numPartitionRows[level][rowPartition[row]]++] = row;
    } else if (rowPartition[row] == numParts) {
      ml->rowPartition[level][PART_ROW_BD][0][ml->numPartitionRows[level][rowPartition[row]]++]  = row;
    } else if (rowPartition[row] >  numParts) {
      ml->rowPartition[level][PART_ROW_RES][0][ml->numPartitionRows[level][rowPartition[row]]++] = row;
    }
  }

  PetscFunctionReturn(0);
}

#undef  __FUNCT__
#define __FUNCT__ "PCMultiLevelFactorPartitions_Private"
int PCMultiLevelFactorPartitions_Private(PC pc, int level, int numParts, Mat grad)
{
  PC_Multilevel *ml = (PC_Multilevel *) pc->data;
  int            ncols;
  int           *cols;
  PetscScalar   *vals;
  PetscReal     *B;
  int            maxSize;
  int            numRows, numCols;
  int            part, row, col, locCol;
  int            ierr;

  PetscFunctionBegin;
  ierr = PC_MLLogEventBegin(PC_ML_ReduceFactor, pc, grad, 0, 0);                                          CHKERRQ(ierr);
  /* Allocation */
  for(part = 0, maxSize = 0; part < numParts; part++) {
    numRows = ml->numPartitionRows[level][part];
    numCols = ml->numPartitionCols[level][part];
    maxSize = PetscMax(maxSize, numRows*numCols);
  }
  if (maxSize == 0) {
    PetscFunctionReturn(2);
    /* SETERRQ(1, "Empty interior (increase partition size)"); */
  }
  ierr = PetscMalloc(numParts * sizeof(PetscReal **), &ml->factors[level]);                               CHKERRQ(ierr);
  PetscLogObjectMemory(pc, numParts * sizeof(PetscReal **));
  ierr = PetscMalloc(maxSize * sizeof(PetscReal), &B);                                                    CHKERRQ(ierr);
  
  for(part = 0; part < numParts; part++) {
    /* Allocation */
    numRows = ml->numPartitionRows[level][part];
    numCols = ml->numPartitionCols[level][part];
    ierr = PetscMalloc(NUM_FACT_DIV    * sizeof(PetscReal *), &ml->factors[level][part]);                 CHKERRQ(ierr);
    ierr = PetscMalloc(numCols         * sizeof(PetscReal),   &ml->factors[level][part][FACT_DINV]);      CHKERRQ(ierr);
    ierr = PetscMalloc(numCols*numCols * sizeof(PetscReal),   &ml->factors[level][part][FACT_V]);         CHKERRQ(ierr);
    PetscLogObjectMemory(pc, NUM_FACT_DIV * sizeof(double *) + (numCols + numCols*numCols)*sizeof(PetscReal));

    if (numRows > 0) {
      /* If numCols < numRows, LAPACK will leave a bogus values in the end of D^{-1} */
      if (numCols > numRows) {
        ierr = PetscMemzero(ml->factors[level][part][FACT_DINV], numCols * sizeof(PetscReal));            CHKERRQ(ierr);
      }

      if (PCMultiLevelDoQR_Private(pc, numRows, numCols) == PETSC_TRUE) {
        /* Allocation */
        ierr = PetscMalloc(numRows*numCols * sizeof(PetscReal), &ml->factors[level][part][FACT_QR]);      CHKERRQ(ierr);
        ierr = PetscMalloc(numCols         * sizeof(PetscReal), &ml->factors[level][part][FACT_TAU]);     CHKERRQ(ierr);
        ierr = PetscMalloc(numCols*numCols * sizeof(PetscReal), &ml->factors[level][part][FACT_U]);       CHKERRQ(ierr);
        PetscLogObjectMemory(pc, (numRows*numCols + numCols + numCols*numCols)*sizeof(PetscReal));

        /* Extract the interior gradient for this partition */
        ierr = PetscMemzero(ml->factors[level][part][FACT_QR], numRows*numCols * sizeof(PetscReal));      CHKERRQ(ierr);
        for(row = 0; row < numRows; row++) {
          ierr = MatGetRow(grad, ml->rowPartition[level][PART_ROW_INT][part][row], &ncols, &cols, &vals); CHKERRQ(ierr);
          for(col = 0; col < ncols; col++) {
            for(locCol = 0; locCol < numCols; locCol++) {
              if (cols[col] == ml->colPartition[level][part][locCol]) break;
            }
            if (locCol < numCols) {
              ml->factors[level][part][FACT_QR][numRows*locCol+row] = vals[col];
            }
          }
          ierr = MatRestoreRow(grad, ml->rowPartition[level][PART_ROW_INT][part][row], &ncols, &cols, &vals); CHKERRQ(ierr);
        }

        /* Do QR of B */
        LAgeqrf_(&numRows, &numCols, ml->factors[level][part][FACT_QR],
               &numRows, ml->factors[level][part][FACT_TAU], ml->svdWork, &ml->svdWorkLen, &ierr);
                                                                                                          CHKERRQ(ierr);

        /* Put R in B */
        ierr = PetscMemzero(B, numCols*numCols * sizeof(PetscReal));                                      CHKERRQ(ierr);
        for(col = 0; col < numCols; col++)
          for(row = 0; row <= col; row++)
            B[col*numCols+row] = ml->factors[level][part][FACT_QR][col*numRows+row];

        /* Do SVD of R */
#if defined(PETSC_MISSING_LAPACK_GESVD) 
  SETERRQ(PETSC_ERR_SUP,"GESVD - Lapack routine is unavilable.");
#else
        LAgesvd_("A", "A", &numCols, &numCols, B, &numCols,
                 ml->factors[level][part][FACT_DINV], ml->factors[level][part][FACT_U], &numCols,
                 ml->factors[level][part][FACT_V], &numCols, ml->svdWork, &ml->svdWorkLen, &ierr);        CHKERRQ(ierr);
#endif
      } else {
        /* Allocation */
        ierr = PetscMalloc(numRows*numRows * sizeof(PetscReal), &ml->factors[level][part][FACT_U]);       CHKERRQ(ierr);
        PetscLogObjectMemory(pc, numRows*numRows * sizeof(PetscReal));

        /* Extract the interior gradient for this partition */
        ierr = PetscMemzero(B, numRows*numCols * sizeof(PetscReal));                                      CHKERRQ(ierr);
        for(row = 0; row < numRows; row++) {
          ierr = MatGetRow(grad, ml->rowPartition[level][PART_ROW_INT][part][row], &ncols, &cols, &vals); CHKERRQ(ierr);
          for(col = 0; col < ncols; col++) {
            for(locCol = 0; locCol < numCols; locCol++) {
              if (cols[col] == ml->colPartition[level][part][locCol]) break;
            }
            if (locCol < numCols) {
              B[numRows*locCol+row] = vals[col];
            }
          }
          ierr = MatRestoreRow(grad, ml->rowPartition[level][PART_ROW_INT][part][row], &ncols, &cols, &vals); CHKERRQ(ierr);
        }

        /* Factor the gradient */
#if defined(PETSC_MISSING_LAPACK_GESVD) 
  SETERRQ(PETSC_ERR_SUP,"GESVD - Lapack routine is unavilable.");
#else
         LAgesvd_("A", "A", &numRows, &numCols, B, &numRows, ml->factors[level][part][FACT_DINV],
               ml->factors[level][part][FACT_U], &numRows, ml->factors[level][part][FACT_V], &numCols,
               ml->svdWork, &ml->svdWorkLen, &ierr);
#endif
                                                                                                          CHKERRQ(ierr);
      }

      /* Invert the singular values */
      for(col = 0; col < numCols; col++) {
        if (ml->factors[level][part][FACT_DINV][col] > ml->zeroTol)
          ml->factors[level][part][FACT_DINV][col] = 1.0/ml->factors[level][part][FACT_DINV][col];
      }
    } else {
      /* Create null singular values for D^{-1} and the identity for V */
      ierr = PetscMemzero(ml->factors[level][part][FACT_DINV], numCols         * sizeof(PetscReal));      CHKERRQ(ierr);
      ierr = PetscMemzero(ml->factors[level][part][FACT_V],    numCols*numCols * sizeof(PetscReal));      CHKERRQ(ierr);
      for(col = 0; col < numCols; col++)
        ml->factors[level][part][FACT_V][col*numCols+col] = 1.0;
    }
  }

  /* Cleanup */
  ierr = PetscFree(B);                                                                                    CHKERRQ(ierr);
  ierr = PC_MLLogEventEnd(PC_ML_ReduceFactor, pc, grad, 0, 0);                                            CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef  __FUNCT__
#define __FUNCT__ "PCMultiLevelExtractBoundaryGradient_Private"
int PCMultiLevelExtractBoundaryGradient_Private(PC pc, int level, int numParts, int colsRemain, int *newCols,
                                                PetscScalar *newVals, Mat grad)
{
  PC_Multilevel *ml = (PC_Multilevel *) pc->data;
  int            ncols;
  int           *cols;
  PetscScalar   *vals;
  PetscScalar   *temp;
  int           *rowLens;
  int            numBdRows, numResRows, rowsRemain, numCols, numVals, prevCols;
  int            part, row, newRow, col, locCol = 0;
  int            ierr;

  PetscFunctionBegin;
  ierr = PC_MLLogEventBegin(PC_ML_ReduceBdGradExtract, pc, grad, 0, 0);                                   CHKERRQ(ierr);
  /* Initialization */
  numBdRows  = ml->numPartitionRows[level][numParts];
  numResRows = ml->numPartitionRows[level][numParts+1];
  rowsRemain = numBdRows + numResRows;

  /* Get the nonzero structure */
  ierr = VecGetArray(ml->bdReduceVecs[level], &temp);                                                     CHKERRQ(ierr);
  rowLens = (int *) temp;
  for(row = 0; row < numBdRows; row++)
  {
    ierr = MatGetRow(grad, ml->rowPartition[level][PART_ROW_BD][0][row], &ncols, &cols, PETSC_NULL);      CHKERRQ(ierr);
    for(col = 0, numVals = 0; col < ncols; col++)
    {
      for(part = 0, prevCols = 0; part < numParts; part++)
      {
        numCols = ml->numPartitionCols[level][part];
        for(locCol = 0; locCol < numCols; locCol++)
          if (cols[col] == ml->colPartition[level][part][locCol])
            break;
        if (locCol < numCols)
          break;
        prevCols += numCols;
      }
      if (part < numParts)
        numVals++;
    }
    if (numVals == 0) SETERRQ(PETSC_ERR_ARG_CORRUPT, "Null boundary row in boundary gradient");
    rowLens[row] = numVals;
    ierr = MatRestoreRow(grad, ml->rowPartition[level][PART_ROW_BD][0][row], &ncols, &cols, PETSC_NULL);  CHKERRQ(ierr);
  }
  for(row = 0; row < numResRows; row++)
  {
    ierr = MatGetRow(grad, ml->rowPartition[level][PART_ROW_RES][0][row], &ncols, &cols, PETSC_NULL);     CHKERRQ(ierr);
    for(col = 0, numVals = 0; col < ncols; col++)
    {
      for(part = 0, prevCols = 0; part < numParts; part++)
      {
        numCols = ml->numPartitionCols[level][part];
        for(locCol = 0; locCol < numCols; locCol++)
          if (cols[col] == ml->colPartition[level][part][locCol])
            break;
        if (locCol < numCols)
          break;
        prevCols += numCols;
      }
      if (part < numParts)
        numVals++;
    }
    if (numVals == 0) SETERRQ(PETSC_ERR_ARG_CORRUPT, "Null residual row in boundary gradient");
    rowLens[row+numBdRows] = numVals;
    ierr = MatRestoreRow(grad, ml->rowPartition[level][PART_ROW_RES][0][row], &ncols, &cols, PETSC_NULL); CHKERRQ(ierr);
  }

  /* Create the boundary gradient B_\Gamma */
  ierr = MatCreateSeqAIJ(PETSC_COMM_SELF, rowsRemain, colsRemain, 0, rowLens, &ml->grads[level]);         CHKERRQ(ierr);
  ierr = MatSetOption(ml->grads[level], MAT_NEW_NONZERO_ALLOCATION_ERR);                                  CHKERRQ(ierr);
  ierr = VecRestoreArray(ml->bdReduceVecs[level], &temp);                                                 CHKERRQ(ierr);

  /* Extract boundary rows */
  for(row = 0; row < numBdRows; row++) {
    ierr = MatGetRow(grad, ml->rowPartition[level][PART_ROW_BD][0][row], &ncols, &cols, &vals);           CHKERRQ(ierr);
    for(col = 0, numVals = 0; col < ncols; col++)
    {
      for(part = 0, prevCols = 0; part < numParts; part++)
      {
        numCols = ml->numPartitionCols[level][part];
        for(locCol = 0; locCol < numCols; locCol++)
          if (cols[col] == ml->colPartition[level][part][locCol])
            break;
        if (locCol < numCols)
          break;
        prevCols += numCols;
      }
      if (part < numParts)
      {
        newCols[numVals] = locCol + prevCols;
        newVals[numVals] = vals[col];
        numVals++;
      }
    }
    ierr = MatSetValues(ml->grads[level], 1, &row, numVals, newCols, newVals, INSERT_VALUES);             CHKERRQ(ierr);
    ierr = MatRestoreRow(grad, ml->rowPartition[level][PART_ROW_BD][0][row], &ncols, &cols, &vals);       CHKERRQ(ierr);
  }

  /* Extract residual rows */
  for(row = 0, newRow = numBdRows; row < numResRows; row++, newRow++)
  {
    ierr = MatGetRow(grad, ml->rowPartition[level][PART_ROW_RES][0][row], &ncols, &cols, &vals);          CHKERRQ(ierr);
    for(col = 0, numVals = 0; col < ncols; col++)
    {
      for(part = 0, prevCols = 0; part < numParts; part++)
      {
        numCols = ml->numPartitionCols[level][part];
        for(locCol = 0; locCol < numCols; locCol++)
          if (cols[col] == ml->colPartition[level][part][locCol])
            break;
        if (locCol < numCols)
          break;
        prevCols += numCols;
      }
      if (part < numParts)
      {
        newCols[numVals] = locCol + prevCols;
        newVals[numVals] = vals[col];
        numVals++;
      }
    }
    ierr = MatSetValues(ml->grads[level], 1, &newRow, numVals, newCols, newVals, INSERT_VALUES);          CHKERRQ(ierr);
    ierr = MatRestoreRow(grad, ml->rowPartition[level][PART_ROW_RES][0][row], &ncols, &cols, &vals);      CHKERRQ(ierr);
  }

  ierr = MatAssemblyBegin(ml->grads[level], MAT_FINAL_ASSEMBLY);                                          CHKERRQ(ierr);
  ierr = MatAssemblyEnd(ml->grads[level], MAT_FINAL_ASSEMBLY);                                            CHKERRQ(ierr);
  ierr = PC_MLLogEventEnd(PC_ML_ReduceBdGradExtract, pc, grad, 0, 0);                                     CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef  __FUNCT__
#define __FUNCT__ "PCMultiLevelShrinkMesh_Private"
int PCMultiLevelShrinkMesh_Private(PC pc, int level, int numNewCols, int *colPartition, int numElements, int **oldMesh,
                                   int *numNewElements, int ***newMesh)
{
  PC_Multilevel *ml      = (PC_Multilevel *) pc->data;
  int            numCols = ml->numVertices[level];
  int           *adjCols;
  int            maxSize;
  int            col, oldCol, newCol, adjCol, count;
  int            elem;
  int            ierr;

  PetscFunctionBegin;
  ierr = PC_MLLogEventBegin(PC_ML_ReduceShrinkMesh, pc, 0, 0, 0);                                         CHKERRQ(ierr);
  ml->numMeshes++;

  /* Allocation */
  maxSize = PetscMax(PetscMin(oldMesh[MESH_OFFSETS][numCols], numNewCols*numNewCols + 1), 1);
  ierr = PetscMalloc(NUM_MESH_DIV   * sizeof(int *), newMesh);                                            CHKERRQ(ierr);
  ierr = PetscMalloc((numNewCols+1) * sizeof(int),   &(*newMesh)[MESH_OFFSETS]);                          CHKERRQ(ierr);
  ierr = PetscMalloc(maxSize        * sizeof(int),   &(*newMesh)[MESH_ADJ]);                              CHKERRQ(ierr);
  ierr = PetscMalloc((numNewCols+1) * sizeof(int),   &adjCols);                                           CHKERRQ(ierr);
  PetscLogObjectMemory(pc, NUM_MESH_DIV * sizeof(int *) + (numNewCols+1 + maxSize)*sizeof(int));
  /* Shrink adjacency lists */
  (*newMesh)[MESH_OFFSETS][0] = 0;
  for(col = 0, count = 0; col < numNewCols; col++) {
    ierr = PetscMemzero(adjCols, numNewCols * sizeof(int));                                               CHKERRQ(ierr);
    /* Put in self edge */
    (*newMesh)[MESH_ADJ][count++] = col;
    adjCols[col] = 1;
    /* Get all columns in this partition */
    for(oldCol = 0; oldCol < numCols; oldCol++) {
      if (colPartition[oldCol] == col) {
        for(adjCol = oldMesh[MESH_OFFSETS][oldCol]; adjCol < oldMesh[MESH_OFFSETS][oldCol+1]; adjCol++) {
          /* Check for adjacent column */
          newCol = colPartition[oldMesh[MESH_ADJ][adjCol]];
          /* Null partitions have number -1 */
          if ((newCol < 0) || (adjCols[newCol]))
            continue;
          (*newMesh)[MESH_ADJ][count++] = newCol;
          adjCols[newCol] = 1;
        }
      }
    }
    (*newMesh)[MESH_OFFSETS][col+1] = count;
  }
  ierr = PetscFree(adjCols);                                                                              CHKERRQ(ierr);

  /* Eliminate redundant elements */
  maxSize = PetscMax(PetscMin(numElements, PetscMax(numCols-3, 0)*2 + 1), 1);
  ierr = PetscMalloc(maxSize*3 * sizeof(int), &(*newMesh)[MESH_ELEM]);                                    CHKERRQ(ierr);
  PetscLogObjectMemory(pc, maxSize*3 * sizeof(int));
  for(elem = 0, *numNewElements = 0; elem < numElements; elem++) {
    if ((colPartition[oldMesh[MESH_ELEM][elem*3]-1]   != colPartition[oldMesh[MESH_ELEM][elem*3+1]-1]) &&
        (colPartition[oldMesh[MESH_ELEM][elem*3+1]-1] != colPartition[oldMesh[MESH_ELEM][elem*3+2]-1]) &&
        (colPartition[oldMesh[MESH_ELEM][elem*3+2]-1] != colPartition[oldMesh[MESH_ELEM][elem*3]-1])   &&
        (colPartition[oldMesh[MESH_ELEM][elem*3]-1]   >= 0)                                            &&
        (colPartition[oldMesh[MESH_ELEM][elem*3+1]-1] >= 0)                                            &&
        (colPartition[oldMesh[MESH_ELEM][elem*3+2]-1] >= 0))
    {
      (*newMesh)[MESH_ELEM][(*numNewElements)*3]   = colPartition[oldMesh[MESH_ELEM][elem*3]-1]   + 1;
      (*newMesh)[MESH_ELEM][(*numNewElements)*3+1] = colPartition[oldMesh[MESH_ELEM][elem*3+1]-1] + 1;
      (*newMesh)[MESH_ELEM][(*numNewElements)*3+2] = colPartition[oldMesh[MESH_ELEM][elem*3+2]-1] + 1;
      (*numNewElements)++;
    }
  }
  ierr = PC_MLLogEventEnd(PC_ML_ReduceShrinkMesh, pc, 0, 0, 0);                                           CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#undef  __FUNCT__
#define __FUNCT__ "PCMultiLevelRowPartitionLocalToGlobal_Private"
int PCMultiLevelRowPartitionLocalToGlobal_Private(PC pc, int level)
{
  PC_Multilevel *ml = (PC_Multilevel *) pc->data;
  PetscScalar   *bdArray;
  int           *prevIndices;
  int           *rowIndices;
  int            numParts, numIntRows, numBdRows, numResRows;
  int            part, row;
  int            ierr;

  PetscFunctionBegin;
  if (level <= 0)
    PetscFunctionReturn(0);
  ierr = PC_MLLogEventBegin(PC_ML_ReduceBdGradRowPartLocalToGlobal, pc, 0, 0, 0);                         CHKERRQ(ierr);
  ierr = VecGetArray(ml->bdReduceVecs[level-1], &bdArray);                                                CHKERRQ(ierr);

  /* Load unreduced rows from last level into a work vector */
  prevIndices = (int *) bdArray;
  numParts   = ml->numPartitions[level-1];
  numBdRows  = ml->numPartitionRows[level-1][numParts];
  ierr = PetscMemcpy(prevIndices,           ml->rowPartition[level-1][PART_ROW_BD][0],  numBdRows  * sizeof(int)); CHKERRQ(ierr);
  numResRows = ml->numPartitionRows[level-1][numParts+1];
  ierr = PetscMemcpy(prevIndices+numBdRows, ml->rowPartition[level-1][PART_ROW_RES][0], numResRows * sizeof(int)); CHKERRQ(ierr);
  /* Interior edges */
  numParts = ml->numPartitions[level];
  for(part = 0; part < numParts; part++)
  {
    rowIndices = ml->rowPartition[level][PART_ROW_INT][part];
    numIntRows = ml->numPartitionRows[level][part];
    for(row = 0; row < numIntRows; row++)
      rowIndices[row] = prevIndices[rowIndices[row]];
  }
  /* Boundary edges */
  rowIndices = ml->rowPartition[level][PART_ROW_BD][0];
  numBdRows  = ml->numPartitionRows[level][numParts];
  for(row = 0; row < numBdRows; row++)
    rowIndices[row] = prevIndices[rowIndices[row]];
  /* Residual edges */
  rowIndices = ml->rowPartition[level][PART_ROW_RES][0];
  numResRows = ml->numPartitionRows[level][numParts+1];
  for(row = 0; row < numResRows; row++)
    rowIndices[row] = prevIndices[rowIndices[row]];

  ierr = VecRestoreArray(ml->bdReduceVecs[level-1], &bdArray);                                            CHKERRQ(ierr);
  ierr = PC_MLLogEventEnd(PC_ML_ReduceBdGradRowPartLocalToGlobal, pc, 0, 0, 0);                           CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#undef  __FUNCT__
#define __FUNCT__ "PCMultiLevelReduce"
/*@C PCMultiLevelReduce
	This function creates a multilevel factorization of the gradient matrix
  which may be used for preconditioning a linear solve. This is called by
  PCSetUp() when the PC_MULTILEVEL type is selected.

  Input Parameters:
. pc - The preconditioner context

  Level: intermediate

.keywords multilevel
.seealso 
@*/
int PCMultiLevelReduce(PC pc)
{
  PC_Multilevel *ml;
  PetscTruth     adj;
  Mat            grad;
  int            ncols;
  int           *cols;
  int           *newCols;
  PetscScalar   *vals;
  PetscScalar   *newVals;
  PetscScalar   *colReduceArray;
  PetscScalar   *colReduceArray2;
  int           *colPartition;
  int           *rowPartition;
  int           *globalCols;
  int           *localCols;
  int           *globalPart;
  int            rowsRemain, colsRemain, newColsRemain;
  int            numParts, numNewParts, numCols, numNullCols, numVals;
  int            numBdRows, numResRows;
  int            rankDef;
  PetscScalar    val;
  int            size, sizeMax;
  int            invalidNull;
  int            level, part, newPart, row, col, col2, locCol, nullCol, vCol, loop, anyAgain;
#define NEW_BD_GRAD
#ifdef NEW_BD_GRAD
  int           *firstCol, *colPart;
#ifdef NEW_BD_GRAD_CHECK
  int            oldPart, oldLocCol;
#endif
#endif
  int            ierr, ierr2;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(pc, PC_COOKIE);
  ml = (PC_Multilevel *) pc->data;

  /* Create P and Z such that

         P^T B Z = / D \
                   \ 0 /

     which is almost and SVD
  */
  if (ml->useMath == PETSC_FALSE) {
    /* Initialization */
    /* ierr = MatConvert(ml->locB, MATSAME, &grad);                                                       CHKERRQ(ierr); */
    grad           = ml->locB;
    colsRemain     = ml->numCols;
    rowsRemain     = ml->numRows;
    ml->rank       = 0;
    sizeMax        = PetscMax(ml->numRows, ml->numCols);
    ml->svdWorkLen = PetscMax(3*PetscMin(ml->numRows,ml->numCols) + PetscMax(ml->numRows,ml->numCols),
                              5*PetscMin(ml->numRows,ml->numCols) - 4);

    /* Allocation */
    ierr = PetscMalloc(ml->maxLevels  * sizeof(int),        &ml->numPartitions);                          CHKERRQ(ierr);
    ierr = PetscMalloc(ml->maxLevels  * sizeof(int *),      &ml->numPartitionCols);                       CHKERRQ(ierr);
    ierr = PetscMalloc(ml->maxLevels  * sizeof(int **),     &ml->colPartition);                           CHKERRQ(ierr);
    ierr = PetscMalloc(ml->maxLevels  * sizeof(int *),      &ml->numPartitionRows);                       CHKERRQ(ierr);
    ierr = PetscMalloc(ml->maxLevels  * sizeof(int ***),    &ml->rowPartition);                           CHKERRQ(ierr);
    ierr = PetscMalloc(ml->maxLevels  * sizeof(double ***), &ml->factors);                                CHKERRQ(ierr);
    ierr = PetscMalloc(ml->maxLevels  * sizeof(Mat),        &ml->grads);                                  CHKERRQ(ierr);
    ierr = PetscMalloc(ml->maxLevels  * sizeof(Vec),        &ml->bdReduceVecs);                           CHKERRQ(ierr);
    ierr = PetscMalloc(ml->maxLevels  * sizeof(Vec),        &ml->colReduceVecs);                          CHKERRQ(ierr);
    ierr = PetscMalloc(ml->maxLevels  * sizeof(Vec),        &ml->colReduceVecs2);                         CHKERRQ(ierr);
    ierr = PetscMalloc(ml->svdWorkLen * sizeof(double),     &ml->svdWork);                                CHKERRQ(ierr);
    ierr = PetscMalloc(ml->numCols    * sizeof(int),        &ml->range);                                  CHKERRQ(ierr);
    PetscLogObjectMemory(pc, ml->maxLevels*(sizeof(int) + sizeof(int *) + sizeof(int **) + sizeof(int *) + sizeof(int ***) + sizeof(double ***) +
                                            sizeof(Mat) + sizeof(Vec)*3) + ml->svdWorkLen * sizeof(double) + ml->numCols * sizeof(int));
    ierr = PetscMalloc(ml->numCols * sizeof(int), &colPartition);                                         CHKERRQ(ierr);
    ierr = PetscMalloc(ml->numCols * sizeof(int), &globalCols);                                           CHKERRQ(ierr);
    ierr = PetscMalloc(sizeMax     * sizeof(int), &rowPartition);                                         CHKERRQ(ierr);

    for(col = 0; col < ml->numCols; col++) {
      ml->range[col]  = -1;
      globalCols[col] = col;
    }

    /* Loop until the graph has ml->QRthresh nodes or all rows have been reduced */
    for(level = 0, anyAgain = 0; (colsRemain > ml->QRthresh) && (rowsRemain > 0); level++) {
      if (level >= ml->maxLevels) SETERRQ(PETSC_ERR_ARG_SIZ, "Exceeded maximum level");

      loop = 2;
      while(loop) {
        /* Partition the columns (nodes): Give a list with the partition number of each column */
        ierr = PCMultiLevelPartitionMesh_Private(pc, level, ml->meshes[level], &numParts, colPartition);  CHKERRQ(ierr);
        ml->numPartitions[level] = numParts;

        /* Get the global columns in each partition */
        ierr = PC_MLLogEventBegin(PC_ML_ReducePartitionRowCol, pc, 0, 0, 0);                              CHKERRQ(ierr);
        ierr = PCMultiLevelPartitionCols_Private(pc, level, numParts, colsRemain, colPartition, globalCols);
                                                                                                          CHKERRQ(ierr);

        /* Partition rows (edges): Give the local rows in each partition, then boundary and residual rows */
        ierr = PCMultiLevelPartitionRows_Private(pc, level, numParts, colsRemain, rowsRemain, colPartition,
                                                 rowPartition, globalCols, grad);
                                                                                                          CHKERRQ(ierr);
        ierr = PC_MLLogEventEnd(PC_ML_ReducePartitionRowCol, pc, 0, 0, 0);                                CHKERRQ(ierr);

        /* Factor the interior of each partition */
        loop = PCMultiLevelFactorPartitions_Private(pc, level, numParts, grad);                           CHKERRQ(ierr);

        if (loop == 2) {
          /* Free memory from bad partition */
          for(part = 0; part < numParts; part++) {
            if (ml->numPartitionCols[level][part] > 0)
              {ierr = PetscFree(ml->colPartition[level][part]);                                           CHKERRQ(ierr);}
            if (ml->numPartitionRows[level][part] > 0)
              {ierr = PetscFree(ml->rowPartition[level][PART_ROW_INT][part]);                             CHKERRQ(ierr);}
          }
          if (ml->numPartitionRows[level][ml->numPartitions[level]] > 0)
            {ierr = PetscFree(ml->rowPartition[level][PART_ROW_BD][0]);                                   CHKERRQ(ierr);}
          if (ml->numPartitionRows[level][ml->numPartitions[level]+1] > 0)
            {ierr = PetscFree(ml->rowPartition[level][PART_ROW_RES][0]);                                  CHKERRQ(ierr);}
          ierr = PetscFree(ml->rowPartition[level][PART_ROW_INT]);                                        CHKERRQ(ierr);
          ierr = PetscFree(ml->rowPartition[level][PART_ROW_BD]);                                         CHKERRQ(ierr);
          ierr = PetscFree(ml->rowPartition[level][PART_ROW_RES]);                                        CHKERRQ(ierr);
          ierr = PetscFree(ml->colPartition[level]);                                                      CHKERRQ(ierr);
          ierr = PetscFree(ml->rowPartition[level]);                                                      CHKERRQ(ierr);
          ierr = PetscFree(ml->numPartitionCols[level]);                                                  CHKERRQ(ierr);
          ierr = PetscFree(ml->numPartitionRows[level]);                                                  CHKERRQ(ierr);
          /* Increase partition size */
          PetscLogInfo(pc, "PCMultiLevelReduce: Increasing partition size from %d to %d\n", ml->partSize, ml->partSize*2);
          if (ml->partSize > ml->numCols) SETERRQ(PETSC_ERR_PLIB, "Partition has grown too large.");
          ml->partSize *= 2;
        } else if (loop != 0) {
                                                                                                          CHKERRQ(loop);
        }
      }

      /* QR factor boundary matrices */

      /* Create the boundary gradient B_\Gamma and workspace for applying it */
      ierr = PC_MLLogEventBegin(PC_ML_ReduceBdGrad, pc, 0, 0, 0);                                         CHKERRQ(ierr);
      numBdRows  = ml->numPartitionRows[level][numParts];
      numResRows = ml->numPartitionRows[level][numParts+1];
      rowsRemain = numBdRows + numResRows;
      ierr = VecCreateSeq(PETSC_COMM_SELF, rowsRemain, &ml->bdReduceVecs[level]);                         CHKERRQ(ierr);
      ierr = VecCreateSeq(PETSC_COMM_SELF, colsRemain, &ml->colReduceVecs[level]);                        CHKERRQ(ierr);
      ierr = VecDuplicate(ml->colReduceVecs[level], &ml->colReduceVecs2[level]);                          CHKERRQ(ierr);

      /* Workspace for matrix reduction */
      ierr = VecGetArray(ml->colReduceVecs[0],  &colReduceArray);                                         CHKERRQ(ierr);
      ierr = VecGetArray(ml->colReduceVecs2[0], &colReduceArray2);                                        CHKERRQ(ierr);
      newCols = (int *) colReduceArray;
      newVals = ml->svdWork;

      /* Extract the boundary gradient B_\Gamma in CSR format */
      ierr = PCMultiLevelExtractBoundaryGradient_Private(pc, level, numParts, colsRemain, newCols, newVals, grad);
                                                                                                          CHKERRQ(ierr);

      /* Convert the local (per level) numbering to the global (original) numbering */
      ierr = PCMultiLevelRowPartitionLocalToGlobal_Private(pc, level);                                    CHKERRQ(ierr);

      /* Determine the columns active at the next level and fix up colPartition[] */
      globalPart = (int *) colReduceArray2;
      localCols  = rowPartition;
      for(part = 0; part < numParts; part++) globalCols[part] = -1;
      for(part = 0, newPart = 0, newColsRemain = 0, numNewParts = numParts; part < numParts; part++, newPart++) {
        for(col = 0, rankDef = 0; col < ml->numPartitionCols[level][part]; col++) {
          if (ml->factors[level][part][FACT_DINV][col] < ml->zeroTol) {
            newColsRemain++;
            rankDef++;

            if (rankDef == 1) {
              globalCols[newPart] = ml->colPartition[level][part][col];
              globalPart[newPart] = part;
              localCols[newPart]  = col;
            } else {
              globalCols[numNewParts] = ml->colPartition[level][part][col];
              globalPart[numNewParts] = part;
              localCols[numNewParts]  = col;

              /* Put the column in its own partition */
              numCols = ml->numPartitionCols[level][part] - (col+1);
              for(col2 = colsRemain-1; numCols >= 0; col2--) {
                if (colPartition[col2] == newPart) {
                  numCols--;
                  if (numCols < 0)
                    colPartition[col2] = numNewParts++;
                }
              }
            }
          } else {
            ml->rank++;
#ifdef PETSC_USE_BOPT_g
            if (ml->range[ml->colPartition[level][part][col]] != -1) {
              PetscLogInfo(pc, "Level %d partition %d col %d: column %d has already been reduced\n",
                       level, part, col, ml->colPartition[level][part][col]);
              SETERRQ(PETSC_ERR_ARG_CORRUPT, "Invalid reduction");
            }
            if ((ml->rowPartition[level][PART_ROW_INT][part][col] < 0) ||
                (ml->rowPartition[level][PART_ROW_INT][part][col] > ml->numRows)) {
              SETERRQ(PETSC_ERR_ARG_CORRUPT, "Invalid row index in range");
            }
#endif
            ml->range[ml->colPartition[level][part][col]] = ml->rowPartition[level][PART_ROW_INT][part][col];
          }
        }
#ifdef PETSC_USE_BOPT_g
        PetscLogInfo(pc, "Lvl: %d Partition: %d has rank deficiency %d\n", level, part, rankDef);
#endif
        if (rankDef == 0) {
          /* Eliminate this partition */
          for(col = 0; col < colsRemain; col++) {
            if (colPartition[col] == newPart) {
              colPartition[col] = -1;
            } else if (colPartition[col] > newPart) {
              colPartition[col]--;
            }
          }
          /* Shift everything down */
          size = PetscMax(numNewParts, numParts) - part;
          ierr = PetscMemmove(&globalCols[newPart], &globalCols[newPart+1], size * sizeof(int));          CHKERRQ(ierr);
          ierr = PetscMemmove(&globalPart[newPart], &globalPart[newPart+1], size * sizeof(int));          CHKERRQ(ierr);
          ierr = PetscMemmove(&localCols[newPart],  &localCols[newPart+1],  size * sizeof(int));          CHKERRQ(ierr);
          newPart--;
          numNewParts--;
        }
      }

#ifdef PETSC_USE_BOPT_g
      for(col = 0, row = 0; col < ml->numCols; col++) {
        if (ml->range[col] == -1) {
          row++;
        } else if (ml->range[col] < 0) {
          SETERRQ(PETSC_ERR_ARG_CORRUPT, "Invalid range space");
        }
      }
      if (row != newColsRemain) SETERRQ(PETSC_ERR_ARG_CORRUPT, "Invalid range space");

      for(vCol = 0; vCol < newColsRemain; vCol++) {
        if (globalCols[vCol] < 0) {
          PetscPrintf(PETSC_COMM_SELF, "Invalid column in reduced boundary gradient\n");
          PetscPrintf(PETSC_COMM_SELF, "Lvl: %d newRows: %d newCols: %d\n", level, rowsRemain, newColsRemain);
          for(col = 0; col < newColsRemain; col++) {
            PetscPrintf(PETSC_COMM_SELF, "  globalCols[%d]: %d\n", col, globalCols[col]);
          }
          SETERRQ(PETSC_ERR_ARG_CORRUPT, "Negative column");
        }
      }
#endif
      ierr = PC_MLLogEventEnd(PC_ML_ReduceBdGrad, pc, 0, 0, 0);                                           CHKERRQ(ierr);

      /* Transform B_\Gamma to B_\Gamma V (I - D^{-1} D) */
      ierr = PC_MLLogEventBegin(PC_ML_CreateBdGrad, pc, 0, 0, 0);                                         CHKERRQ(ierr);
      if (level > 0) {
        ierr = MatDestroy(grad);                                                                          CHKERRQ(ierr);
      }
      ierr = MatCreateSeqAIJ(PETSC_COMM_SELF, rowsRemain, ml->numCols, newColsRemain, PETSC_NULL, &grad);
      if (ierr) {
        PetscPrintf(PETSC_COMM_SELF, "Interface Gradient failure:");
        PetscPrintf(PETSC_COMM_SELF, "  level: %d", level);
        PetscPrintf(PETSC_COMM_SELF, "  rows: %d cols: %d\n", rowsRemain, ml->numCols);
        PetscPrintf(PETSC_COMM_SELF, "  nonzeroes per row: %d\n", newColsRemain);
                                                                                                          CHKERRQ(ierr);
      }
#ifdef NEW_BD_GRAD
      ierr = PetscMalloc((numParts+1) * sizeof(int), &firstCol);                                          CHKERRQ(ierr);
      ierr = PetscMalloc(ml->numCols  * sizeof(int), &colPart);                                           CHKERRQ(ierr);
      for(part = 1, firstCol[0] = 0; part <= numParts; part++) {
        firstCol[part] = firstCol[part-1] + ml->numPartitionCols[level][part-1];
        for(col = 0; col < ml->numPartitionCols[level][part-1]; col++) {
          colPart[col+firstCol[part-1]] = part-1;
        }
      }
#endif
      for(row = 0; row < rowsRemain; row++) {
        ierr = MatGetRow(ml->grads[level], row, &ncols, &cols, &vals);                                    CHKERRQ(ierr);
        /* Sparse VecDot with the active columns of V */
        for(vCol = 0, numVals = 0; vCol < newColsRemain; vCol++) {
          /* Calculate the dot product */
          for(col = 0, adj = PETSC_FALSE, val = 0.0; col < ncols; col++) {
            /* Get the local column number */
#ifndef NEW_BD_GRAD
            locCol = cols[col];
            for(part = 0; part < PetscMin(numParts, globalPart[vCol]+2); part++) {
              numCols = ml->numPartitionCols[level][part];
              if (locCol >= numCols) {
                locCol -= numCols;
                continue;
              }
              break;
            }
#else
            part    = colPart[cols[col]];
            locCol  = cols[col] - firstCol[part];
            numCols = ml->numPartitionCols[level][part];
#ifdef NEW_BD_GRAD_CHECK
            /***** CHECK THE RESULT ***/
            oldLocCol = cols[col];
            for(oldPart = 0; oldPart < PetscMin(numParts, globalPart[vCol]+2); oldPart++) {
              numCols = ml->numPartitionCols[level][oldPart];
              if (oldLocCol >= numCols) {
                oldLocCol -= numCols;
                continue;
              }
              break;
            }
            if ((oldPart == globalPart[vCol]) && (oldPart   != part))   SETERRQ2(PETSC_ERR_PLIB, "Invalid partition %d should be %d", part, oldPart);
            if ((oldPart == globalPart[vCol]) && (oldLocCol != locCol)) SETERRQ2(PETSC_ERR_PLIB, "Invalid local column %d should be %d", locCol, oldLocCol);
#endif
#endif
            /* Multiply by the correct element of V (I - D^{-1} D) */
            if (part == globalPart[vCol]) {
              val += vals[col]*ml->factors[level][part][FACT_V][locCol*numCols+localCols[vCol]];
              adj  = PETSC_TRUE;
            }
          }
          /* Store the dot product in B_\Gamma V (I - D^{-1} D) */
          if (adj == PETSC_TRUE) {
            newCols[numVals] = globalCols[vCol];
            newVals[numVals] = val;
            numVals++;
          }
        }
        /* This is an overestimate */
        PetscLogFlops(numVals*ncols);
#if 0
#ifdef PETSC_USE_BOPT_g
        if (numVals == 0) PetscLogInfo(pc, "Null row %d in reduced boundary gradient matrix\n", row);
#endif
#endif
        ierr = MatSetValues(grad, 1, &row, numVals, newCols, newVals, INSERT_VALUES);                     CHKERRQ(ierr);
        ierr = MatRestoreRow(ml->grads[level], row, &ncols, &cols, &vals);                                CHKERRQ(ierr);
      }
      ierr = MatAssemblyBegin(grad, MAT_FINAL_ASSEMBLY);                                                  CHKERRQ(ierr);
      ierr = MatAssemblyEnd(grad, MAT_FINAL_ASSEMBLY);                                                    CHKERRQ(ierr);
#ifdef NEW_BD_GRAD
      ierr = PetscFree(firstCol);                                                                         CHKERRQ(ierr);
      ierr = PetscFree(colPart);                                                                          CHKERRQ(ierr);
#endif
      ierr = PC_MLLogEventEnd(PC_ML_CreateBdGrad, pc, 0, 0, 0);                                           CHKERRQ(ierr);

      /* Construct the coarse graph */
      ml->numVertices[level+1] = newColsRemain;
      ierr = PCMultiLevelShrinkMesh_Private(pc, level, newColsRemain, colPartition, ml->numElements[level],
                                            ml->meshes[level], &ml->numElements[level+1], &ml->meshes[level+1]);
                                                                                                          CHKERRQ(ierr);

      /* Update variables */
      colsRemain = newColsRemain;
      ierr = VecRestoreArray(ml->colReduceVecs[0],  &colReduceArray);                                     CHKERRQ(ierr);
      ierr = VecRestoreArray(ml->colReduceVecs2[0], &colReduceArray2);                                    CHKERRQ(ierr);

      /* Visualize factorization */
      ierr = PCMultiLevelReduceView(pc, level, colsRemain, colPartition,
                                    ((colsRemain > ml->QRthresh) && (rowsRemain > 0)), &anyAgain);
                                                                                                          CHKERRQ(ierr);
    }
    while(anyAgain) {
      ierr = PCMultiLevelReduceView(pc, -level, 0, colPartition, 0, &anyAgain);                           CHKERRQ(ierr);
    }
    ml->numLevels = level;

    /* Cleanup */
    ierr = PetscFree(colPartition);                                                                       CHKERRQ(ierr);
    ierr = PetscFree(rowPartition);                                                                       CHKERRQ(ierr);
    ierr = PetscFree(globalCols);                                                                         CHKERRQ(ierr);
    if (ml->numLevels > 0) {
      ierr = MatDestroy(grad);                                                                            CHKERRQ(ierr);
    }

    /* Workspace allocation */
    ml->interiorWorkLen = 1;
    for(level = 0; level < ml->numLevels; level++)
      for(part = 0; part < ml->numPartitions[level]; part++)
        ml->interiorWorkLen = PetscMax(ml->interiorWorkLen, ml->numPartitionRows[level][part]);
    ierr = PetscMalloc(ml->interiorWorkLen * sizeof(double), &ml->interiorWork);                          CHKERRQ(ierr);
    ierr = PetscMalloc(ml->interiorWorkLen * sizeof(double), &ml->interiorWork2);                         CHKERRQ(ierr);
    PetscLogObjectMemory(pc, ml->interiorWorkLen*2 * sizeof(double));

    /* Get the null space rows and compress the range */
    ml->globalRank     = ml->rank;
    ml->numLocNullCols = ml->numCols - ml->rank;
    ierr = PetscMalloc(PetscMax(ml->numLocNullCols, 1) * sizeof(int), &ml->nullCols);                     CHKERRQ(ierr);
    PetscLogObjectMemory(pc, PetscMax(ml->numLocNullCols, 1) * sizeof(int));
    ml->nullCols[0]    = -1;
    for(col = 0, nullCol = 0, numNullCols = 0, ierr2 = 0; nullCol < ml->numCols; col++, nullCol++) {
      if (ml->range[col] < 0) {
        if (numNullCols == ml->numLocNullCols) {
          int rank;

          MPI_Comm_rank(pc->comm, &rank);
          PetscPrintf(PETSC_COMM_SELF, "[%d]Range %d, Null %d:\n", rank, ml->rank, ml->numLocNullCols);
          for(col = 0; col < ml->numCols; col++) PetscPrintf(PETSC_COMM_SELF, " %d", ml->range[col]);
          PetscPrintf(PETSC_COMM_SELF, "\n");
          ierr2 = 1;
          break;
        }
        ml->nullCols[numNullCols++] = nullCol;
        ierr = PetscMemmove(&ml->range[col], &ml->range[col+1], (ml->numCols - (col+1)) * sizeof(int));   CHKERRQ(ierr);
        col--;
      }
    }
    ierr = MPI_Allreduce(&ierr2, &invalidNull, 1, MPI_INT, MPI_LOR, pc->comm);                            CHKERRQ(ierr);
    if (invalidNull) SETERRQ(PETSC_ERR_ARG_CORRUPT, "Invalid null space");
    if (numNullCols != ml->numLocNullCols) SETERRQ(PETSC_ERR_ARG_CORRUPT, "Invalid number of null space columns");
    if (ml->numLocNullCols + ml->rank != ml->numCols) SETERRQ(PETSC_ERR_ARG_CORRUPT, "Invalid space decomposition");

#ifdef PETSC_USE_BOPT_g
    ierr = PCValidQ_Multilevel(pc);                                                                       CHKERRQ(ierr);
#endif
  } else {
    Mesh mesh;

    /* Cleanup */
    ierr = GridGetMesh(ml->grid, &mesh);                                                                  CHKERRQ(ierr);
    ierr = MeshDestroyLocalCSR(mesh, ml->meshes[0][MESH_OFFSETS], ml->meshes[0][MESH_ADJ], ml->vertices); CHKERRQ(ierr);
#ifdef PETSC_HAVE_MATHEMATICA
    ierr = ViewerMathematicaReduce(ml->mathViewer, pc, ml->QRthresh);                                     CHKERRQ(ierr);
    ierr = PetscFree(ml->meshes[0][MESH_ELEM]);                                                           CHKERRQ(ierr);
    ierr = PetscFree(ml->meshes[0]);                                                                      CHKERRQ(ierr);
    ierr = PetscFree(ml->meshes);                                                                         CHKERRQ(ierr);
    ierr = PetscFree(ml->numElements);                                                                    CHKERRQ(ierr);
    ierr = PetscFree(ml->numVertices);                                                                    CHKERRQ(ierr);
    /* ierr = PCConvert_Multilevel(pc);                                                                   CHKERRQ(ierr); */
#else
    SETERRQ(PETSC_ERR_SUP, " ");
#endif
  }
  PetscFunctionReturn(0);
}

#undef  __FUNCT__
#define __FUNCT__ "PCMultiLevelFilterVertexCoords_Tutte"
int PCMultiLevelFilterVertexCoords_Tutte(PC pc, int numCols, double *vertices, int **mesh)
{
  PetscRandom gen;
  double      residual   = 1.0;
  double      tol        = 1.0e-8;
  int         iter       = 0;
  int         maxIter    = 10000;
  int         xminNode, xmaxNode, yminNode, ymaxNode;
  double      xmin, xmax, ymin, ymax;
  PetscReal   x, y;
  int         node, neighbor;
  int         ierr;

  PetscFunctionBegin;
  if (numCols < 3) PetscFunctionReturn(0);
  /* Fix boundary */
  ierr = PetscRandomCreate(PETSC_COMM_SELF, RANDOM_DEFAULT, &gen);                                        CHKERRQ(ierr);
  ierr = PetscRandomSetInterval(gen, 0, numCols);                                                         CHKERRQ(ierr);
  xmin = xmax = vertices[0];
  ymin = ymax = vertices[1];
  xminNode = xmaxNode = yminNode = ymaxNode = 0;
  for(node = 0; node < numCols; node++) {
    if (vertices[node*2] < xmin) {
      xmin     = vertices[node*2];
      xminNode = node;
    }
    if (vertices[node*2] > xmax) {
      xmax     = vertices[node*2];
      xmaxNode = node;
    }
    if (vertices[node*2+1] < ymin) {
      ymin     = vertices[node*2+1];
      yminNode = node;
    }
    if (vertices[node*2+1] > ymax) {
      ymax     = vertices[node*2+1];
      ymaxNode = node;
    }
  }
  if ((xmaxNode == yminNode) && (xminNode == ymaxNode)) {
    while((xmaxNode == yminNode) || (xmaxNode == xminNode)) {
      ierr = PetscRandomGetValue(gen, &x);                                                                CHKERRQ(ierr);
      xmaxNode = (int) floor(x);
    }
  }
  if ((xmaxNode == ymaxNode) && (xminNode == yminNode)) {
    while((xmaxNode == ymaxNode) || (xmaxNode == xminNode)) {
      ierr = PetscRandomGetValue(gen, &x);                                                                CHKERRQ(ierr);
      xmaxNode = (int) floor(x);
    }
  }
  ierr = PetscRandomDestroy(gen);                                                                         CHKERRQ(ierr);
  /* First pass: Jacobi Relaxation */
  while((residual > tol) && (iter < maxIter)) {
    for(node = 0; node < numCols; node++) {
      if ((node == xminNode) || (node == xmaxNode) || (node == yminNode) || (node == ymaxNode)) continue;
      vertices[node*2]   = 0.0;
      vertices[node*2+1] = 0.0;
      for(neighbor = mesh[MESH_OFFSETS][node]; neighbor < mesh[MESH_OFFSETS][node+1]; neighbor++) {
        if (mesh[MESH_ADJ][neighbor] != node) {
          vertices[node*2]   += vertices[mesh[MESH_ADJ][neighbor]*2];
          vertices[node*2+1] += vertices[mesh[MESH_ADJ][neighbor]*2+1];
        }
      }
      /* Remember to get rid of self edge */
      vertices[node*2]   /= mesh[MESH_OFFSETS][node+1] - mesh[MESH_OFFSETS][node] - 1;
      vertices[node*2+1] /= mesh[MESH_OFFSETS][node+1] - mesh[MESH_OFFSETS][node] - 1;
    }
    for(node = 0, residual = 0.0; node < numCols; node++) {
      if ((node == xminNode) || (node == xmaxNode) || (node == yminNode) || (node == ymaxNode)) continue;
      x = 0.0;
      y = 0.0;
      for(neighbor = mesh[MESH_OFFSETS][node]; neighbor < mesh[MESH_OFFSETS][node+1]; neighbor++) {
        if (mesh[MESH_ADJ][neighbor] != node) {
          x += vertices[mesh[MESH_ADJ][neighbor]*2];
          y += vertices[mesh[MESH_ADJ][neighbor]*2+1];
        }
      }
      x /= mesh[MESH_OFFSETS][node+1] - mesh[MESH_OFFSETS][node] - 1;
      y /= mesh[MESH_OFFSETS][node+1] - mesh[MESH_OFFSETS][node] - 1;
      residual += PetscAbsReal(x - vertices[node*2]) + PetscAbsReal(y - vertices[node*2+1]);
    }
    residual /= numCols;
    iter++;
  }
  PetscFunctionReturn(0);
}

#undef  __FUNCT__
#define __FUNCT__ "PCMultiLevelCalcVertexCoords_Private"
int PCMultiLevelCalcVertexCoords_Private(PC pc, int numCols, int numNewCols, int *colPartition)
{
  PC_Multilevel *ml = (PC_Multilevel *) pc->data;
  double        *vertices;
  int           *counts;
  int            node;
  int            ierr;

  PetscFunctionBegin;
  if (numNewCols > 0) {
    ierr = PetscMalloc(numNewCols*2 * sizeof(double), &vertices);                                         CHKERRQ(ierr);
    ierr = PetscMalloc(numNewCols   * sizeof(int),    &counts);                                           CHKERRQ(ierr);
    ierr = PetscMemzero(vertices, numNewCols*2 * sizeof(double));                                         CHKERRQ(ierr);
    ierr = PetscMemzero(counts,   numNewCols   * sizeof(int));                                            CHKERRQ(ierr);
    for(node = 0; node < numCols; node++) {
      if (colPartition[node] >= 0) {
        vertices[colPartition[node]*2]   += ml->vertices[node*2];
        vertices[colPartition[node]*2+1] += ml->vertices[node*2+1];
        counts[colPartition[node]]++;
      }
    }
    for(node = 0; node < numNewCols; node++) {
      vertices[node*2]   /= counts[node];
      vertices[node*2+1] /= counts[node];
    }
    ierr = PetscFree(ml->vertices);                                                                       CHKERRQ(ierr);
    ierr = PetscFree(counts);                                                                             CHKERRQ(ierr);
    ml->vertices = vertices;
  }
  PetscFunctionReturn(0);
}

#undef  __FUNCT__
#define __FUNCT__ "PCMultiLevelView_Mesh"
int PCMultiLevelView_Mesh(PC pc, int numCols, int numElements, int **mesh)
{
  PC_Multilevel *ml     = (PC_Multilevel *) pc->data;
  PetscViewer    viewer = ml->factorizationViewer;
  Mesh           smallMesh;
  int           *faces;
  int            face;
  int            ierr;

  PetscFunctionBegin;
  if (numElements > 0) {
    ierr = PetscMalloc(numElements*3 * sizeof(int), &faces);                                              CHKERRQ(ierr);
    ierr = PetscMemcpy(faces, mesh[MESH_ELEM], numElements*3 * sizeof(int));                              CHKERRQ(ierr);
    for(face = 0; face < numElements*3; face++) {
      faces[face]--;
    }
  }
  ierr = MeshCreateTriangular2DCSR(pc->comm, numCols, numElements, ml->vertices, mesh[MESH_OFFSETS], mesh[MESH_ADJ], faces, &smallMesh);
                                                                                                          CHKERRQ(ierr);
  if (numElements > 0) {
    ierr = PetscFree(faces);                                                                              CHKERRQ(ierr);
  }
  ierr = MeshView(smallMesh, viewer);                                                                     CHKERRQ(ierr);
  ierr = MeshDestroy(smallMesh);                                                                          CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef  __FUNCT__
#define __FUNCT__ "PCMultiLevelView_Private"
int PCMultiLevelView_Private(PC pc, PetscViewer viewer, int level, int numCols, int numElements, int **mesh)
{
  PetscDraw draw;
  char      title[256];
  int       maxLevel;
  int       ierr;

  PetscFunctionBegin;
  ierr = PetscViewerDrawClear(viewer);                                                                    CHKERRQ(ierr);
  ierr = PetscViewerDrawGetDraw(viewer, 0, &draw);                                                        CHKERRQ(ierr);
  ierr = MPI_Allreduce(&level, &maxLevel, 1, MPI_INT, MPI_MAX, pc->comm);                                 CHKERRQ(ierr);
  sprintf(title, "ML Mesh, Level %d", maxLevel);
  ierr = PetscDrawSetTitle(draw, title);                                                                  CHKERRQ(ierr);
  ierr = PCMultiLevelView_Mesh(pc, numCols, numElements, mesh);                                           CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef  __FUNCT__
#define __FUNCT__ "PCMultiLevelView_Private"
int PCMultiLevelReduceView(PC pc, int level, int numNewCols, int *colPartition, int again, int *anyAgain)
{
  PC_Multilevel *ml     = (PC_Multilevel *) pc->data;
  PetscViewer    viewer = ml->factorizationViewer;
  int          **newMesh;
  int            numCols, numNewElements;
  int            ierr;

  PetscFunctionBegin;
  if (viewer != PETSC_NULL) {
    if (level < 0) {
      ierr = PCMultiLevelView_Private(pc, viewer, -1, numNewCols, 0, ml->meshes[-level]);                 CHKERRQ(ierr);
    } else {
      numCols        = ml->numVertices[level];
      numNewElements = ml->numElements[level+1];
      newMesh        = ml->meshes[level+1];
      ierr = PCMultiLevelCalcVertexCoords_Private(pc, numCols, numNewCols, colPartition);                 CHKERRQ(ierr);
      ierr = PCMultiLevelFilterVertexCoords_Tutte(pc, numNewCols, ml->vertices, newMesh);                 CHKERRQ(ierr);
      ierr = PCMultiLevelView_Private(pc, viewer, level+1, numNewCols, numNewElements, newMesh);          CHKERRQ(ierr);
    }
    ierr = PetscViewerFlush(viewer);                                                                      CHKERRQ(ierr);
    ierr = MPI_Allreduce(&again, anyAgain, 1, MPI_INT, MPI_LOR, pc->comm);                                CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
