/*
 * Cannon's algorithm for dense matrix-matrix multiplication with
 * non-blocking communication overlapping the local computation.
 *
 * Multiplies the n x n matrices A and B, distributed in square blocks
 * over a sqrt(npes) x sqrt(npes) process grid, accumulating into C:
 *     C = C + A * B        (local nlocal x nlocal blocks)
 *
 * Parameters:
 *   n    - global matrix dimension; must be divisible by sqrt(npes)
 *   a, b - this process's local blocks of A and B; restored to their
 *          original distribution before returning
 *   c    - this process's local block of C, updated in place
 *   comm - communicator whose size must be a perfect square
 *
 * Returns 0 (kept int-returning for compatibility with the implicit-int
 * signature of the original).
 */
int
MatrixMatrixMultiply_NonBlocking(int n, double *a, double *b, double *c,
                                 MPI_Comm comm)
{
  int i, j, nlocal;
  double *a_buffers[2], *b_buffers[2];
  int npes, dims[2], periods[2];
  int myrank, my2drank, mycoords[2];
  int uprank, downrank, leftrank, rightrank;
  int shiftsource, shiftdest;
  MPI_Status status;
  MPI_Status statuses[4];
  MPI_Comm comm_2d;
  MPI_Request reqs[4];

  /* Get the communicator related information */
  MPI_Comm_size(comm, &npes);
  MPI_Comm_rank(comm, &myrank);

  /* Set up the Cartesian topology.  Round instead of truncating so a
     sqrt() result such as 3.9999... still yields 4. */
  dims[0] = dims[1] = (int)(sqrt((double)npes) + 0.5);

  /* Set the periods for wraparound connections */
  periods[0] = periods[1] = 1;

  /* Create the Cartesian topology, with rank reordering */
  MPI_Cart_create(comm, 2, dims, periods, 1, &comm_2d);

  /* Get the rank and coordinates with respect to the new topology */
  MPI_Comm_rank(comm_2d, &my2drank);
  MPI_Cart_coords(comm_2d, my2drank, 2, mycoords);

  /* Compute ranks of the up and left shifts */
  MPI_Cart_shift(comm_2d, 0, -1, &rightrank, &leftrank);
  MPI_Cart_shift(comm_2d, 1, -1, &downrank, &uprank);

  /* Determine the dimension of the local matrix block */
  nlocal = n/dims[0];

  /* Double-buffer A and B so the communication of the next blocks can
     overlap the computation on the current ones. */
  a_buffers[0] = a;
  a_buffers[1] = (double *)malloc(nlocal*nlocal*sizeof(double));
  b_buffers[0] = b;
  b_buffers[1] = (double *)malloc(nlocal*nlocal*sizeof(double));
  if (a_buffers[1] == NULL || b_buffers[1] == NULL)
    MPI_Abort(comm, EXIT_FAILURE);   /* cannot continue without buffers */

  /* Perform the initial matrix alignment. First for A and then for B */
  MPI_Cart_shift(comm_2d, 0, -mycoords[0], &shiftsource, &shiftdest);
  MPI_Sendrecv_replace(a_buffers[0], nlocal*nlocal, MPI_DOUBLE,
                       shiftdest, 1, shiftsource, 1, comm_2d, &status);

  MPI_Cart_shift(comm_2d, 1, -mycoords[1], &shiftsource, &shiftdest);
  MPI_Sendrecv_replace(b_buffers[0], nlocal*nlocal, MPI_DOUBLE,
                       shiftdest, 1, shiftsource, 1, comm_2d, &status);

  /* Get into the main computation loop: at step i, compute with the
     blocks in buffers[i%2] while the next blocks arrive in
     buffers[(i+1)%2]. */
  for (i=0; i<dims[0]; i++) {
    MPI_Isend(a_buffers[i%2], nlocal*nlocal, MPI_DOUBLE,
              leftrank, 1, comm_2d, &reqs[0]);
    MPI_Isend(b_buffers[i%2], nlocal*nlocal, MPI_DOUBLE,
              uprank, 1, comm_2d, &reqs[1]);
    MPI_Irecv(a_buffers[(i+1)%2], nlocal*nlocal, MPI_DOUBLE,
              rightrank, 1, comm_2d, &reqs[2]);
    MPI_Irecv(b_buffers[(i+1)%2], nlocal*nlocal, MPI_DOUBLE,
              downrank, 1, comm_2d, &reqs[3]);

    /* c = c + a*b */
    MatrixMultiply(nlocal, a_buffers[i%2], b_buffers[i%2], c);

    /* The receive buffers of this step become the compute buffers of the
       next, so all four transfers must complete before iterating. */
    MPI_Waitall(4, reqs, statuses);
  }

  /* After dims[0] single-step shifts around the ring the re-aligned
     blocks are back, but they live in a_buffers[dims[0]%2] and
     b_buffers[dims[0]%2].  For an odd grid dimension that is the scratch
     buffer, so running the restoring shift there (as the original code
     did) would leave the caller's a and b unrestored and then free the
     result.  Copy the final blocks back into a and b first, so the
     restore always operates on the caller's arrays. */
  if (dims[0] % 2 != 0) {
    for (j=0; j<nlocal*nlocal; j++) {
      a[j] = a_buffers[1][j];
      b[j] = b_buffers[1][j];
    }
  }

  /* Restore the original distribution of a and b */
  MPI_Cart_shift(comm_2d, 0, +mycoords[0], &shiftsource, &shiftdest);
  MPI_Sendrecv_replace(a, nlocal*nlocal, MPI_DOUBLE,
                       shiftdest, 1, shiftsource, 1, comm_2d, &status);

  MPI_Cart_shift(comm_2d, 1, +mycoords[1], &shiftsource, &shiftdest);
  MPI_Sendrecv_replace(b, nlocal*nlocal, MPI_DOUBLE,
                       shiftdest, 1, shiftsource, 1, comm_2d, &status);

  MPI_Comm_free(&comm_2d); /* Free up communicator */

  free(a_buffers[1]);
  free(b_buffers[1]);

  return 0;
}
