Optimized DenseLU solve
/// <summary>
/// Solves A*X=B for X using LU factorization.
/// </summary>
/// <param name="columnsOfB">The number of columns of B.</param>
/// <param name="a">The square matrix A.</param>
/// <param name="order">The order of the square matrix <paramref name="a"/>.</param>
/// <param name="b">On entry the B matrix; on exit the X matrix.</param>
/// <remarks>This is equivalent to the GETRF and GETRS LAPACK routines.</remarks>
public static unsafe void LUSolveUnsafe(int columnsOfB, double[] aIn, int order, double[] bIn)
{
#region Initialize
if (aIn == null)
throw new ArgumentNullException("a");
if (bIn == null)
throw new ArgumentNullException("b");
int i;
int j;
int k;
double* a = stackalloc double[aIn.Length];
double* b = stackalloc double[bIn.Length];
int* ipiv = stackalloc int[order];
// Marshal
for (i = 0; i < aIn.Length; i++)
a[i] = aIn[i];
for (i = 0; i < bIn.Length; i++)
b[i] = bIn[i];
#endregion
#region Factorize
// Initialize the pivot matrix to the identity permutation.
for (i = 0; i < order; i++)
ipiv[i] = i;
// Outer loop
for (j = 0; j < order; j++)
{
int indexj = j * order;
int indexjj = indexj + j;
// Apply previous transformations.
for (i = 0; i < order; i++)
{
// Most of the time is spent in the following dot product.
int kmax = Math.Min(i, j);
double s = 0.0;
for (k = 0; k < kmax; k++)
s += a[(k * order) + i] * a[indexj + k];
a[indexj + i] -= s;
}
// Find pivot and exchange if necessary.
int p = j;
for (i = j + 1; i < order; i++)
if (Math.Abs(a[indexj + i]) > Math.Abs(a[indexj + p]))
p = i;
if (p != j)
{
for (k = 0; k < order; k++)
{
var temp = a[k * order + p];
a[k * order + p] = a[k * order + j];
a[k * order + j] = temp;
}
ipiv[j] = p;
}
// Compute multipliers.
//if (j < order & a[indexjj] != 0.0)
if (a[indexjj] != 0.0)
for (i = j + 1; i < order; i++)
a[indexj + i] /= a[indexjj];
}
#endregion
#region LUSolveFactored
// Compute the column vector P*B
for (i = 0; i < order; i++)
{
if (ipiv[i] == i)
continue;
int p = ipiv[i];
for (j = 0; j < columnsOfB; j++)
{
var temp = b[j * order + p];
b[j * order + p] = b[j * order + i];
b[j * order + i] = temp;
}
}
// Solve L*Y = P*B
for (k = 0; k < order; k++)
{
var korder = k * order;
for (i = k + 1; i < order; i++)
for (j = 0; j < columnsOfB; j++)
{
var index = j * order;
b[i + index] -= b[k + index] * a[i + korder];
}
}
// Solve U*X = Y;
for (k = order - 1; k >= 0; k--)
{
var korder = k + (k * order);
for (j = 0; j < columnsOfB; j++)
b[k + (j * order)] /= a[korder];
korder = k * order;
for (i = 0; i < k; i++)
for (j = 0; j < columnsOfB; j++)
b[i + j * order] -= b[k + j * order] * a[i + korder];
}
#endregion
// Marshal
for (i = 0; i < bIn.Length; i++)
bIn[i] = b[i];
}
/// <summary>
/// Solves A*X=B for X using LU factorization.
/// </summary>
/// <param name="columnsOfB">The number of columns of B.</param>
/// <param name="a">The square matrix A.</param>
/// <param name="order">The order of the square matrix <paramref name="a"/>.</param>
/// <param name="b">On entry the B matrix; on exit the X matrix.</param>
/// <remarks>This is equivalent to the GETRF and GETRS LAPACK routines.</remarks>
public static void LUSolveSafe(int columnsOfB, double[] aIn, int order, double[] bIn)
{
#region Initialize
if (aIn == null)
throw new ArgumentNullException("a");
if (bIn == null)
throw new ArgumentNullException("b");
int i;
int j;
int k;
double[] a = new double[aIn.Length];
int[] ipiv = new int[order];
Array.Copy(aIn, a, aIn.Length);
#endregion
#region Factorize
// Initialize the pivot matrix to the identity permutation.
for (i = 0; i < order; i++)
ipiv[i] = i;
// Outer loop
for (j = 0; j < order; j++)
{
int indexj = j * order;
int indexjj = indexj + j;
// Apply previous transformations.
for (i = 0; i < order; i++)
{
// Most of the time is spent in the following dot product.
int kmax = Math.Min(i, j);
double s = 0.0;
for (k = 0; k < kmax; k++)
s += a[(k * order) + i] * a[indexj + k];
a[indexj + i] -= s;
}
// Find pivot and exchange if necessary.
int p = j;
for (i = j + 1; i < order; i++)
if (Math.Abs(a[indexj + i]) > Math.Abs(a[indexj + p]))
p = i;
if (p != j)
{
for (k = 0; k < order; k++)
{
var temp = a[k * order + p];
a[k * order + p] = a[k * order + j];
a[k * order + j] = temp;
}
ipiv[j] = p;
}
// Compute multipliers.
//if (j < order & a[indexjj] != 0.0)
if (a[indexjj] != 0.0)
for (i = j + 1; i < order; i++)
a[indexj + i] /= a[indexjj];
}
#endregion
#region LUSolveFactored
// Compute the column vector P*B
for (i = 0; i < order; i++)
{
if (ipiv[i] == i)
continue;
int p = ipiv[i];
for (j = 0; j < columnsOfB; j++)
{
var temp = bIn[j * order + p];
bIn[j * order + p] = bIn[j * order + i];
bIn[j * order + i] = temp;
}
}
// Solve L*Y = P*B
for (k = 0; k < order; k++)
{
var korder = k * order;
for (i = k + 1; i < order; i++)
for (j = 0; j < columnsOfB; j++)
{
var index = j * order;
bIn[i + index] -= bIn[k + index] * a[i + korder];
}
}
// Solve U*X = Y;
for (k = order - 1; k >= 0; k--)
{
var korder = k + (k * order);
for (j = 0; j < columnsOfB; j++)
bIn[k + (j * order)] /= a[korder];
korder = k * order;
for (i = 0; i < k; i++)
for (j = 0; j < columnsOfB; j++)
bIn[i + j * order] -= bIn[k + j * order] * a[i + korder];
}
#endregion
}
-
Doug commented
Designed for HPC and the performance obsessed. We hit this class very hard with inner loops running billions of times by the end of a run. Tested up to A = 1000x1000 and B = 1000x8. It will run into stack out of memory issues for untested sizes.
The stackalloc on the unsafe version is a significant heap (garbage collector) saver, and eliminating the array bounds checking in the rest of the code leads to significant performance gains.
In a non-toy application this gives us a 20% performance boost.