|
|
using System;
using System.Linq;

using MathNet.Numerics.LinearAlgebra;
using MathNet.Numerics.LinearAlgebra.Double;
|
|
|
|
|
|
namespace Validation.Algorithms
|
|
|
{
|
|
|
/// <summary>
/// Robust Mahalanobis distance calculator based on the Minimum Covariance
/// Determinant (MCD) estimator. This mirrors the approach of Python's
/// sklearn <c>MinCovDet</c>; no equivalent C# library exists, so the
/// algorithm is implemented here directly.
/// </summary>
public class RobustMahalanobis
{
    private readonly int _h;                       // subset size; 0 means "derive as max(p+1, (n+p+1)/2)" per dataset
    private readonly int _maxIterations;           // maximum MCD C-step iterations per random start
    private readonly double _convergenceThreshold; // |det(old) - det(new)| below this counts as converged
    private readonly double _regularization;       // ridge term added to the covariance diagonal (avoids singularity)
    private readonly int _numRandomStarts;         // number of random initial subsets tried (more = more robust)

    /// <summary>
    /// Creates a robust Mahalanobis distance calculator.
    /// </summary>
    /// <param name="h">Subset size; when null it is derived per dataset as max(p+1, (n+p+1)/2).</param>
    /// <param name="maxIterations">Maximum MCD (C-step) iterations per random start; must be positive.</param>
    /// <param name="convergenceThreshold">Threshold on the covariance determinant change used to detect convergence.</param>
    /// <param name="regularization">Ridge term added to the covariance diagonal to avoid a singular matrix.</param>
    /// <param name="numRandomStarts">Number of random initial subsets to try; must be positive.</param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// Thrown when <paramref name="maxIterations"/> or <paramref name="numRandomStarts"/> is not positive.
    /// </exception>
    public RobustMahalanobis(
        int? h = null,
        int maxIterations = 100,
        double convergenceThreshold = 1e-6,
        double regularization = 1e-8,
        int numRandomStarts = 50)
    {
        if (maxIterations <= 0)
            throw new ArgumentOutOfRangeException(nameof(maxIterations), "maxIterations must be positive.");
        if (numRandomStarts <= 0)
            throw new ArgumentOutOfRangeException(nameof(numRandomStarts), "numRandomStarts must be positive.");

        _maxIterations = maxIterations;
        _convergenceThreshold = convergenceThreshold;
        _regularization = regularization;
        _numRandomStarts = numRandomStarts;
        _h = h ?? 0; // 0 = auto-derive in ComputeRobustEstimates
    }

    /// <summary>
    /// Computes the robust Mahalanobis distance of every row (sample) in
    /// <paramref name="data"/> against the MCD location/scatter estimate.
    /// </summary>
    /// <param name="data">Sample matrix: one row per observation, one column per feature.</param>
    /// <returns>One distance per row of <paramref name="data"/>, in row order.</returns>
    public double[] Calculate(Matrix<double> data)
    {
        var (mean, covariance) = ComputeRobustEstimates(data);
        return CalculateDistances(data, mean, covariance);
    }

    /// <summary>
    /// Runs the MCD procedure: tries several random initial subsets, iterates
    /// C-steps on each, and keeps the estimate whose (regularized) covariance
    /// has the smallest positive determinant.
    /// </summary>
    /// <exception cref="ArgumentException">Thrown when the subset size h does not satisfy p &lt; h &lt;= n.</exception>
    private (Vector<double> mean, Matrix<double> covariance) ComputeRobustEstimates(Matrix<double> data)
    {
        int n = data.RowCount;
        int p = data.ColumnCount;
        int h = _h > 0 ? _h : Math.Max(p + 1, (n + p + 1) / 2);

        if (h <= p || h > n)
        {
            throw new ArgumentException($"h must satisfy p < h <= n, where h={h}, p={p}, n={n}");
        }

        var random = new Random(42); // fixed seed: results are reproducible across runs

        // Try multiple random starts and keep the solution with the smallest determinant.
        (Vector<double> mean, Matrix<double> cov)? best = null;
        double bestDet = double.MaxValue;

        for (int i = 0; i < _numRandomStarts; i++)
        {
            var indices = GetRandomIndices(n, h, random);
            var subset = ExtractRows(data, indices);
            var estimate = RunMCD(data, subset, h);

            var covReg = AddRegularization(estimate.cov, _regularization);
            double det = covReg.Determinant();

            if (det > 0 && det < bestDet)
            {
                bestDet = det;
                best = (estimate.mean, covReg);
            }
        }

        if (best is not null)
        {
            return best.Value;
        }

        // Fallback: plain mean/covariance over all samples. Regularize it too,
        // so callers never receive a knowingly singular covariance.
        var (fallbackMean, fallbackCov) = ComputeMeanCovariance(data);
        return (fallbackMean, AddRegularization(fallbackCov, _regularization));
    }

    /// <summary>
    /// Iterates MCD "C-steps" from an initial subset: estimate mean/covariance,
    /// select the h samples with the smallest Mahalanobis distance under that
    /// estimate, and repeat until the covariance determinant stabilizes.
    /// </summary>
    private (Vector<double> mean, Matrix<double> cov) RunMCD(Matrix<double> data, Matrix<double> initialSubset, int h)
    {
        var (mean, cov) = ComputeMeanCovariance(initialSubset);

        for (int iter = 0; iter < _maxIterations; iter++)
        {
            var distances = CalculateDistances(data, mean, cov);
            var bestIndices = GetBestIndices(distances, h);
            var newSubset = ExtractRows(data, bestIndices);
            var (newMean, newCov) = ComputeMeanCovariance(newSubset);

            double detNew = newCov.Determinant();

            // A NaN or non-positive determinant means the C-step produced a
            // degenerate covariance; keep the last valid estimate instead of
            // propagating the broken one (the previous code returned the
            // degenerate pair here).
            if (double.IsNaN(detNew) || detNew <= 0)
            {
                return (mean, cov);
            }

            if (Math.Abs(cov.Determinant() - detNew) < _convergenceThreshold)
            {
                return (newMean, newCov);
            }

            mean = newMean;
            cov = newCov;
        }

        return (mean, cov); // iteration cap reached
    }

    /// <summary>
    /// Returns the indices of the h samples with the smallest distances.
    /// </summary>
    private static int[] GetBestIndices(double[] distances, int h)
    {
        return distances
            .Select((dist, idx) => (dist, idx))
            .OrderBy(x => x.dist)
            .Take(h)
            .Select(x => x.idx)
            .ToArray();
    }

    /// <summary>
    /// Draws h distinct row indices from [0, n) at random.
    /// </summary>
    private static int[] GetRandomIndices(int n, int h, Random random)
    {
        // Sort-by-random-key yields a random permutation; keep the first h entries.
        return Enumerable.Range(0, n).OrderBy(_ => random.Next()).Take(h).ToArray();
    }

    /// <summary>
    /// Sample mean and unbiased (n-1 denominator) covariance of the rows of
    /// <paramref name="subset"/>. Degenerates to (zero mean, identity) when
    /// there are fewer than two rows.
    /// </summary>
    private static (Vector<double> mean, Matrix<double> cov) ComputeMeanCovariance(Matrix<double> subset)
    {
        int n = subset.RowCount;
        int p = subset.ColumnCount;

        if (n <= 1)
        {
            // Too few rows for a covariance; identity keeps downstream math well-defined.
            return (Vector<double>.Build.Dense(p), DenseMatrix.CreateIdentity(p));
        }

        var mean = Vector<double>.Build.Dense(p, i => subset.Column(i).Average());
        var centered = DenseMatrix.Create(n, p, (i, j) => subset[i, j] - mean[j]);
        var cov = centered.Transpose() * centered / (n - 1);
        return (mean, cov);
    }

    /// <summary>
    /// Mahalanobis distance of every row of <paramref name="data"/> from
    /// <paramref name="mean"/> under covariance <paramref name="cov"/>.
    /// </summary>
    private static double[] CalculateDistances(Matrix<double> data, Vector<double> mean, Matrix<double> cov)
    {
        var invCov = ComputeInverseCovariance(cov);
        return data.EnumerateRows().Select(row => CalculateDistance(row, mean, invCov)).ToArray();
    }

    /// <summary>
    /// sqrt((x - mean)^T * invCov * (x - mean)); clamped at zero so numerical
    /// noise in a near-singular invCov cannot feed sqrt a negative value.
    /// </summary>
    private static double CalculateDistance(Vector<double> point, Vector<double> mean, Matrix<double> invCov)
    {
        var diff = point - mean;
        return Math.Sqrt(Math.Max(0, diff * invCov * diff));
    }

    /// <summary>
    /// Inverts the covariance, falling back to an SVD pseudo-inverse when the
    /// matrix is singular. MathNet's <c>Inverse()</c> does not reliably throw
    /// on a singular input — it can return NaN/Infinity entries instead — so
    /// the result is validated explicitly as well.
    /// </summary>
    private static Matrix<double> ComputeInverseCovariance(Matrix<double> cov)
    {
        try
        {
            var inverse = cov.Inverse();
            if (inverse.Enumerate().All(v => !double.IsNaN(v) && !double.IsInfinity(v)))
            {
                return inverse;
            }
        }
        catch
        {
            // fall through to the pseudo-inverse below
        }

        return ComputePseudoInverse(cov);
    }

    /// <summary>
    /// Moore-Penrose pseudo-inverse via SVD: invert singular values above a
    /// small threshold, zero out the rest.
    /// </summary>
    private static Matrix<double> ComputePseudoInverse(Matrix<double> cov)
    {
        var svd = cov.Svd(true);
        var u = svd.U;
        var s = svd.S;
        var vt = svd.VT;

        var invS = DenseMatrix.Create(s.Count, s.Count,
            (i, j) => i == j ? (s[i] > 1e-10 ? 1.0 / s[i] : 0.0) : 0.0);

        return vt.Transpose() * invS * u.Transpose();
    }

    /// <summary>
    /// Returns cov + reg * I (diagonal loading) without mutating the input.
    /// </summary>
    private static Matrix<double> AddRegularization(Matrix<double> cov, double reg)
    {
        var result = cov.Clone();
        for (int i = 0; i < cov.RowCount; i++)
        {
            result[i, i] += reg;
        }
        return result;
    }

    /// <summary>
    /// Copies the given rows of <paramref name="data"/> into a new dense matrix.
    /// </summary>
    private static Matrix<double> ExtractRows(Matrix<double> data, int[] indices)
    {
        return DenseMatrix.Create(indices.Length, data.ColumnCount, (i, j) => data[indices[i], j]);
    }
}
|
|
|
}
|