// RobustMahalanobis.cs — robust Mahalanobis distance via the Minimum Covariance Determinant (MCD) estimator.
using MathNet.Numerics.LinearAlgebra;
using MathNet.Numerics.LinearAlgebra.Double;
namespace Validation.Algorithms
{
/// <summary>
/// Robust Mahalanobis distance calculator based on the Minimum Covariance
/// Determinant (MCD) estimator. This mirrors scikit-learn's MinCovDet;
/// no C# library provides an equivalent, hence this implementation.
/// </summary>
public class RobustMahalanobis
{
    private readonly int _h;                        // subset size; 0 = auto: max(p + 1, (n + p + 1) / 2)
    private readonly int _maxIterations;            // maximum C-step iterations per random start
    private readonly double _convergenceThreshold;  // tolerance on the determinant change between C-steps
    private readonly double _regularization;        // ridge added to the covariance diagonal (avoids singularity)
    private readonly int _numRandomStarts;          // random initial subsets tried (more = more robust)

    /// <summary>
    /// Creates the estimator.
    /// </summary>
    /// <param name="h">Subset size; when null (or non-positive) it is derived as max(p + 1, (n + p + 1) / 2).</param>
    /// <param name="maxIterations">Upper bound on C-step iterations per random start.</param>
    /// <param name="convergenceThreshold">Stop when |det_old - det_new| falls below this value.</param>
    /// <param name="regularization">Ridge added to the covariance diagonal to keep it invertible.</param>
    /// <param name="numRandomStarts">Random initial subsets tried; the smallest-determinant solution wins.</param>
    /// <exception cref="ArgumentOutOfRangeException">Thrown when a numeric parameter is outside its valid range.</exception>
    public RobustMahalanobis(
        int? h = null,
        int maxIterations = 100,
        double convergenceThreshold = 1e-6,
        double regularization = 1e-8,
        int numRandomStarts = 50)
    {
        // Validate up front so misconfiguration fails fast instead of looping forever / silently.
        if (maxIterations < 1)
            throw new ArgumentOutOfRangeException(nameof(maxIterations), "must be >= 1");
        if (convergenceThreshold <= 0)
            throw new ArgumentOutOfRangeException(nameof(convergenceThreshold), "must be > 0");
        if (regularization < 0)
            throw new ArgumentOutOfRangeException(nameof(regularization), "must be >= 0");
        if (numRandomStarts < 1)
            throw new ArgumentOutOfRangeException(nameof(numRandomStarts), "must be >= 1");

        _maxIterations = maxIterations;
        _convergenceThreshold = convergenceThreshold;
        _regularization = regularization;
        _numRandomStarts = numRandomStarts;
        _h = h ?? 0; // 0 means "auto-size later, once n and p are known"
    }

    /// <summary>
    /// Computes the robust Mahalanobis distance for every row (sample) of <paramref name="data"/>.
    /// </summary>
    /// <param name="data">An n-by-p matrix: one row per sample, one column per feature.</param>
    /// <returns>One distance per input row, in row order.</returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="data"/> is null.</exception>
    /// <exception cref="ArgumentException">Thrown when the (auto-)selected subset size h violates p &lt; h &lt;= n.</exception>
    public double[] Calculate(Matrix<double> data)
    {
        if (data == null)
            throw new ArgumentNullException(nameof(data));
        var (mean, covariance) = ComputeRobustEstimates(data);
        return CalculateDistances(data, mean, covariance);
    }

    /// <summary>
    /// Computes the robust mean and covariance via MCD: several random starts,
    /// keeping the regularized solution with the smallest determinant.
    /// </summary>
    private (Vector<double> mean, Matrix<double> covariance) ComputeRobustEstimates(Matrix<double> data)
    {
        int n = data.RowCount;
        int p = data.ColumnCount;
        int h = _h > 0 ? _h : Math.Max(p + 1, (n + p + 1) / 2);
        if (h <= p || h > n)
        {
            throw new ArgumentException($"h must satisfy p < h <= n, where h={h}, p={p}, n={n}");
        }

        // Fixed seed keeps results reproducible across runs.
        var random = new Random(42);

        (Vector<double> mean, Matrix<double> cov)? best = null;
        double bestDet = double.MaxValue;
        for (int i = 0; i < _numRandomStarts; i++)
        {
            var indices = GetRandomIndices(n, h, random);
            var subset = ExtractRows(data, indices);
            var estimate = RunMCD(data, subset, h);
            var covReg = AddRegularization(estimate.cov, _regularization);
            double det = covReg.Determinant();
            if (det > 0 && det < bestDet)
            {
                bestDet = det;
                best = (estimate.mean, covReg);
            }
        }
        if (best != null)
        {
            return best.Value;
        }

        // Fallback: classical mean/covariance. Regularize it too, so the fallback
        // is as well-conditioned as the MCD path (the original skipped this step).
        var (fallbackMean, fallbackCov) = ComputeMeanCovariance(data);
        return (fallbackMean, AddRegularization(fallbackCov, _regularization));
    }

    /// <summary>
    /// Runs MCD C-steps from one initial subset: repeatedly refit mean/covariance
    /// on the h points closest to the current estimate until the determinant converges.
    /// </summary>
    private (Vector<double> mean, Matrix<double> cov) RunMCD(Matrix<double> data, Matrix<double> initialSubset, int h)
    {
        var (mean, cov) = ComputeMeanCovariance(initialSubset);
        for (int iter = 0; iter < _maxIterations; iter++)
        {
            // C-step: keep the h samples with the smallest current distances.
            var distances = CalculateDistances(data, mean, cov);
            var bestIndices = GetBestIndices(distances, h);
            var newSubset = ExtractRows(data, bestIndices);
            var (newMean, newCov) = ComputeMeanCovariance(newSubset);

            double detNew = newCov.Determinant();
            // Degenerate step (singular or NaN covariance): keep the last valid
            // estimate rather than returning the broken one (bug fix — the original
            // returned newMean/newCov here).
            if (double.IsNaN(detNew) || detNew <= 0)
                return (mean, cov);

            double detOld = cov.Determinant();
            if (HasConverged(detOld, detNew))
                return (newMean, newCov);

            mean = newMean;
            cov = newCov;
        }
        return (mean, cov); // iteration cap reached
    }

    // True when the determinant change between consecutive C-steps is below tolerance.
    private bool HasConverged(double prevDet, double currentDet)
    {
        return Math.Abs(prevDet - currentDet) < _convergenceThreshold;
    }

    // Indices of the h smallest distances (ascending, stable for ties).
    private static int[] GetBestIndices(double[] distances, int h)
    {
        return distances
            .Select((dist, idx) => new { dist, idx })
            .OrderBy(x => x.dist)
            .Take(h)
            .Select(x => x.idx)
            .ToArray();
    }

    // h indices sampled without replacement from [0, n) via a random shuffle.
    private static int[] GetRandomIndices(int n, int h, Random random)
    {
        return Enumerable.Range(0, n).OrderBy(_ => random.Next()).Take(h).ToArray();
    }

    // Sample mean and (n - 1)-normalized covariance of the subset's rows.
    // A subset with fewer than 2 rows has no defined covariance; return a
    // zero mean and identity covariance as a safe placeholder.
    private static (Vector<double> mean, Matrix<double> cov) ComputeMeanCovariance(Matrix<double> subset)
    {
        int n = subset.RowCount;
        int p = subset.ColumnCount;
        if (n <= 1)
        {
            return (Vector<double>.Build.Dense(p), DenseMatrix.CreateIdentity(p));
        }
        var mean = Vector<double>.Build.Dense(p, i => subset.Column(i).Average());
        var centered = DenseMatrix.Create(n, p, (i, j) => subset[i, j] - mean[j]);
        var cov = centered.Transpose() * centered / (n - 1);
        return (mean, cov);
    }

    // Mahalanobis distance of each row of data under (mean, cov).
    private static double[] CalculateDistances(Matrix<double> data, Vector<double> mean, Matrix<double> cov)
    {
        var invCov = ComputeInverseCovariance(cov); // invert once, reuse for every row
        return data.EnumerateRows().Select(row => CalculateDistance(row, mean, invCov)).ToArray();
    }

    // sqrt((x - mu)^T * Sigma^-1 * (x - mu)); clamped at 0 against tiny negative
    // values from floating-point round-off.
    private static double CalculateDistance(Vector<double> point, Vector<double> mean, Matrix<double> invCov)
    {
        var diff = point - mean;
        return Math.Sqrt(Math.Max(0, diff * invCov * diff));
    }

    // Inverse of cov, falling back to the SVD pseudo-inverse when cov is singular.
    private static Matrix<double> ComputeInverseCovariance(Matrix<double> cov)
    {
        try
        {
            var inv = cov.Inverse();
            // Bug fix: MathNet's Inverse() often returns NaN/Infinity entries for a
            // singular matrix instead of throwing — detect that and fall back too.
            if (inv.Enumerate().All(v => !double.IsNaN(v) && !double.IsInfinity(v)))
            {
                return inv;
            }
        }
        catch
        {
            // fall through to the pseudo-inverse
        }
        return ComputePseudoInverse(cov);
    }

    // Moore-Penrose pseudo-inverse via SVD; singular values below 1e-10 are
    // treated as zero (their reciprocal is dropped).
    private static Matrix<double> ComputePseudoInverse(Matrix<double> cov)
    {
        var svd = cov.Svd(true);
        var u = svd.U;
        var s = svd.S;
        var vt = svd.VT;
        var invS = DenseMatrix.Create(s.Count, s.Count,
            (i, j) => i == j ? (s[i] > 1e-10 ? 1.0 / s[i] : 0.0) : 0.0);
        return vt.Transpose() * invS * u.Transpose();
    }

    // Returns a copy of cov with reg added to every diagonal entry (ridge).
    private static Matrix<double> AddRegularization(Matrix<double> cov, double reg)
    {
        var result = cov.Clone();
        for (int i = 0; i < cov.RowCount; i++)
        {
            result[i, i] += reg;
        }
        return result;
    }

    // New matrix built from the given rows of data, in the given order.
    private static Matrix<double> ExtractRows(Matrix<double> data, int[] indices)
    {
        return DenseMatrix.Create(indices.Length, data.ColumnCount, (i, j) => data[indices[i], j]);
    }
}
}