You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

197 lines
7.0 KiB
C#

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

using System;
using System.Linq;
using MathNet.Numerics.LinearAlgebra;
using MathNet.Numerics.LinearAlgebra.Double;
namespace Validation.Algorithms
{
/// <summary>
/// Robust Mahalanobis distance calculator based on the Minimum Covariance
/// Determinant (MCD) estimator. This is the same approach used by Python's
/// sklearn <c>MinCovDet</c>; no equivalent C# library exists, so it is
/// implemented here directly.
/// </summary>
public class RobustMahalanobis
{
    private readonly int _h;                       // subset size (0 => auto: max(p + 1, (n + p + 1) / 2))
    private readonly int _maxIterations;           // maximum C-step iterations per MCD run
    private readonly double _convergenceThreshold; // convergence threshold on the covariance determinant
    private readonly double _regularization;       // diagonal loading added to avoid a singular covariance
    private readonly int _numRandomStarts;         // number of random initial subsets (more = more robust)

    /// <summary>
    /// Creates a robust Mahalanobis distance calculator.
    /// </summary>
    /// <param name="h">Subset size; when null (or non-positive) it is derived per data set as max(p + 1, (n + p + 1) / 2).</param>
    /// <param name="maxIterations">Maximum number of C-step iterations per random start.</param>
    /// <param name="convergenceThreshold">Threshold on the change of the covariance determinant between iterations.</param>
    /// <param name="regularization">Value added to the covariance diagonal to avoid singularity.</param>
    /// <param name="numRandomStarts">Number of random initial subsets to try; the smallest-determinant solution wins.</param>
    /// <exception cref="ArgumentOutOfRangeException">When a numeric argument is outside its valid range.</exception>
    public RobustMahalanobis(
        int? h = null,
        int maxIterations = 100,
        double convergenceThreshold = 1e-6,
        double regularization = 1e-8,
        int numRandomStarts = 50)
    {
        if (maxIterations <= 0)
            throw new ArgumentOutOfRangeException(nameof(maxIterations), "must be positive");
        if (convergenceThreshold <= 0)
            throw new ArgumentOutOfRangeException(nameof(convergenceThreshold), "must be positive");
        if (regularization < 0)
            throw new ArgumentOutOfRangeException(nameof(regularization), "must be non-negative");
        if (numRandomStarts <= 0)
            throw new ArgumentOutOfRangeException(nameof(numRandomStarts), "must be positive");

        _maxIterations = maxIterations;
        _convergenceThreshold = convergenceThreshold;
        _regularization = regularization;
        _numRandomStarts = numRandomStarts;
        _h = h ?? 0; // 0 means "derive from the data in ComputeRobustEstimates"
    }

    /// <summary>
    /// Computes the robust Mahalanobis distance of every sample in <paramref name="data"/>.
    /// </summary>
    /// <param name="data">Sample matrix, one observation per row.</param>
    /// <returns>One distance per row of <paramref name="data"/>.</returns>
    public double[] Calculate(Matrix<double> data)
    {
        var (mean, covariance) = ComputeRobustEstimates(data);
        return CalculateDistances(data, mean, covariance);
    }

    /// <summary>
    /// Computes the robust location and scatter estimates (MCD): runs the
    /// C-step iteration from several random initial subsets and keeps the
    /// solution whose (regularized) covariance has the smallest determinant.
    /// </summary>
    /// <exception cref="ArgumentException">When the subset size h violates p &lt; h &lt;= n.</exception>
    private (Vector<double> mean, Matrix<double> covariance) ComputeRobustEstimates(Matrix<double> data)
    {
        int n = data.RowCount;
        int p = data.ColumnCount;
        int h = _h > 0 ? _h : Math.Max(p + 1, (n + p + 1) / 2);
        if (h <= p || h > n)
        {
            throw new ArgumentException($"h must satisfy p < h <= n, where h={h}, p={p}, n={n}");
        }

        // Fixed seed keeps the estimate reproducible across runs
        // (analogous to pinning sklearn's random_state).
        var random = new Random(42);

        (Vector<double> mean, Matrix<double> cov)? best = null;
        double bestDet = double.MaxValue;
        for (int i = 0; i < _numRandomStarts; i++)
        {
            var indices = GetRandomIndices(n, h, random);
            var subset = ExtractRows(data, indices);
            var estimate = RunMCD(data, subset, h);
            var covReg = AddRegularization(estimate.cov, _regularization);
            double det = covReg.Determinant();
            if (det > 0 && det < bestDet)
            {
                bestDet = det;
                best = (estimate.mean, covReg);
            }
        }
        if (best.HasValue)
        {
            return best.Value;
        }

        // Fallback: classical mean/covariance. Regularize it too, so the
        // subsequent inversion gets the same protection as the MCD path
        // (the original code skipped this and could hand back a singular matrix).
        var (fallbackMean, fallbackCov) = ComputeMeanCovariance(data);
        return (fallbackMean, AddRegularization(fallbackCov, _regularization));
    }

    /// <summary>
    /// Runs the MCD C-step iteration from one initial subset: repeatedly
    /// re-selects the h samples closest to the current estimate until the
    /// covariance determinant stops changing (or the iteration cap is hit).
    /// </summary>
    private (Vector<double> mean, Matrix<double> cov) RunMCD(Matrix<double> data, Matrix<double> initialSubset, int h)
    {
        var (mean, cov) = ComputeMeanCovariance(initialSubset);
        // Carry the previous determinant forward instead of recomputing
        // cov.Determinant() on every iteration.
        double prevDet = cov.Determinant();
        for (int iter = 0; iter < _maxIterations; iter++)
        {
            var distances = CalculateDistances(data, mean, cov);
            var bestIndices = GetBestIndices(distances, h);
            var newSubset = ExtractRows(data, bestIndices);
            var (newMean, newCov) = ComputeMeanCovariance(newSubset);
            double newDet = newCov.Determinant();
            if (HasConverged(prevDet, newDet))
                return (newMean, newCov);
            mean = newMean;
            cov = newCov;
            prevDet = newDet;
        }
        return (mean, cov); // iteration cap reached
    }

    /// <summary>
    /// Convergence test: the determinant change fell below the threshold,
    /// or the determinant degenerated (NaN or non-positive).
    /// </summary>
    private bool HasConverged(double prevDet, double currentDet)
    {
        return Math.Abs(prevDet - currentDet) < _convergenceThreshold ||
               double.IsNaN(currentDet) || currentDet <= 0;
    }

    /// <summary>Indices of the h samples with the smallest distances.</summary>
    private static int[] GetBestIndices(double[] distances, int h)
    {
        return distances
            .Select((dist, idx) => (dist, idx))
            .OrderBy(x => x.dist)
            .Take(h)
            .Select(x => x.idx)
            .ToArray();
    }

    /// <summary>
    /// Draws h distinct indices from [0, n) via a partial Fisher–Yates
    /// shuffle: O(n) instead of the O(n log n) OrderBy-on-random-keys idiom,
    /// and free of that idiom's key-re-evaluation pitfalls.
    /// </summary>
    private static int[] GetRandomIndices(int n, int h, Random random)
    {
        var pool = Enumerable.Range(0, n).ToArray();
        for (int i = 0; i < h; i++)
        {
            int j = random.Next(i, n);
            (pool[i], pool[j]) = (pool[j], pool[i]);
        }
        return pool[..h];
    }

    /// <summary>
    /// Classical column mean and sample covariance (1/(n-1) normalization).
    /// Degenerate input (n &lt;= 1) yields a zero mean and identity covariance.
    /// </summary>
    private static (Vector<double> mean, Matrix<double> cov) ComputeMeanCovariance(Matrix<double> subset)
    {
        int n = subset.RowCount;
        int p = subset.ColumnCount;
        if (n <= 1)
        {
            return (Vector<double>.Build.Dense(p), DenseMatrix.CreateIdentity(p));
        }
        var mean = Vector<double>.Build.Dense(p, i => subset.Column(i).Average());
        var centered = DenseMatrix.Create(n, p, (i, j) => subset[i, j] - mean[j]);
        var cov = centered.Transpose() * centered / (n - 1);
        return (mean, cov);
    }

    /// <summary>Mahalanobis distance of every row of <paramref name="data"/>.</summary>
    private static double[] CalculateDistances(Matrix<double> data, Vector<double> mean, Matrix<double> cov)
    {
        var invCov = ComputeInverseCovariance(cov); // invert once, reuse for all rows
        return data.EnumerateRows().Select(row => CalculateDistance(row, mean, invCov)).ToArray();
    }

    /// <summary>Mahalanobis distance sqrt((x - mu)^T S^-1 (x - mu)) of one sample.</summary>
    private static double CalculateDistance(Vector<double> point, Vector<double> mean, Matrix<double> invCov)
    {
        var diff = point - mean;
        // Clamp at 0: rounding can push the quadratic form slightly negative.
        return Math.Sqrt(Math.Max(0, diff * invCov * diff));
    }

    /// <summary>
    /// Inverts the covariance matrix, falling back to an SVD pseudo-inverse
    /// when the matrix is (numerically) singular.
    /// </summary>
    private static Matrix<double> ComputeInverseCovariance(Matrix<double> cov)
    {
        Matrix<double> inverse;
        try
        {
            inverse = cov.Inverse();
        }
        catch
        {
            return ComputePseudoInverse(cov);
        }
        // Math.NET's Inverse() can yield non-finite entries for a numerically
        // singular matrix instead of throwing, so the catch alone is not
        // enough — detect NaN/Infinity and fall back to the pseudo-inverse.
        if (inverse.Enumerate().Any(v => double.IsNaN(v) || double.IsInfinity(v)))
        {
            return ComputePseudoInverse(cov);
        }
        return inverse;
    }

    /// <summary>
    /// Moore–Penrose pseudo-inverse via SVD; singular values below 1e-10 are
    /// treated as zero rather than inverted.
    /// </summary>
    private static Matrix<double> ComputePseudoInverse(Matrix<double> cov)
    {
        var svd = cov.Svd(true);
        var s = svd.S;
        var invS = DenseMatrix.Create(s.Count, s.Count,
            (i, j) => i == j && s[i] > 1e-10 ? 1.0 / s[i] : 0.0);
        return svd.VT.Transpose() * invS * svd.U.Transpose();
    }

    /// <summary>Returns a copy of <paramref name="cov"/> with <paramref name="reg"/> added to each diagonal entry.</summary>
    private static Matrix<double> AddRegularization(Matrix<double> cov, double reg)
    {
        var result = cov.Clone();
        for (int i = 0; i < cov.RowCount; i++)
        {
            result[i, i] += reg;
        }
        return result;
    }

    /// <summary>Builds a new matrix from the given rows of <paramref name="data"/>.</summary>
    private static Matrix<double> ExtractRows(Matrix<double> data, int[] indices)
    {
        return DenseMatrix.Create(indices.Length, data.ColumnCount, (i, j) => data[indices[i], j]);
    }
}
}