You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
kev/Drawer/DataValidator/Rule/Data/MCDValidationRule.cs

154 lines
4.2 KiB
C#

using MathNet.Numerics.LinearAlgebra;
using MathNet.Numerics.LinearAlgebra.Double;
using System;
using System.Collections.Generic;
using System.Data;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Validation.Algorithms;
using Validation.Core;
namespace Validation.Rule.Data
{
/// <summary>
/// Table-level validation rule that flags rows whose (x, y, z) values are
/// outliers, based on the distances returned by <see cref="RobustMahalanobis"/>.
/// Rows whose x/y/z cells are missing, non-numeric or non-finite are left out
/// of the analysis and are never reported by this rule.
/// </summary>
public class MCDValidationRule : IValidationRule
{
    /// <summary>
    /// Fewest usable rows for which distances are computed. With fewer rows
    /// the rule reports nothing instead of calling the estimator.
    /// NOTE(review): 3 mirrors the bound of the Debug.Assert this guard
    /// replaces; confirm it is the real minimum required by RobustMahalanobis.
    /// </summary>
    private const int MinimumRowCount = 3;

    /// <summary>
    /// Cut-off on the squared distance. 7.81 is approximately the 95% quantile
    /// of a chi-square distribution with 3 degrees of freedom (one per column).
    /// </summary>
    private const double SquaredDistanceCutoff = 7.81;

    string IValidationRule.RuleName => "MCD";
    string IValidationRule.Description => "检测 x y z 是否有异常值";

    private readonly string xColumn;
    private readonly string yColumn;
    private readonly string zColumn;

    /// <summary>
    /// Creates the rule for the three named columns.
    /// </summary>
    /// <param name="xColumn">Name of the column holding the x values.</param>
    /// <param name="yColumn">Name of the column holding the y values.</param>
    /// <param name="zColumn">Name of the column holding the z values.</param>
    public MCDValidationRule(string xColumn, string yColumn, string zColumn)
    {
        this.xColumn = xColumn;
        this.yColumn = yColumn;
        this.zColumn = zColumn;
    }

    /// <summary>
    /// Validates the whole table and reports one error per outlying row.
    /// </summary>
    /// <param name="dataTable">Table containing the x, y and z columns.</param>
    /// <returns>
    /// A result holding a single error with index -1 when a required column is
    /// missing; an empty result when fewer than <see cref="MinimumRowCount"/>
    /// rows are usable; otherwise one error per outlier, keyed by the 0-based
    /// index of the row in <paramref name="dataTable"/>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="dataTable"/> is null.</exception>
    /// <exception cref="InvalidOperationException">
    /// The estimator returned a number of distances different from the number of rows it was given.
    /// </exception>
    ValidationResult IValidationRule.Validate(DataTable dataTable)
    {
        if (dataTable == null)
        {
            throw new ArgumentNullException(nameof(dataTable));
        }

        if (!ValidateRequiredColumns(dataTable))
        {
            var result = new ValidationResult();
            result.AddError(-1, "x y z 列都不能为空");
            return result;
        }

        // Remember which table rows feed the matrix so that matrix positions
        // can be translated back to table row indices when reporting.
        var rowIndices = FindValidRowIndices(dataTable);
        if (rowIndices.Count < MinimumRowCount)
        {
            // Too few points to estimate anything; this must be a runtime
            // check because Debug.Assert is compiled out of release builds.
            return new ValidationResult();
        }

        var dataMatrix = BuildDataMatrix(dataTable, rowIndices);
        var calculator = new RobustMahalanobis(numRandomStarts: 100);
        double[] distances = calculator.Calculate(dataMatrix);
        if (distances.Length != rowIndices.Count)
        {
            throw new InvalidOperationException(
                "RobustMahalanobis returned a distance count that does not match the number of input rows.");
        }

        var outliers = IdentifyOutliers(distances);
        return BuildValidationResult(outliers, rowIndices);
    }

    /// <summary>
    /// Checks that all three required columns exist in the table.
    /// </summary>
    private bool ValidateRequiredColumns(DataTable dataTable)
    {
        return dataTable.Columns.Contains(xColumn)
            && dataTable.Columns.Contains(yColumn)
            && dataTable.Columns.Contains(zColumn);
    }

    /// <summary>
    /// Returns true when the x, y and z cells of the row are all present,
    /// parse as numbers and are finite.
    /// </summary>
    bool IsValidRow(DataRow row)
    {
        var columns = new[] { xColumn, yColumn, zColumn };
        foreach (var col in columns)
        {
            var value = row[col];
            if (value == DBNull.Value)
            {
                return false;
            }
            if (!double.TryParse(value.ToString(), out double number))
            {
                return false;
            }
            // TryParse accepts the NaN/Infinity tokens; such a value would
            // propagate through every distance computed from the matrix.
            if (double.IsNaN(number) || double.IsInfinity(number))
            {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Collects the 0-based table indices of the rows accepted by <see cref="IsValidRow"/>.
    /// </summary>
    private List<int> FindValidRowIndices(DataTable dataTable)
    {
        var indices = new List<int>();
        for (int i = 0; i < dataTable.Rows.Count; i++)
        {
            if (IsValidRow(dataTable.Rows[i]))
            {
                indices.Add(i);
            }
        }
        return indices;
    }

    /// <summary>
    /// Builds the data matrix: one matrix row per entry of
    /// <paramref name="rowIndices"/>, with columns ordered x, y, z.
    /// </summary>
    /// <param name="dataTable">Source table.</param>
    /// <param name="rowIndices">Indices of the table rows to copy, in matrix row order.</param>
    private Matrix<double> BuildDataMatrix(DataTable dataTable, IReadOnlyList<int> rowIndices)
    {
        var columns = new[] { xColumn, yColumn, zColumn };
        var matrix = DenseMatrix.Create(rowIndices.Count, columns.Length, 0.0);
        for (int i = 0; i < rowIndices.Count; i++)
        {
            var row = dataTable.Rows[rowIndices[i]];
            for (int j = 0; j < columns.Length; j++)
            {
                matrix[i, j] = Convert.ToDouble(row[columns[j]]);
            }
        }
        return matrix;
    }

    /// <summary>
    /// Returns the positions within <paramref name="distances"/> whose value
    /// exceeds the square root of <see cref="SquaredDistanceCutoff"/>.
    /// </summary>
    /// <remarks>
    /// NOTE(review): taking the square root of the cut-off assumes the values
    /// are plain (not squared) distances - confirm against RobustMahalanobis.
    /// </remarks>
    private static int[] IdentifyOutliers(double[] distances)
    {
        if (distances.Length == 0)
        {
            return Array.Empty<int>();
        }
        double threshold = Math.Sqrt(SquaredDistanceCutoff);
        var outliers = distances
            .Select((d, idx) => new { d, idx })
            .Where(x => x.d > threshold)
            .Select(x => x.idx)
            .ToArray();
        return outliers;
    }

    /// <summary>
    /// Turns outlier positions in the matrix into errors keyed by table row.
    /// </summary>
    /// <param name="outlierIndices">Positions of the outliers in the data matrix.</param>
    /// <param name="rowIndices">Table row index for each matrix row.</param>
    private static ValidationResult BuildValidationResult(int[] outlierIndices, IReadOnlyList<int> rowIndices)
    {
        var result = new ValidationResult();
        foreach (int matrixRow in outlierIndices)
        {
            // Skipped rows shift matrix positions, so report the table row,
            // and use that same number in the message.
            int rowIndex = rowIndices[matrixRow];
            result.AddError(rowIndex, $"行 {rowIndex} 为异常值");
        }
        return result;
    }

    /// <summary>
    /// Row-by-row validation entry point; not implemented for this rule.
    /// </summary>
    /// <exception cref="NotImplementedException">Always thrown.</exception>
    public ValidationResult Validate(string[] headers, int rowIndex, object[] values)
    {
        throw new NotImplementedException();
    }
}
}