2020年新年大赛！

Question

GralL

Asked: 2020-03-17 03:02:39 +0000 UTC

检测并排除数据框中的异常值 [关闭]

772

你能告诉我如何去除数据中的异常值吗？

1 个回答

Voted

maestro · Answer 1 · 2020-03-17T16:39:12Z

有几种方法可以搜索数据中的异常值。如果样本服从正态分布，下述所有方法都适用。在所有示例中，函数返回三项内容：异常值被替换为 NaN 的原始数组、原始样本与结果样本的统计特征，以及异常值的位置编号。在所有示例中，alpha 是显著性水平。

最简单的方法是基于计算平均值和标准差。在缩放样本以使其平均值为零且偏差为一之后，您可以选择那些与零相差正负 alpha 的元素。他们将是异常值。

def Maxstd(df, alpha=0.97):
    """Flag outlier rows by repeatedly testing the most extreme standardized row.

    On every pass the working copy is standardized column-wise (zero mean,
    unit standard deviation) and the row with the largest Euclidean norm is
    taken as the candidate.  If the smallest per-column standard deviation
    of the standardized data *without* that row is still at least ``alpha``,
    the candidate is accepted as a regular observation and the search stops.
    Otherwise the row is recorded as an outlier and replaced by the column
    means in the working copy before the next pass.

    Args:
        df: Numeric ``pandas.DataFrame`` with one observation per row.
        alpha: Threshold on the smallest per-column standard deviation that
            remains after dropping the candidate row.

    Returns:
        A tuple ``(cleaned, summary, positions)``:

        * ``cleaned`` - copy of ``df`` with every flagged row set to NaN;
        * ``summary`` - mean / standard deviation / minimum / maximum of each
          column before and after the cleaning, one statistic per row;
        * ``positions`` - integer array with the 0-based row positions of the
          flagged rows, in detection order and without duplicates.
    """
    X = df
    n_rows = X.shape[0]
    # Work on floats so that writing column means into a row never has to
    # change a column's dtype.
    work = X.astype(float)
    flagged = []

    # One pass per row is an upper bound: without it a tiny or degenerate
    # sample can keep shrinking its spread and never reach the stop rule.
    for _ in range(n_rows):
        col_mean = work.mean()
        col_std = work.std()
        scaled = (work - col_mean) / col_std
        # Distance of every standardized row to the origin is its row norm.
        dist = np.linalg.norm(scaled.to_numpy(), axis=1)
        if not np.isfinite(dist).all():
            # Zero spread in some column, a single row, or missing values:
            # the standardization is undefined, so nothing more can be judged.
            break
        pos = int(np.argmax(dist))
        scaled.iloc[pos] = np.nan
        if scaled.std().min() >= alpha:
            break
        if pos not in flagged:
            flagged.append(pos)
        work.iloc[pos] = col_mean.to_numpy()

    positions = np.array(flagged, dtype=int)
    is_outlier = np.zeros(n_rows, dtype=bool)
    is_outlier[positions] = True
    # mask() blanks whole rows by position, independent of the index labels.
    ResArray = X.mask(np.tile(is_outlier[:, None], (1, X.shape[1])))

    labels = ['Mean (before)', 'Mean (after)', 'Standard deviation (before)', 'Standard deviation (after)', 'Minimum (before)',
        'Minimum(after)', 'Maximum (before)', 'Maximum(after)']
    stats = [X.mean(), ResArray.mean(), X.std(), ResArray.std(),
             X.min(), ResArray.min(), X.max(), ResArray.max()]
    Result = pd.concat(stats, axis=1, keys=labels).T
    return ResArray, Result, positions

另一种方法是基于计算样本均值和样本中每个向量之间的度量。该示例使用 Mahalanobis 度量，但您可以选择适合您的样本的度量。

def Mahalanobis(df, alpha=0.9):
    """Flag rows whose Mahalanobis distance to the sample mean is near the maximum.

    The distance of every row to the column means is computed with the
    inverse of the sample covariance matrix.  A row is flagged when its
    distance exceeds ``alpha`` times the largest distance in the sample, so
    the row with the largest (non-zero) distance is always flagged.

    Args:
        df: Numeric ``pandas.DataFrame`` with one observation per row.
        alpha: Fraction of the largest distance used as the cut-off.

    Returns:
        A tuple ``(cleaned, summary, positions)``:

        * ``cleaned`` - copy of ``df`` with every flagged row set to NaN;
        * ``summary`` - mean / standard deviation / minimum / maximum of each
          column before and after the cleaning, one statistic per row;
        * ``positions`` - integer array with the 0-based row positions of the
          flagged rows.

    Raises:
        numpy.linalg.LinAlgError: If the covariance matrix is singular.
    """
    X = df
    values = X.to_numpy(dtype=float)
    centered = values - values.mean(axis=0)
    # atleast_2d keeps the single-column case a 1x1 matrix.
    cov = np.atleast_2d(np.cov(values, rowvar=False))
    inv_cov = np.linalg.inv(cov)
    # Quadratic form (x - mean)^T * inv_cov * (x - mean), one value per row.
    squared = np.einsum('ij,jk,ik->i', centered, inv_cov, centered)
    # Clip tiny negative rounding errors before the square root.
    dist = np.sqrt(np.maximum(squared, 0.0))

    # The cut-off is relative to the largest distance, so a constant rescaling
    # of the covariance estimate would not change which rows are flagged.
    upper_bound = dist.max() * alpha
    is_outlier = dist > upper_bound
    positions = np.flatnonzero(is_outlier)
    # mask() blanks whole rows by position, independent of the index labels.
    XX = X.mask(np.tile(is_outlier[:, None], (1, X.shape[1])))

    labels = ['Mean (before)', 'Mean (after)', 'Standard deviation (before)', 'Standard deviation (after)', 'Minimum (before)',
        'Minimum(after)', 'Maximum (before)', 'Maximum(after)']
    stats = [X.mean(), XX.mean(), X.std(), XX.std(),
             X.min(), XX.min(), X.max(), XX.max()]
    Result = pd.concat(stats, axis=1, keys=labels).T
    return XX, Result, positions

在重采样半均值（RHM）方法中，原始样本中所有向量的一半被随机选择几次。对于选定的向量，计算平均值和标准偏差。整个原始样本根据计算的平均值和标准偏差进行缩放。对于每个样本向量，计算向量与零之间的距离。列出所有计算的距离以供进一步处理。

def RHM(df, alpha=0.9):
    """Flag outlier rows with the resampling-by-half-means (RHM) scheme.

    Half of the rows are drawn at random (without replacement) ``2 * n_rows``
    times.  Each draw provides column means and standard deviations that are
    used to standardize the *whole* sample; the Euclidean norm of every
    standardized row is stored.  A row is flagged when, in at least one draw,
    its norm exceeds ``alpha`` times the largest norm observed over all draws.

    The draws use the ``random`` module, so ``random.seed`` makes the result
    reproducible.

    Args:
        df: Numeric ``pandas.DataFrame`` with one observation per row.
        alpha: Fraction of the largest observed norm used as the cut-off.

    Returns:
        A tuple ``(cleaned, summary, positions)``:

        * ``cleaned`` - copy of ``df`` with every flagged row set to NaN;
        * ``summary`` - mean / standard deviation / minimum / maximum of each
          column before and after the cleaning, one statistic per row;
        * ``positions`` - integer array with the 0-based row positions of the
          flagged rows.
    """
    X = df
    n_rows = X.shape[0]
    half = n_rows // 2  # integer sample size; "/" would yield a float

    per_draw = []
    for _ in range(n_rows * 2):
        # Random sampling of row positions
        rows = random.sample(range(n_rows), half)
        subset = X.iloc[rows]

        # Scale the full data frame with the half-sample statistics
        scaled = (X - subset.mean()) / subset.std()

        # Length of every scaled row vector (distance to the origin)
        per_draw.append(np.linalg.norm(scaled.to_numpy(dtype=float), axis=1))

    if per_draw:
        dist = np.column_stack(per_draw)
        # A NaN maximum (undefined scaling) makes every comparison False,
        # i.e. nothing is flagged.
        upperBound = dist.max() * alpha
        is_outlier = (dist > upperBound).any(axis=1)
    else:
        # Empty input: no draws, nothing to flag.
        is_outlier = np.zeros(n_rows, dtype=bool)

    positions = np.flatnonzero(is_outlier)
    # mask() blanks whole rows by position, independent of the index labels.
    XX = X.mask(np.tile(is_outlier[:, None], (1, X.shape[1])))

    labels = ['Mean (before)', 'Mean (after)', 'Standard deviation (before)', 'Standard deviation (after)', 'Minimum (before)',
        'Minimum(after)', 'Maximum (before)', 'Maximum(after)']
    stats = [X.mean(), XX.mean(), X.std(), XX.std(),
             X.min(), XX.min(), X.max(), XX.max()]
    Result = pd.concat(stats, axis=1, keys=labels).T
    return XX, Result, positions

如果我们讨论的不是正态分布的样本，而是实时变化的数据，那么任务就会变得更加复杂。在这种情况下，可以推荐使用支持向量机，其特征是第 i 个与第 (i+1) 个元素之间的差值、第 i 个与第 (i+2) 个元素之间的差值，依此类推。我编写此代码是为了确定样本中是否存在噪声，但您可以对其进行修改以满足您的需要。

def find_minmax(column, max_lag=5):
    """Return the largest absolute difference between elements ``lag`` apart.

    For every lag from 1 to ``max_lag`` the absolute differences
    ``|column[i] - column[i + lag]|`` are formed over all valid ``i`` and
    their maximum is kept.

    Args:
        column: One-dimensional sequence of numbers, or an ``(n, 1)`` array;
            it is flattened before use.
        max_lag: Largest element distance to evaluate (default 5).

    Returns:
        List with ``max_lag`` numpy scalars; entry ``k`` belongs to lag
        ``k + 1``.

    Raises:
        ValueError: If ``column`` has no more than ``max_lag`` elements, since
            the largest lag would then have no pair to compare.
    """
    flat = np.asarray(column).ravel()
    if flat.size <= max_lag:
        raise ValueError(
            "column needs more than %d elements, got %d" % (max_lag, flat.size)
        )
    # Shifted slices give all pairs for one lag at once instead of growing an
    # array element by element.
    return [
        np.amax(np.absolute(flat[:-lag] - flat[lag:]))
        for lag in range(1, max_lag + 1)
    ]

def _lag_features(frame, n_columns=16):
    """Build one find_minmax feature row for each of the first n_columns columns."""
    # iloc[:, i:i + 1] keeps every column as an (n, 1) block, the same shape
    # the removed DataFrame.as_matrix(columns=...) call used to produce.
    return np.array(
        [find_minmax(frame.iloc[:, i:i + 1].to_numpy()) for i in range(n_columns)],
        dtype=float,
    )


# Training data: first sheet of the workbook, without the columns 'A' and 'B'.
xls = pd.ExcelFile('noise_sensor.xlsx')
df = pd.read_excel(io=xls, sheet_name=0, header=0)
df = df.drop(['A', 'B'], axis=1)

X_train = _lag_features(df)
print(X_train)

# One label per training column, in column order.
Y_train = np.array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

clf = SVC()
clf.fit(X_train, Y_train)

# Test data: sheet at position 2 of the same workbook, prepared the same way.
test_df = pd.read_excel(io=xls, sheet_name=2, header=0)
test_df = test_df.drop(['A', 'B'], axis=1)

X_test = _lag_features(test_df)

print(clf.predict(X_test))

检测并排除数据框中的异常值 [关闭]

根据浏览器窗口的大小调整背景图案的大小

理解for循环的执行逻辑

复制动态数组时出错（C++）

Or and If,elif,else 构造[重复]

如何构建支持 x64 的 APK

如何使按钮的输入宽度？

如何显示对象变量的名称？

如何循环一个函数？

LOWORD 宏有什么作用？

从字符串的开头删除直到并包括一个字符

检测并排除数据框中的异常值 [关闭]

1 个回答

相关问题