The mean and standard deviation of a sample of data can be thrown off if the sample contains one or more outliers.
For this reason, it is usually a good idea to check for and remove outliers before computing the mean or the standard deviation of a sample. To this end, your function will receive a list of numbers representing a sample of data. Your function must remove any outliers and return the mean of the sample, rounded to two decimal places (round only at the end).
Since there is no objective definition of “outlier” in statistics, your function will also receive a cutoff, in standard deviation units. So, for example, if the cutoff is 3, then any value that is more than 3 standard deviations above or below the mean must be removed. Notice that, once outlying values are removed in a first “sweep”, other less extreme values may then “become” outliers, which you’ll have to remove as well!
sample = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100]
cutoff = 3
clean_mean(sample, cutoff) → 5.5
Note : since we are not computing the sample standard deviation for inferential purposes, the denominator is n, not n - 1.
using Statistics, Test
"""
    get_outlier(sample, cutoff)

Return the elements of `sample` that lie more than `cutoff` standard deviations
away from the sample mean. This is a single sweep: the mean and standard
deviation are computed once, from the full `sample`.

The standard deviation uses the population form (denominator `n`, not `n - 1`).
An empty `sample` yields an empty result.
"""
function get_outlier(sample::Vector{Int64}, cutoff::Real)::Vector{Int64}
    # The statistics below are undefined for an empty sample; nothing can be an outlier.
    isempty(sample) && return Int64[]
    x_bar = mean(sample)
    n = length(sample)
    # Population standard deviation: divide by n, not n - 1.
    sd = sqrt(sum(x -> abs2(x - x_bar), sample) / n)
    is_outlier(x::Real)::Bool = abs(x - x_bar) > cutoff * sd
    return filter(is_outlier, sample)
end
#> get_outlier (generic function with 1 method)
# Single sweep with cutoff 3: only 10000 is flagged; 100 is not flagged at this stage.
@test get_outlier([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 10000], 3) == [10000]
#> Test Passed
#> Expression: get_outlier([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 10000], 3) == [10000]
#> Evaluated: [10000] == [10000]
"""
Compute the mean after removing all outliers
"""
#> "Compute the mean after removing all outliers\n"
# clean_mean(sample, cutoff)
#
# Return the mean of `sample` after repeatedly removing outliers, rounded to two
# decimal places. A value is an outlier when `get_outlier` flags it for the given
# `cutoff`. Sweeps are repeated until one finds no outliers.
function clean_mean(sample::Vector{Int64}, cutoff::Real)::Float64
    while true
        outliers = get_outlier(sample, cutoff)
        # Round only once, at the very end, as the task statement requires.
        isempty(outliers) && return round(mean(sample); digits=2)
        # Drop every flagged value and sweep again: removing them changes the mean
        # and standard deviation, so less extreme values may now be outliers.
        # `filter` returns a new vector, so the caller's vector is not mutated.
        sample = filter(∉(outliers), sample)
    end
end
#> clean_mean (generic function with 1 method)
# Example: with cutoff 3 the outliers are removed and the result is the mean of 1..10.
clean_mean([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 10000], 3)
#> 5.5
## Mean without outliers
library(tidyverse)
#' @title Test whether values lie within the outlier cutoff
#' @param x numeric value(s) to check
#' @param mean mean of the sample
#' @param std standard deviation of the sample
#' @param cutoff maximum allowed deviation from the mean, in units of `std`
#' @return logical; TRUE where `x` is within `cutoff * std` of `mean`
not_outlier <- function(x, mean, std, cutoff) {
  abs(x - mean) <= cutoff * std
}
#' @title Calculate the mean of a sample without outliers
#' @description Repeatedly removes outliers until a sweep removes nothing, then
#'   returns the mean of the remaining values rounded to two decimal places.
#' @param sample a numeric vector holding the sample
#' @param cutoff threshold used to decide whether a sample point is an outlier:
#'   a point whose absolute deviation from the mean is greater than `cutoff`
#'   times the standard deviation is treated as an outlier
#' @return the mean of the cleaned sample, rounded to two decimal places
clean_mean <- function(sample, cutoff) {
  # Infinite loop; return() exits once a sweep removes nothing.
  while (TRUE) {
    m <- mean(sample)
    n <- length(sample)
    # Population standard deviation (denominator n, not n - 1).
    # Named `std` so that stats::sd is not shadowed.
    std <- sqrt(sum((sample - m)^2) / n)
    sample_without_outlier <- sample[not_outlier(sample, m, std, cutoff)]
    if (length(sample_without_outlier) == n) {
      # Round only once, at the very end.
      return(round(m, 2))
    }
    # Removing values changes the mean and standard deviation, so re-check.
    sample <- sample_without_outlier
  }
}
library(testthat)
# Example from the task statement: 100 is removed, leaving 1..10 with mean 5.5.
test_that("Example Tests", {
  sam <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100)
  cutoff <- 3
  expect_equal(clean_mean(sam, cutoff), 5.5)
})
#> Test passed 🌈
const Statistics = require("../src/JavaScript/toolkit/Statistics")
const sum = Statistics.sum;
const mean = Statistics.mean;
/**
 * Tell whether a value lies within the allowed distance from the mean.
 *
 * @param {number} x - value to check
 * @param {number} mean - mean of the sample
 * @param {number} std - standard deviation of the sample
 * @param {number} cutoff - allowed deviation, in units of `std`
 * @returns {boolean} true when |x - mean| is at most cutoff * std
 */
function notOutlier(x, mean, std, cutoff) {
  const deviation = Math.abs(x - mean);
  const limit = cutoff * std;
  return deviation <= limit;
}
/**
 * Compute the mean of a sample after repeatedly removing outliers, rounded to
 * two decimal places.
 *
 * A value is kept when `notOutlier` accepts it for the current mean and
 * standard deviation. Sweeps repeat until one removes nothing.
 *
 * NOTE(review): `mean` and `sum` come from the project's Statistics toolkit and
 * are assumed to return the arithmetic mean and the sum of a numeric array.
 *
 * @param {number[]} sample - the sample values; the caller's array is not mutated
 * @param {number} cutoff - outlier threshold, in standard-deviation units
 * @returns {number} mean of the cleaned sample, rounded to two decimals
 */
function cleanMean(sample, cutoff) {
  while (true) {
    const m = mean(sample);
    const n = sample.length;
    // Population standard deviation (denominator n, not n - 1).
    const sd = Math.sqrt(sum(sample.map(x => Math.pow(x - m, 2))) / n);
    const sampleWithoutOutlier = sample.filter(x => notOutlier(x, m, sd, cutoff));
    if (sampleWithoutOutlier.length === n) {
      // Round only once, at the very end.
      return Math.round(m * 100) / 100;
    }
    // Removing values changes the mean and standard deviation, so re-check.
    sample = sampleWithoutOutlier;
  }
}
// Example from the task statement; prints 5.5.
console.log(cleanMean([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100], 3));
#> 5.5