Mean-without-outliers

Description

The mean and standard deviation of a sample of data can be thrown off if the sample contains one or more outliers.

For this reason, it is usually a good idea to check for and remove outliers before computing the mean or the standard deviation of a sample. To this end, your function will receive a list of numbers representing a sample of data. Your function must remove any outliers and return the mean of the remaining values, rounded to two decimal places (round only at the very end, never during intermediate computations).

Since there is no objective definition of “outlier” in statistics, your function will also receive a cutoff, expressed in standard deviation units. For example, if the cutoff is 3, then any value that is more than 3 standard deviations above or below the mean must be removed. Note that once outlying values are removed in a first “sweep”, the mean and standard deviation of the remaining values change, so other, less extreme values may then “become” outliers, which you’ll have to remove as well: keep sweeping until no outliers remain!

Example

sample = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100]
cutoff = 3
clean_mean(sample, cutoff) → 5.5

Note : since we are not computing the sample standard deviation for inferential purposes, the denominator is n, not n - 1.

Solutions

Julia

using Statistics, Test


"""
    get_outlier(sample::AbstractVector{<:Real}, cutoff::Real)

Return the elements of `sample` that lie more than `cutoff` standard deviations away
from the mean of `sample`, in their original order.

The standard deviation is the population one (denominator `n`, not `n - 1`).
An empty `sample` has no outliers, so an empty vector is returned.

# Arguments
- `sample`: the observations to inspect; it is not modified.
- `cutoff`: the threshold, in standard-deviation units.

# Examples
```julia
get_outlier([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 10000], 3)  # == [10000]
```
"""
function get_outlier(sample::AbstractVector{<:Real}, cutoff::Real)
    # Guard so the reductions below never run on an empty collection.
    isempty(sample) && return collect(sample)

    center = mean(sample)
    # Population standard deviation: mean of the squared deviations, then the root.
    spread = sqrt(sum(abs2(x - center) for x in sample) / length(sample))

    # Strict inequality: a value exactly `cutoff` deviations away is NOT an outlier.
    is_outlier(x::Real)::Bool = abs(x - center) > cutoff * spread

    return filter(is_outlier, sample)
end

#> get_outlier (generic function with 1 method)


@test get_outlier([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 10000], 3) == [10000]

#> Test Passed
#>   Expression: get_outlier([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 10000], 3) == [10000]
#>    Evaluated: [10000] == [10000]



"""
去掉所有离群点后，求均值
"""

#> "去掉所有离群点后，求均值\n"

"""
    clean_mean(sample::Vector{Int64}, cutoff::Real) -> Float64

Return the mean of `sample` once all outliers have been removed, rounded to two
decimal places.

Outliers are the values reported by `get_outlier(remaining, cutoff)`. Removing them
changes the mean and the standard deviation, so the sweep is repeated on the remaining
values until `get_outlier` reports nothing. The input vector is not modified.

# Examples
```julia
clean_mean([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100], 3)  # == 5.5
```
"""
function clean_mean(sample::Vector{Int64}, cutoff::Real)::Float64
    remaining = sample
    while true
        outliers = get_outlier(remaining, cutoff)
        # Round only here, on the final mean, never on intermediate values.
        isempty(outliers) && return round(mean(remaining); digits=2)
        # `filter` allocates a new vector, so the caller's `sample` stays untouched.
        remaining = filter(∉(outliers), remaining)
    end
end

#> clean_mean (generic function with 1 method)


clean_mean([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 10000], 3)

#> 5.5

R

## Mean without outliers

library(tidyverse)


# Tell whether `x` lies within `cutoff` standard deviations (`std`) of `mean`.
# Works element-wise, so `x` may be a single number or a numeric vector.
# The comparison is inclusive: a value exactly on the threshold is kept.
not_outlier <- function(x, mean, std, cutoff) {
    deviation <- abs(x - mean)
    threshold <- cutoff * std
    return(deviation <= threshold)
}


#' @title Calculate the mean of a sample without outliers
#' @description Repeatedly drops every value that `not_outlier()` rejects,
#'    recomputing the mean and standard deviation after each sweep, until a
#'    sweep removes nothing. The mean of the remaining values is returned.
#' @param sample a numeric vector holding the sample
#' @param cutoff threshold used to decide whether a value is an outlier:
#'    a value is an outlier when its absolute deviation from the mean is
#'    greater than `cutoff` times the standard deviation
#' @return the mean of the cleaned sample, rounded to two decimal places
clean_mean <- function(sample, cutoff) {
    # Loop until a sweep removes nothing; return() is the only exit.
    repeat {
        m <- mean(sample)
        n <- length(sample)
        # Population standard deviation (denominator n, not n - 1).
        # Named `std` so that it does not shadow stats::sd().
        std <- sqrt((1 / n) * sum((sample - m)^2))
        # not_outlier() is vectorised, so plain logical subsetting is enough.
        kept <- sample[not_outlier(sample, m, std, cutoff)]

        if (length(kept) == n) {
            # Round only once, on the final result.
            return(round(m, 2))
        }
        sample <- kept
    }
}


library(testthat)
test_that("Example Tests", {
    sam <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100)
    cutoff <- 3
    expect_equal(clean_mean(sam, cutoff), 5.5)
})

#> Test passed 🌈

JavaScript

const Statistics = require("../src/JavaScript/toolkit/Statistics")
const sum = Statistics.sum;
const mean = Statistics.mean;


/**
 * Tell whether a value lies within `cutoff` standard deviations of the mean.
 * The comparison is inclusive: a value exactly on the threshold is kept.
 *
 * @param {number} x - value to classify
 * @param {number} mean - mean of the sample
 * @param {number} std - standard deviation of the sample
 * @param {number} cutoff - threshold, in standard-deviation units
 * @returns {boolean} true when `x` is not an outlier
 */
function notOutlier(x, mean, std, cutoff) {
  const deviation = Math.abs(x - mean);
  const limit = cutoff * std;
  return deviation <= limit;
}

/**
 * Compute the mean of a sample after iteratively removing outliers.
 *
 * Each sweep drops every value that `notOutlier` rejects for the current mean
 * and standard deviation. Removing values changes both statistics, so sweeps
 * repeat until one removes nothing. The caller's array is not modified.
 *
 * @param {number[]} sample - the observations
 * @param {number} cutoff - threshold, in standard-deviation units
 * @returns {number} mean of the cleaned sample, rounded to two decimal places
 */
function cleanMean(sample, cutoff) {
  let remaining = sample;
  while (true) {
    const m = mean(remaining);
    const n = remaining.length;
    // Population standard deviation (denominator n, not n - 1).
    // Declared locally: the original assigned an undeclared `sd`, leaking a global.
    const sd = Math.sqrt(sum(remaining.map(x => Math.pow(x - m, 2))) / n);
    const kept = remaining.filter(x => notOutlier(x, m, sd, cutoff));

    if (kept.length === n) {
      // Round only once, on the final result.
      return Math.round(m * 100) / 100;
    }
    remaining = kept;
  }
}


console.log(cleanMean([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100], 3));

#> 5.5