## Die don’t remember the last roll. Really. [General Sta­tis­tics]

Hi ElMaestro,

» what is statistical independence in layman's terms??

See the subject line.

» For example: [etc. etc.]

Fire up R and execute the script at the end.

There are samples with identical means but different variances and others with (almost) identical variances but different means. However, acc. to the central limit theorem (CLT) pooled parameters will sooner or later approach the population’s ones.

» […] any perturbation on the data that changes Z will (may) also change s […]

Not sure what you mean by “perturbation on the data”. Sample is a sample is a sample.
Or are you thinking about something happening after sampling (e.g., transcription errors, manipulations, …)? If you have a single sample and the population’s parameters are – as usual – unknown, you enter the slippery ground of outliers – which is a mixture of assessing (deviations from) assumptions, knowledge of the data generating process (incl. checking for implausible values), .

» Muchas gracias.

PS: Comparing variances is a daring feat. In the example the ratio of largest/smallest sample mean is 1.18, whereas variances varied [pun] almost fourfold. This explains why outlier tests are not powerful and should be taken with a grain of salt.

set.seed(123456)
# Population
N        <- 1e5
mu       <- 100
s        <- 20
# Sampling
n        <- 20 # Sample size
samples  <- 30 # Number of samples
# Simulate population
x        <- rnorm(N, mean = mu, sd = s)
summary(x) # Show what we have
mean0    <- mean(x)
sd0      <- sd(x)
# Draw samples
xs       <- data.frame(sample = rep(1:samples, each = n),
xs = sample(x, samples*n))
# Sample estimates
mean.spl <- aggregate(xs[, 2], list(sample = xs$sample), mean) var.spl <- aggregate(xs[, 2], list(sample = xs$sample), var)
sd.spl   <- aggregate(xs[, 2], list(sample = xs$sample), sd) spl <- data.frame(sample = 1:samples, mean = mean.spl$x, var = var.spl$x) # Pool estimates mean.pld <- mean(mean.spl$x)
var.pld  <- sum((n-1)*var.spl$x)/(n*samples-samples) sd.pld <- sqrt(var.pld) idx.m <- sort(mean.spl$x, index = TRUE)$ix # index of sorted sample means idx.v <- sort(var.spl$x, index = TRUE)$ix # index of sorted sample variances # 3 plots with recording col <- rainbow(samples, start = 0.7, end = 0.1) h.pop <- hist(x, breaks = "FD", plot = FALSE) xlim <- range(h.pop$mids)
ylim     <- range(h.pop$density)*1.5 windows(record = TRUE) op <- par(no.readonly = TRUE) par(mar = c(4, 4, 2, 0) + 0.1, ask = TRUE) # 1: Population plot(h.pop, freq = FALSE, xlim = xlim, ylim = ylim, col = "bisque", main = "Population", cex.main = 1, font.main = 1, xlab = paste("N = ", N), cex.lab = 0.9, las = 1, border = FALSE) abline(v = mean0, lty = 2, col = "blue"); box() curve(dnorm(x, mean = mean0, sd = sd0), n = 501, col = "blue", lwd = 2, add = TRUE) lines(mean0+c(-1, +1)*sd0, rep(dnorm(x = mean0-sd0, mean = mean0, sd = sd0), 2), lty = 3, col = "blue") par(family = "mono") legend("topright", bty = "n", col = "blue", lwd = 2, legend = sprintf("%5.1f | %5.1f (%3.1f%%)", mean0, sd0^2, 100*sd0/mean0), cex = 0.85, title = "mean | var (CV)") # 2: Samples par(family = "sans") plot(xlim, ylim, type = "n", xlim = xlim, ylim = ylim, main = "Samples drawn from population (sorted by mean)", cex.main = 1, font.main = 1, xlab = paste0(samples, " samples (each n = ", n, ")"), ylab = "Density", cex.lab = 0.9, las = 1, frame.plot = TRUE) par(family = "mono") legend("topleft", box.lty = 0, lwd = 1, bg = "white", col = col[idx.m], cex = 0.75, legend = sprintf("%2i", mean.spl$sample[idx.m]), title = "sample")
legend("topright", box.lty = 0, lwd = NA, bg = "white", col = col[idx.m], cex = 0.75,
legend = sprintf("%5.1f | %5.1f (%3.1f%%)",
mean.spl$x[idx.m], var.spl$x[idx.m],
100*sqrt(var.spl$x[idx.m])/mean.spl$x[idx.m]),
title = "mean | var (CV)"); box()
for (j in seq_along(idx.m)) {
curve(dnorm(x, mean = mean.spl[j, 2], sd = sd.spl[j, 2]), n = 501,
col = col[j], add = TRUE)
lines(mean.spl[j, 2]+c(-1, +1)*sd.spl[j, 2],
rep(dnorm(x = mean.spl[j, 2]-sd.spl[j, 2],
mean = mean.spl[j, 2], sd = sd.spl[j, 2]), 2),
lty = 3, col = col[j])
}
# 3: Population and pooled samples
par(family = "sans")
plot(xlim, ylim, type = "n", xlim = xlim, ylim = ylim,
main = "Population and pooled samples", cex.main = 1, font.main = 1,
xlab = paste0("Estimated from ", samples, " pooled samples (each n  = ",
n, ")"), ylab = "Density", cex.lab = 0.9, las = 1)
abline(v = mean0, lty = 2, col = "blue")
abline(v = mean.pld, lty = 2, col = col[floor(samples/2)]); box()
curve(dnorm(x, mean = mean0, sd = sd0), n = 501, lwd = 2,
col = "blue", add = TRUE)
lines(mean0+c(-1, +1)*sd0, rep(dnorm(x = mean0-sd0, mean = mean0, sd = sd0), 2),
lty = 3, col = "blue")
curve(dnorm(x, mean = mean.pld, sd = sqrt(var.pld)), n = 501, lwd = 2,
col = col[floor(samples/2)], add = TRUE)
lines(mean.pld+c(-1, +1)*sqrt(var.pld),
rep(dnorm(x = mean.pld-sqrt(var.pld), mean = mean.pld, sd = sqrt(var.pld)), 2),
lty = 3, col = col[floor(samples/2)])
par(family = "mono")
legend("topright", box.lty = 0, lwd = 2, col = c("blue", col[floor(samples/2)]),
legend = c(sprintf("Population    : %5.1f | %5.1f (%3.1f%%)",
mean0, sd0^2, 100*sd0/mean0),
sprintf("Pooled samples: %5.1f | %5.1f (%3.1f%%)",
mean.pld, var.pld, 100*sqrt(var.pld)/mean.pld)),
cex = 0.85, title = "mean | var (CV)", bg = "white"); box()
# 4: Is there a relationship between samples' means and variances?
par(family = "sans")
plot(spl$mean, spl$var, type = "n", las = 1,
main = paste0("Sample estimates (correlation = ",
signif(cor(spl$mean, spl$var), 5), ")"),
cex.main = 1, font.main = 1, xlab = "mean", ylab = "variance")
abline(h = var.pld, lty = 3, col = "lightgrey")
abline(v = mean.pld, lty = 3, col = "lightgrey")
abline(lsfit(spl$mean, spl$var), lty = 2, col = "darkgrey"); box()
points(mean.spl$x[idx.m], var.spl$x[idx.m], pch = 16, col = col[idx.m], cex = 1.35)
for (j in 1:nrow(spl)) {
ifelse (spl$var[j] >= var.pld, loc <- 1, loc <- 3) text(spl$mean[j], spl$var[j], labels = spl$sample[j], cex = 0.6, pos = loc)
}
par(op)
# Sample estimates ordered by mean
print(spl[idx.m, ], row.names = FALSE)
# Sample estimates ordered by variance
print(spl[idx.v, ], row.names = FALSE)
# Ratio of extreme sample means
spl[idx.m, "mean"][samples]/spl[idx.m, "mean"][1]
# Ratio of extreme sample variances
spl[idx.v, "var"][samples]/spl[idx.v, "var"][1]
# %RE of sample means and variances
summary(100*(mean.spl$x-mu)/mu); summary(100*(sd.spl$x^2-s^2)/s^2)
# Correlation of sample means and variances
cor(spl$mean, spl$var)

Dif-tor heh smusma 🖖
Helmut Schütz

The quality of responses received is directly proportional to the quality of the question asked. 🚮
Science Quotes