Die don’t remember the last roll. Really. [General Sta­tis­tics]

posted by Helmut Homepage – Vienna, Austria, 2020-06-28 13:35  – Posting: # 21583
Views: 1,830

Hi ElMaestro,

» what is statistical independence in layman's terms??

See the subject line.

» For example: [etc. etc.]

Fire up R and execute the script at the end.

There are samples with identical means but different variances and others with (almost) identical variances but different means. However, acc. to the [image] central limit theorem (CLT) pooled parameters will sooner or later approach the population’s ones.

» […] any perturbation on the data that changes Z will (may) also change s […]

Not sure what you mean by “perturbation on the data”. [image] Sample is a sample is a sample.
Or are you thinking about something happening after sampling (e.g., transcription errors, manipulations, …)? If you have a single sample and the population’s parameters are – as usual – unknown, you enter the slippery ground of outliers – which is a mixture of assessing (deviations from) assumptions, knowledge of the data generating process (incl. checking for implausible values), :blahblah:.

» Muchas gracias.

De nada.


PS: Comparing variances is a daring feat. In the example the ratio of largest/smallest sample mean is 1.18, whereas variances varied [pun] almost fourfold. This explains why outlier tests are not powerful and should be taken with a grain of salt.

set.seed(123456)
# Population
N        <- 1e5
mu       <- 100
s        <- 20
# Sampling
n        <- 20 # Sample size
samples  <- 30 # Number of samples
# Simulate population
x        <- rnorm(N, mean = mu, sd = s)
summary(x) # Show what we have
mean0    <- mean(x)
sd0      <- sd(x)
# Draw samples
xs       <- data.frame(sample = rep(1:samples, each = n),
                       xs = sample(x, samples*n))
# Sample estimates
mean.spl <- aggregate(xs[, 2], list(sample = xs$sample), mean)
var.spl  <- aggregate(xs[, 2], list(sample = xs$sample), var)
sd.spl   <- aggregate(xs[, 2], list(sample = xs$sample), sd)
spl      <- data.frame(sample = 1:samples, mean = mean.spl$x, var = var.spl$x)
# Pool estimates
mean.pld <- mean(mean.spl$x)
var.pld  <- sum((n-1)*var.spl$x)/(n*samples-samples)
sd.pld   <- sqrt(var.pld)
idx.m    <- sort(mean.spl$x, index = TRUE)$ix # index of sorted sample means
idx.v    <- sort(var.spl$x, index = TRUE)$ix  # index of sorted sample variances
# 3 plots with recording
col      <- rainbow(samples, start = 0.7, end = 0.1)
h.pop    <- hist(x, breaks = "FD", plot = FALSE)
xlim     <- range(h.pop$mids)
ylim     <- range(h.pop$density)*1.5
windows(record = TRUE)
op       <- par(no.readonly = TRUE)
par(mar = c(4, 4, 2, 0) + 0.1, ask = TRUE)
# 1: Population
plot(h.pop, freq = FALSE, xlim = xlim, ylim = ylim, col = "bisque",
     main = "Population", cex.main = 1, font.main = 1,
     xlab = paste("N  = ", N), cex.lab = 0.9, las = 1, border = FALSE)
abline(v = mean0, lty = 2, col = "blue"); box()
curve(dnorm(x, mean = mean0, sd = sd0), n = 501, col = "blue", lwd = 2, add = TRUE)
lines(mean0+c(-1, +1)*sd0, rep(dnorm(x = mean0-sd0, mean = mean0, sd = sd0), 2),
      lty = 3, col = "blue")
par(family = "mono")
legend("topright", bty = "n", col = "blue", lwd = 2,
       legend = sprintf("%5.1f | %5.1f (%3.1f%%)", mean0, sd0^2, 100*sd0/mean0),
       cex = 0.85, title = "mean | var (CV)")
# 2: Samples
par(family = "sans")
plot(xlim, ylim, type = "n", xlim = xlim, ylim = ylim,
     main = "Samples drawn from population (sorted by mean)", cex.main = 1,
     font.main = 1, xlab = paste0(samples, " samples (each n = ", n, ")"),
     ylab = "Density", cex.lab = 0.9, las = 1, frame.plot = TRUE)
par(family = "mono")
legend("topleft", box.lty = 0, lwd = 1, bg = "white", col = col[idx.m], cex = 0.75,
       legend = sprintf("%2i", mean.spl$sample[idx.m]), title = "sample")
legend("topright", box.lty = 0, lwd = NA, bg = "white", col = col[idx.m], cex = 0.75,
       legend = sprintf("%5.1f | %5.1f (%3.1f%%)",
                        mean.spl$x[idx.m], var.spl$x[idx.m],
                        100*sqrt(var.spl$x[idx.m])/mean.spl$x[idx.m]),
       title = "mean | var (CV)"); box()
for (j in seq_along(idx.m)) {
  curve(dnorm(x, mean = mean.spl[j, 2], sd = sd.spl[j, 2]), n = 501,
        col = col[j], add = TRUE)
  lines(mean.spl[j, 2]+c(-1, +1)*sd.spl[j, 2],
        rep(dnorm(x = mean.spl[j, 2]-sd.spl[j, 2],
                  mean = mean.spl[j, 2], sd = sd.spl[j, 2]), 2),
        lty = 3, col = col[j])
}
# 3: Population and pooled samples
par(family = "sans")
plot(xlim, ylim, type = "n", xlim = xlim, ylim = ylim,
     main = "Population and pooled samples", cex.main = 1, font.main = 1,
     xlab = paste0("Estimated from ", samples, " pooled samples (each n  = ",
                   n, ")"), ylab = "Density", cex.lab = 0.9, las = 1)
abline(v = mean0, lty = 2, col = "blue")
abline(v = mean.pld, lty = 2, col = col[floor(samples/2)]); box()
curve(dnorm(x, mean = mean0, sd = sd0), n = 501, lwd = 2,
      col = "blue", add = TRUE)
lines(mean0+c(-1, +1)*sd0, rep(dnorm(x = mean0-sd0, mean = mean0, sd = sd0), 2),
      lty = 3, col = "blue")
curve(dnorm(x, mean = mean.pld, sd = sqrt(var.pld)), n = 501, lwd = 2,
      col = col[floor(samples/2)], add = TRUE)
lines(mean.pld+c(-1, +1)*sqrt(var.pld),
      rep(dnorm(x = mean.pld-sqrt(var.pld), mean = mean.pld, sd = sqrt(var.pld)), 2),
      lty = 3, col = col[floor(samples/2)])
par(family = "mono")
legend("topright", box.lty = 0, lwd = 2, col = c("blue", col[floor(samples/2)]),
       legend = c(sprintf("Population    : %5.1f | %5.1f (%3.1f%%)",
                          mean0, sd0^2, 100*sd0/mean0),
                  sprintf("Pooled samples: %5.1f | %5.1f (%3.1f%%)",
                          mean.pld, var.pld, 100*sqrt(var.pld)/mean.pld)),
       cex = 0.85, title = "mean | var (CV)", bg = "white"); box()
# 4: Is there a relationship between samples' means and variances?
par(family = "sans")
plot(spl$mean, spl$var, type = "n", las = 1,
     main = paste0("Sample estimates (correlation = ",
                   signif(cor(spl$mean, spl$var), 5), ")"),
     cex.main = 1, font.main = 1, xlab = "mean", ylab = "variance")
abline(h = var.pld, lty = 3, col = "lightgrey")
abline(v = mean.pld, lty = 3, col = "lightgrey")
abline(lsfit(spl$mean, spl$var), lty = 2, col = "darkgrey"); box()
points(mean.spl$x[idx.m], var.spl$x[idx.m], pch = 16, col = col[idx.m], cex = 1.35)
for (j in 1:nrow(spl)) {
  ifelse (spl$var[j] >= var.pld, loc <- 1, loc <- 3)
  text(spl$mean[j], spl$var[j], labels = spl$sample[j], cex = 0.6, pos = loc)
}
par(op)
# Sample estimates ordered by mean
print(spl[idx.m, ], row.names = FALSE)
# Sample estimates ordered by variance
print(spl[idx.v, ], row.names = FALSE)
# Ratio of extreme sample means
spl[idx.m, "mean"][samples]/spl[idx.m, "mean"][1]
# Ratio of extreme sample variances
spl[idx.v, "var"][samples]/spl[idx.v, "var"][1]
# %RE of sample means and variances
summary(100*(mean.spl$x-mu)/mu); summary(100*(sd.spl$x^2-s^2)/s^2)
# Correlation of sample means and variances
cor(spl$mean, spl$var)


Dif-tor heh smusma 🖖
Helmut Schütz
[image]

The quality of responses received is directly proportional to the quality of the question asked. 🚮
Science Quotes

Complete thread:

Activity
 Admin contact
20,817 posts in 4,356 threads, 1,447 registered users;
online 10 (0 registered, 10 guests [including 8 identified bots]).
Forum time: 19:35 CEST (Europe/Vienna)

Repetition does not transform a lie into a truth.    Franklin D. Roosevelt

The Bioequivalence and Bioavailability Forum is hosted by
BEBAC Ing. Helmut Schütz
HTML5