Die don’t remember the last roll. Really. [General Sta­tis­tics]

posted by Helmut Homepage – Vienna, Austria, 2020-06-28 15:35 (1369 d 15:18 ago) – Posting: # 21583
Views: 4,636

Hi ElMaestro,

❝ what is statistical independence in layman's terms??


See the subject line.

❝ For example: [etc. etc.]



Fire up R and execute the script at the end.

There are samples with identical means but different variances and others with (almost) identical variances but different means. However, acc. to the central limit theorem (CLT) pooled parameters will sooner or later approach the population’s ones.

❝ […] any perturbation on the data that changes Z will (may) also change s […]



Not sure what you mean by “perturbation on the data”. Sample is a sample is a sample.
Or are you thinking about something happening after sampling (e.g., transcription errors, manipulations, …)? If you have a single sample and the population’s parameters are – as usual – unknown, you enter the slippery ground of outliers – which is a mixture of assessing (deviations from) assumptions, knowledge of the data generating process (incl. checking for implausible values), :blahblah:.

❝ Muchas gracias.


De nada.


PS: Comparing variances is a daring feat. In the example the ratio of largest/smallest sample mean is 1.18, whereas variances varied [pun] almost fourfold. This explains why outlier tests are not powerful and should be taken with a grain of salt.

set.seed(123456)
# Population
N        <- 1e5
mu       <- 100
s        <- 20
# Sampling
n        <- 20 # Sample size
samples  <- 30 # Number of samples
# Simulate population
x        <- rnorm(N, mean = mu, sd = s)
summary(x) # Show what we have
mean0    <- mean(x)
sd0      <- sd(x)
# Draw samples
xs       <- data.frame(sample = rep(1:samples, each = n),
                       xs = sample(x, samples*n))
# Sample estimates
mean.spl <- aggregate(xs[, 2], list(sample = xs$sample), mean)
var.spl  <- aggregate(xs[, 2], list(sample = xs$sample), var)
sd.spl   <- aggregate(xs[, 2], list(sample = xs$sample), sd)
spl      <- data.frame(sample = 1:samples, mean = mean.spl$x, var = var.spl$x)
# Pool estimates
mean.pld <- mean(mean.spl$x)
var.pld  <- sum((n-1)*var.spl$x)/(n*samples-samples)
sd.pld   <- sqrt(var.pld)
idx.m    <- sort(mean.spl$x, index = TRUE)$ix # index of sorted sample means
idx.v    <- sort(var.spl$x, index = TRUE)$ix  # index of sorted sample variances
# 3 plots with recording
col      <- rainbow(samples, start = 0.7, end = 0.1)
h.pop    <- hist(x, breaks = "FD", plot = FALSE)
xlim     <- range(h.pop$mids)
ylim     <- range(h.pop$density)*1.5
windows(record = TRUE)
op       <- par(no.readonly = TRUE)
par(mar = c(4, 4, 2, 0) + 0.1, ask = TRUE)
# 1: Population
plot(h.pop, freq = FALSE, xlim = xlim, ylim = ylim, col = "bisque",
     main = "Population", cex.main = 1, font.main = 1,
     xlab = paste("N  = ", N), cex.lab = 0.9, las = 1, border = FALSE)
abline(v = mean0, lty = 2, col = "blue"); box()
curve(dnorm(x, mean = mean0, sd = sd0), n = 501, col = "blue", lwd = 2, add = TRUE)
lines(mean0+c(-1, +1)*sd0, rep(dnorm(x = mean0-sd0, mean = mean0, sd = sd0), 2),
      lty = 3, col = "blue")
par(family = "mono")
legend("topright", bty = "n", col = "blue", lwd = 2,
       legend = sprintf("%5.1f | %5.1f (%3.1f%%)", mean0, sd0^2, 100*sd0/mean0),
       cex = 0.85, title = "mean | var (CV)")
# 2: Samples
par(family = "sans")
plot(xlim, ylim, type = "n", xlim = xlim, ylim = ylim,
     main = "Samples drawn from population (sorted by mean)", cex.main = 1,
     font.main = 1, xlab = paste0(samples, " samples (each n = ", n, ")"),
     ylab = "Density", cex.lab = 0.9, las = 1, frame.plot = TRUE)
par(family = "mono")
legend("topleft", box.lty = 0, lwd = 1, bg = "white", col = col[idx.m], cex = 0.75,
       legend = sprintf("%2i", mean.spl$sample[idx.m]), title = "sample")
legend("topright", box.lty = 0, lwd = NA, bg = "white", col = col[idx.m], cex = 0.75,
       legend = sprintf("%5.1f | %5.1f (%3.1f%%)",
                        mean.spl$x[idx.m], var.spl$x[idx.m],
                        100*sqrt(var.spl$x[idx.m])/mean.spl$x[idx.m]),
       title = "mean | var (CV)"); box()
for (j in seq_along(idx.m)) {
  curve(dnorm(x, mean = mean.spl[j, 2], sd = sd.spl[j, 2]), n = 501,
        col = col[j], add = TRUE)
  lines(mean.spl[j, 2]+c(-1, +1)*sd.spl[j, 2],
        rep(dnorm(x = mean.spl[j, 2]-sd.spl[j, 2],
                  mean = mean.spl[j, 2], sd = sd.spl[j, 2]), 2),
        lty = 3, col = col[j])
}
# 3: Population and pooled samples
par(family = "sans")
plot(xlim, ylim, type = "n", xlim = xlim, ylim = ylim,
     main = "Population and pooled samples", cex.main = 1, font.main = 1,
     xlab = paste0("Estimated from ", samples, " pooled samples (each n  = ",
                   n, ")"), ylab = "Density", cex.lab = 0.9, las = 1)
abline(v = mean0, lty = 2, col = "blue")
abline(v = mean.pld, lty = 2, col = col[floor(samples/2)]); box()
curve(dnorm(x, mean = mean0, sd = sd0), n = 501, lwd = 2,
      col = "blue", add = TRUE)
lines(mean0+c(-1, +1)*sd0, rep(dnorm(x = mean0-sd0, mean = mean0, sd = sd0), 2),
      lty = 3, col = "blue")
curve(dnorm(x, mean = mean.pld, sd = sqrt(var.pld)), n = 501, lwd = 2,
      col = col[floor(samples/2)], add = TRUE)
lines(mean.pld+c(-1, +1)*sqrt(var.pld),
      rep(dnorm(x = mean.pld-sqrt(var.pld), mean = mean.pld, sd = sqrt(var.pld)), 2),
      lty = 3, col = col[floor(samples/2)])
par(family = "mono")
legend("topright", box.lty = 0, lwd = 2, col = c("blue", col[floor(samples/2)]),
       legend = c(sprintf("Population    : %5.1f | %5.1f (%3.1f%%)",
                          mean0, sd0^2, 100*sd0/mean0),
                  sprintf("Pooled samples: %5.1f | %5.1f (%3.1f%%)",
                          mean.pld, var.pld, 100*sqrt(var.pld)/mean.pld)),
       cex = 0.85, title = "mean | var (CV)", bg = "white"); box()
# 4: Is there a relationship between samples' means and variances?
par(family = "sans")
plot(spl$mean, spl$var, type = "n", las = 1,
     main = paste0("Sample estimates (correlation = ",
                   signif(cor(spl$mean, spl$var), 5), ")"),
     cex.main = 1, font.main = 1, xlab = "mean", ylab = "variance")
abline(h = var.pld, lty = 3, col = "lightgrey")
abline(v = mean.pld, lty = 3, col = "lightgrey")
abline(lsfit(spl$mean, spl$var), lty = 2, col = "darkgrey"); box()
points(mean.spl$x[idx.m], var.spl$x[idx.m], pch = 16, col = col[idx.m], cex = 1.35)
for (j in 1:nrow(spl)) {
  ifelse (spl$var[j] >= var.pld, loc <- 1, loc <- 3)
  text(spl$mean[j], spl$var[j], labels = spl$sample[j], cex = 0.6, pos = loc)
}
par(op)
# Sample estimates ordered by mean
print(spl[idx.m, ], row.names = FALSE)
# Sample estimates ordered by variance
print(spl[idx.v, ], row.names = FALSE)
# Ratio of extreme sample means
spl[idx.m, "mean"][samples]/spl[idx.m, "mean"][1]
# Ratio of extreme sample variances
spl[idx.v, "var"][samples]/spl[idx.v, "var"][1]
# %RE of sample means and variances
summary(100*(mean.spl$x-mu)/mu); summary(100*(sd.spl$x^2-s^2)/s^2)
# Correlation of sample means and variances
cor(spl$mean, spl$var)


Dif-tor heh smusma 🖖🏼 Довге життя Україна! [image]
Helmut Schütz
[image]

The quality of responses received is directly proportional to the quality of the question asked. 🚮
Science Quotes

Complete thread:

UA Flag
Activity
 Admin contact
22,957 posts in 4,819 threads, 1,636 registered users;
81 visitors (0 registered, 81 guests [including 8 identified bots]).
Forum time: 05:53 CET (Europe/Vienna)

With four parameters I can fit an elephant,
and with five I can make him wiggle his trunk.    John von Neumann

The Bioequivalence and Bioavailability Forum is hosted by
BEBAC Ing. Helmut Schütz
HTML5