## just y=ax+b [General Sta­tis­tics]

Hi ElMaestro,

❝ I would like to know how to judge if the correlation is quantitatively appearing to be best for the first line or for the second line.

Sumfink along this?

x1  <- c(1, 2, 3, 4, 5) y1  <- c(10.5, 11.4, 12.6, 13.3, 14.6) n1  <- length(x1) x2  <- c(1, 2, 3, 4, 5) y2  <- c(10.5, 11.4, NA, 13.3, 14.6) n2  <- length(x2) x3  <- c(1, 2, 4, 5) y3  <- c(10.5, 11.4, 13.3, 14.6) n3  <- length(x3) raw <- matrix(c(rep(1, n1), rep(2, n2), rep(3, n3),               x1, x2, x3, y1, y2, y3,               rep(n1, each=n1), rep(n2, each=n2), rep(n3, each=n3)),               nrow=n1+n2+n3, ncol=4, byrow=FALSE,               dimnames=list(NULL, c("set", "x", "y", "n"))) raw <- as.data.frame(raw); attach(raw) set <- unique(set) res <- data.frame(set=set, n=c(n1, n2, n3), NAs=NA,                   int=NA, slope=NA, RSE=NA, R2.adj=NA) for (j in seq_along(set)) {   mod              <- lm(y ~ x, data=raw, set==j)   sum.mod          <- summary(mod)   res[j, "NAs"]    <- sum(is.na(raw$y[raw$set==j]))   res[j, "int"]    <- coef(mod)[[1]]   res[j, "slope"]  <- coef(mod)[[2]]   res[j, "RSE"]    <- sum.mod$sigma res[j, "R2.adj"] <- sum.mod$adj.r.squared } print(raw, row.names=FALSE) print(res[with(res, order(-R2.adj, +n)), ], row.names=FALSE) detach(raw)

Fits linear models to the sets and gives results sorted by R²adj (descending) and n (ascending). y-values of set 2 and 3 are identical. Shows that NA gives the same result as a completely missing row due to the default na.action=na.omit in lm().

 set n NAs  int slope       RSE    R2.adj    3 4   0 9.42  1.01 0.1565248 0.9928293    2 5   1 9.42  1.01 0.1565248 0.9928293    1 5   0 9.45  1.01 0.1494434 0.9912998

Edit: Toys for boys:

a    <- 9.45 b    <- 1.01 sets <- 16 n    <- 5 x    <- 1:n sd   <- 0.25 pct  <- 20 # percentage of NAs y    <- NULL for (j in 1:sets) {   y <- c(y, a + b*x + rnorm(n, 0, sd)) } raw  <- matrix(c(rep(1:sets, each=n), rep(x, sets), y,                  rep(n, n*sets)), nrow=n*sets, ncol=4, byrow=FALSE,                dimnames=list(NULL, c("set", "x", "y", "n"))) loc  <- floor(runif(as.integer(pct*sets*n/100), min=1, max=n*sets)) raw[loc, "y"] <- NA raw  <- as.data.frame(raw); attach(raw) set  <- 1:sets res  <- data.frame(set=set, n=rep(n, sets), NAs=NA, used=NA,                   int=NA, slope=NA, RSE=NA, R2.adj=NA) op   <- par(no.readonly=TRUE) split.screen(c(4, 4)) # adjust for the number of sets! for (j in seq_along(set)) {   mod              <- lm(y ~ x, data=raw, set==j)   sum.mod          <- summary(mod)   res[j, "NAs"]    <- sum(is.na(raw$y[raw$set==j]))   res[j, "used"]   <- res[j, "n"] - res[j, "NAs"]   res[j, "int"]    <- coef(mod)[[1]]   res[j, "slope"]  <- coef(mod)[[2]]   res[j, "RSE"]    <- sum.mod$sigma res[j, "R2.adj"] <- sum.mod$adj.r.squared   screen(j)   par(mar=c(1.5, 1.5, 1, 0.1), mgp=c(2, 0.5, 0), line=2, tcl=-0.25)   if (res[j, "used"] >= 2) {     plot(raw$x[raw$set==j], raw$y[raw$set==j], las=1, xlab="", ylab="",          ylim=c((a+b*x[1])*0.95, (a+b*x[n])/0.95), cex.axis=0.65,          cex.main=0.65, font.main=1,          main=paste("Rsq.adj =", signif(res[j, "R2.adj"], 6)))     ifelse (res[j, "used"] == 2, col <- "red", col <- "blue")     abline(coef=coef(mod), col=col)     legend("topleft", cex=0.65, box.lty=0, x.intersp=0,       legend=paste0("set ", j, ": ", res[j, "used"], " used"))   } } res <- res[with(res, order(-R2.adj, +n)), ] screen(head(res$set, 1)) par(mar=c(1.5, 1.5, 1, 0.1)) legend("right", cex=0.75, box.lty=0, x.intersp=0, legend="\"best\"") screen(tail(res$set, 1)) par(mar=c(1.5, 1.5, 1, 0.1)) legend("right", cex=0.75, box.lty=0, x.intersp=0, legend="\"worst\"") close.screen(all=TRUE) par(op) detach(raw) print(res, row.names=FALSE)

Dif-tor heh smusma 🖖🏼 Довге життя Україна!
Helmut Schütz

The quality of responses received is directly proportional to the quality of the question asked. 🚮
Science Quotes