prediction.interval <- function (l, alpha, x) { # Given a linear model l, and a set of x points, return # a matrix with the prediction intervals yh <- x * l$coefficients[2] + l$coefficients[1] N <- nrow(l$model) mse <- sum(l$residuals^2) / (N-3) xbar <- mean(l$model[,2]) xrs <- sum( (l$model[,2] - xbar)^2 ) pi <- qt(alpha/2, N - 2) * sqrt (mse + mse * (x - xbar)^2 / ( N * xrs ) ) cbind(yh + pi, yh - pi) } N<-200 a<-7 c<-2 s<-0.8 alpha <- 1 - 0.8 box<-c(-1,-10, 2,5); xr<-c(box[1], box[3]); yr<-c(box[2], box[4]); x<-rnorm(N) y<-a*x + c + b*rnorm(N, sd=s) l<-lm(y ~ x) a_hat <- l$coefficients[[2]] c_hat <- l$coefficients[[1]] yhr <- (yr - c_hat) / a_hat xh <- seq(from=min(x), to=max(x), by = diff(range(x)) / 100) pi <- prediction.interval (l, alpha, xh) # selection range sr<-c(max(xr[1], yhr[1]), min(xr[2], yhr[2])) sel<-x >= sr[1] & x <= sr[2] query <- (x >= xr[1] & x <= xr[2] & y >= yr[1] & y <= yr[2]) m<-data.frame(matrix(nrow=2, ncol=2)) colnames(m)<-c('+', '-') rownames(m)<-colnames(m) m[1,1] <- sum(sel & query) m[1,2] <- sum(sel & !query) m[2,1] <- sum(!sel & query) m[2,2] <- sum(!sel & !query) sens <- m[1,1] / (m[1,1] + m[2,1]) spec <- m[2,2] / (m[2,2] + m[1,2]) # CHARTS png('fig1.png', width=1000,height=300) par(mfrow=c(1,3)) plot(x,y, pch='+') hist(x) hist(y) dev.off() png('fig2.png', width=400,height=400) plot(x,y, pch='+') abline(c_hat, a_hat, col='red', lty=2) rect(box[1], box[2], box[3], box[4], col="transparent", border="blue", lty=2, lwd=2) dev.off() png('fig3.png', width=400,height=400) plot(x,y, pch='+') abline(c_hat, a_hat, col='red', lty=2) lines(c(xr[1], xr[1]), range(y), col='green') lines(c(xr[2], xr[2]), range(y), col='green') lines(c(yhr[1], yhr[1]), range(y), col='green', lty=2) lines(c(yhr[2], yhr[2]), range(y), col='green', lty=2) rect(box[1], box[2], box[3], box[4], col="transparent", border="blue", lty=2, lwd=2) dev.off() png('fig4.png', width=400,height=400) plot(x,y, pch='+') abline(c_hat, a_hat, col='red', lty=2) lines(c(sr[1], sr[1]), range(y)*0.7, col='orange', lty=2) lines(c(sr[2], sr[2]), range(y)*0.7, col='orange', lty=2) rect(box[1], box[2], box[3], box[4], col="transparent", border="blue", lty=2, lwd=2) points(x[sel], y[sel], pch='+', col='green') text(mean(sr),max(y)-1, sprintf('Sensitivity: %4.2f', sens), cex=0.75) text(mean(sr),max(y)-3, sprintf('Specificity: %4.2f', spec), cex=0.75) dev.off() # i cheat in my calculations here. so shoot me. can't be bothered wiriting the inverse interval. d <- diff(prediction.interval (l, alpha, yhr[2])[1,]) / 2 sr[2] <- yhr[2] + d / a_hat sel<-x >= sr[1] & x <= sr[2] query <- (x >= xr[1] & x <= xr[2] & y >= yr[1] & y <= yr[2]) m<-data.frame(matrix(nrow=2, ncol=2)) colnames(m)<-c('+', '-') rownames(m)<-colnames(m) m[1,1] <- sum(sel & query) m[1,2] <- sum(sel & !query) m[2,1] <- sum(!sel & query) m[2,2] <- sum(!sel & !query) sens <- m[1,1] / (m[1,1] + m[2,1]) spec <- m[2,2] / (m[2,2] + m[1,2]) png('fig5.png', width=400,height=400) plot(x,y, pch='+') abline(c_hat, a_hat, col='red', lty=2) lines(xh, pi[,1], col='red', lty=3); lines(xh, pi[,2], col='red', lty=3) lines(c(sr[1], sr[1]), range(y)*0.7, col='orange', lty=2) lines(c(sr[2], sr[2]), range(y)*0.7, col='orange', lty=2) rect(box[1], box[2], box[3], box[4], col="transparent", border="blue", lty=2, lwd=2) points(x[sel], y[sel], pch='+', col='green') text(mean(sr),max(y)-1, sprintf('Sensitivity: %4.2f', sens), cex=0.75) text(mean(sr),max(y)-3, sprintf('Specificity: %4.2f', spec), cex=0.75) dev.off() med<-median(x) # cheating again here.. d <- diff(prediction.interval (l, alpha, med)[1,]) / (2 * a_hat) # because we are using the median as the split, instead of the mean, rlim and llim might differ. rlim <- prediction.interval (l, alpha, med+d)[,1] llim <- prediction.interval (l, alpha, med-d)[,2] outliers <- ((x >= med+d) & (y < rlim)) | ((x <= med-d) & (y > llim)) png('fig6.png', width=400,height=400) plot(x,y, pch='+') abline(c_hat, a_hat, col='red', lty=2) lines(xh, pi[,1], col='red', lty=3); lines(xh, pi[,2], col='red', lty=3) abline(v=med, col='orange', lwd=2) abline(v=med-d, col='orange', lty=2) abline(v=med+d, col='orange', lty=2) lines(c(med+d, max(x)), rep(rlim, 2), col='purple', lty=2) lines(c(med-d, min(x)), rep(llim, 2), col='purple', lty=2) points(x[outliers],y[outliers], pch='o', col='red') dev.off()