|
# To source this file into an environment to avoid cluttering the global workspace, put this in Rprofile: |
|
# my.env <- new.env(); sys.source("C:/PathTo/THIS_FILE.r", my.env); attach(my.env) |
|
|
|
#----------------------------------------------------------------------- |
|
# Load packages, set options and cwd, set up database connection |
|
#----------------------------------------------------------------------- |
|
|
|
## Load packages |
|
# require(ggplot2) #plotting |
|
# require(plyr) #data manipulation |
|
# require(reshape) #data manipulation |
|
# require(sqldf) #manipulate data frame with SQL |
|
|
|
## Sets the working directory to C:/R |
|
setwd("~/R") |
|
|
|
## Don't show those silly significanct stars |
|
options(show.signif.stars=FALSE) |
|
|
|
## Do you want to automatically convert strings to factor variables in a data.frame? |
|
options(stringsAsFactors=TRUE) |
|
|
|
## Hard code the US repository for CRAN so it doesn't ask me every time. |
|
r <- getOption("repos") |
|
r["CRAN"] <- "http://cran.us.r-project.org" |
|
options(repos = r) |
|
rm(r) |
|
|
|
## Some SQLite stuff I don't use any more because I switched to MySQL |
|
# require(RSQLite) |
|
# channel <- dbConnect(SQLite(), "C:/cygwin/home/sturner/dbs/sdt.sqlite") |
|
# query <- function(...) dbGetQuery(channel,...) |
|
|
|
## Set up ODBC connection for MySQL localhost, and make it easy to query a database with query() function. |
|
require(RODBC) # The rprofile script will fail here if you don't have RODBC installed. |
|
channel <- odbcConnect("localhost") |
|
query <- function(...) sqlQuery(channel, ...) |
|
|
|
#----------------------------------------------------------------------- |
|
# Functions |
|
#----------------------------------------------------------------------- |
|
|
|
## Transpose a numeric data frame with ID in first column |
|
tdf <- function(d) { |
|
row.names(d) <- d[[1]] |
|
d[[1]] <- NULL |
|
d <- as.data.frame(t(d)) |
|
d$id <- row.names(d) |
|
d <- cbind(d[ncol(d)], d[-ncol(d)]) |
|
row.names(d) <- NULL |
|
d |
|
} |
|
|
|
## Convert selected columns of a data frame to factor variables |
|
factorcols <- function(d, ...) lapply(d, function(x) factor(x, ...)) |
|
|
|
## Returns a logical vector TRUE for elements of X not in Y |
|
"%nin%" <- function(x, y) !(x %in% y) |
|
|
|
## Returns names(df) in single column, numbered matrix format. |
|
n <- function(df) matrix(names(df)) |
|
|
|
## Single character shortcuts for summary() and head(). |
|
s <- base::summary |
|
h <- utils::head |
|
|
|
## ht==headtail, i.e., show the first and last 10 items of an object |
|
ht <- function(d) rbind(head(d,10),tail(d,10)) |
|
|
|
## Show the first 5 rows and first 5 columns of a data frame or matrix |
|
hh <- function(d) d[1:5,1:5] |
|
|
|
## Read data on clipboard. |
|
read.cb <- function(...) { |
|
ismac <- Sys.info()[1]=="Darwin" |
|
if (!ismac) read.table(file="clipboard", ...) |
|
else read.table(pipe("pbpaste"), ...) |
|
} |
|
|
|
## Open current directory on mac |
|
macopen <- function(...) system("open .") |
|
|
|
## name("test.png") results in "C:/R/2010-04-20-test.png" if running this in C:/R on April 20 2010. |
|
name <- function(filename="filename") paste(getwd(),"/",Sys.Date(),"-",filename,sep="") |
|
|
|
## Takes a dataframe and a column name, and moves that column to the front of the DF. |
|
moveColFront <- function(d=dataframe, colname="colname") { |
|
index <- match(colname, names(d)) |
|
cbind(d[index],d[-index]) |
|
} |
|
|
|
## Permutes a column in a data.frame, sets seed optionally |
|
permute <- function (dataframe, columnToPermute="column", seed=NULL) { |
|
if (!is.null(seed)) set.seed(seed) |
|
colindex <- which(names(dataframe)==columnToPermute) |
|
permutedcol <- dataframe[ ,colindex][sample(1:nrow(dataframe))] |
|
dataframe[colindex] <- permutedcol |
|
return(dataframe) |
|
} |
|
|
|
## Summarize missing data in a data frame. Return a list (lpropmiss) or data frame (propmiss) |
|
lpropmiss <- function(dataframe) lapply(dataframe,function(x) data.frame(nmiss=sum(is.na(x)), n=length(x), propmiss=sum(is.na(x))/length(x))) |
|
propmiss <- function(dataframe) { |
|
m <- sapply(dataframe, function(x) { |
|
data.frame( |
|
nmiss=sum(is.na(x)), |
|
n=length(x), |
|
propmiss=sum(is.na(x))/length(x) |
|
) |
|
}) |
|
d <- data.frame(t(m)) |
|
d <- sapply(d, unlist) |
|
d <- as.data.frame(d) |
|
d$variable <- row.names(d) |
|
row.names(d) <- NULL |
|
d <- cbind(d[ncol(d)],d[-ncol(d)]) |
|
return(d[order(d$propmiss), ]) |
|
} |
|
|
|
## Make a pretty QQ plot of p-values |
|
qq = function(pvector, ...) { |
|
if (!is.numeric(pvector)) stop("D'oh! P value vector is not numeric.") |
|
pvector <- pvector[!is.na(pvector) & pvector<1 & pvector>0] |
|
o = -log10(sort(pvector,decreasing=F)) |
|
#e = -log10( 1:length(o)/length(o) ) |
|
e = -log10( ppoints(length(pvector) )) |
|
plot(e,o,pch=19,cex=1, xlab=expression(Expected~~-log[10](italic(p))), ylab=expression(Observed~~-log[10](italic(p))), xlim=c(0,max(e)), ylim=c(0,max(o)), ...) |
|
abline(0,1,col="red") |
|
} |
|
|
|
## Draw a histogram with normal overlay (From http://www.statmethods.net/graphs/density.html) |
|
histnormal <- function(d, main=NULL, xlab=NULL, breaks="FD", ...) { |
|
if (any(is.na(d))) warning(paste(sum(is.na(d)), "missing values")); d <- na.omit(d) |
|
h <- hist(d, plot=FALSE, breaks=breaks, ...) |
|
x <- seq(min(d), max(d), length=40) |
|
y <- dnorm(x, mean=mean(d), sd=sd(d)) |
|
y <- y*diff(h$mids[1:2])*length(d) |
|
hist(d, col="gray50", main=main, xlab=xlab, ylim=c(0,max(y)), breaks=breaks,...) |
|
lines(x,y, col="blue", lwd=2) |
|
rug(x) |
|
} |
|
|
|
## Draw a histogram with density overlay |
|
histdensity <- function(x, main=NULL, breaks="FD", ...) { |
|
if (any(is.na(x))) warning(paste(sum(is.na(x)), "missing values")); x <- na.omit(x) |
|
hist(x, col="gray50", probability=TRUE, breaks=breaks, main=main, ...) |
|
lines(density(x, na.rm = TRUE), col = "blue", lwd=2) |
|
rug(x) |
|
} |
|
|
|
## Plot scatterplot with trendline and confidence interval (From http://tinyurl.com/3bvrth7) |
|
scatterci <- function(x, y, ...) { |
|
plot(x, y, ...) |
|
mylm <- lm(y~x) |
|
abline(mylm, col="blue") |
|
x=sort(x) |
|
prd<-predict(mylm,newdata=data.frame(x=x),interval = c("confidence"), level = 0.95) |
|
lines(x,prd[,2],col="blue",lty=3) |
|
lines(x,prd[,3],col="blue",lty=3) |
|
} |
|
|
|
## Get the proportion variation explained. See this website for more details: http://goo.gl/jte8X |
|
rsq <- function(predicted, actual) 1-sum((actual-predicted)^2)/sum((actual-mean(actual))^2) |
|
|
|
## Correlation matrix with p-values. See http://goo.gl/nahmV for documentation of this function |
|
cor.prob <- function(X, dfr = nrow(X) - 2) { |
|
R <- cor(X) |
|
above <- row(R) < col(R) |
|
r2 <- R[above]^2 |
|
Fstat <- r2 * dfr / (1 - r2) |
|
R[above] <- 1 - pf(Fstat, 1, dfr) |
|
R[row(R)==col(R)]<-NA |
|
R |
|
} |
|
|
|
## This function accepts a GLM object and does a LR chi-square test on the fit. |
|
lrt <- function (modelobject) { |
|
lrtest.chi2 <- model$null.deviance - model$deviance # Difference in deviance between model with intercept only and full model. This is the likelihood ratio test statistic (-2(log(L))). |
|
lrtest.df <- model$df.null - model$df.residual # Difference in DF. Make sure this equals the number of predictors in the model! |
|
fitpval <- 1-pchisq(lrtest.chi2,lrtest.df) |
|
cat("Likelihood ratio test on model fit:\n\n") |
|
data.frame(lrtest.chi2=lrtest.chi2,lrtest.df=lrtest.df,fitpval=fitpval) #Output gives you the chisquare, df, and p-value. |
|
} |
|
|
|
## This function does the same thing as lrtest in the Design package, but doesn't do as much checking. |
|
## Remember, the lrt has to test the same model (model fit on same observations) |
|
## Also the drop1(fullmodel,test="Chisq") does something similar. |
|
lrt2 <- function (full,reduced) { |
|
if (reduced$deviance<=full$deviance) stop ("Reduced model not worse than full.") |
|
if (reduced$df.residual<=full$df.residual) stop ("Reduced model doesn't have more degrees of freedom.") |
|
lrtest.chi2 <- reduced$deviance-full$deviance |
|
lrtest.df <- reduced$df.residual - full$df.residual |
|
fitpval <- 1-pchisq(lrtest.chi2,lrtest.df) |
|
cat("Likelihood ratio test on two models:\n\n") |
|
data.frame(lrtest.chi2=lrtest.chi2,lrtest.df=lrtest.df,fitpval=fitpval) |
|
} |
|
|
|
## This gets the overall anova p-value out of a linear model object |
|
lmp <- function (modelobject) { |
|
if (class(modelobject) != "lm") stop("Not an object of class 'lm' ") |
|
f <- summary(modelobject)$fstatistic |
|
p <- pf(f[1],f[2],f[3],lower.tail=F) |
|
attributes(p) <- NULL |
|
return(p) |
|
} |
|
|
|
## Function for arranging ggplots. use png(); arrange(p1, p2, ncol=1); dev.off() to save. |
|
vp.layout <- function(x, y) viewport(layout.pos.row=x, layout.pos.col=y) |
|
arrange_ggplot2 <- function(..., nrow=NULL, ncol=NULL, as.table=FALSE) { |
|
dots <- list(...) |
|
n <- length(dots) |
|
if(is.null(nrow) & is.null(ncol)) { nrow = floor(n/2) ; ncol = ceiling(n/nrow)} |
|
if(is.null(nrow)) { nrow = ceiling(n/ncol)} |
|
if(is.null(ncol)) { ncol = ceiling(n/nrow)} |
|
## NOTE see n2mfrow in grDevices for possible alternative |
|
grid.newpage() |
|
pushViewport(viewport(layout=grid.layout(nrow,ncol) ) ) |
|
ii.p <- 1 |
|
for(ii.row in seq(1, nrow)){ |
|
ii.table.row <- ii.row |
|
if(as.table) {ii.table.row <- nrow - ii.table.row + 1} |
|
for(ii.col in seq(1, ncol)){ |
|
ii.table <- ii.p |
|
if(ii.p > n) break |
|
print(dots[[ii.table]], vp=vp.layout(ii.table.row, ii.col)) |
|
ii.p <- ii.p + 1 |
|
} |
|
} |
|
} |
|
|
|
## Imputes the median value of a vector, matrix, or data frame. |
|
## Stolen from na.roughfix function in the randomForest package. |
|
na.roughfix <- function (object=NULL, ...) { |
|
if (class(object) == "data.frame") { |
|
isfac <- sapply(object, is.factor) |
|
isnum <- sapply(object, is.numeric) |
|
if (any(!(isfac | isnum))) |
|
stop("dfMedianImpute only works for numeric or factor") |
|
roughfix <- function(x) { |
|
if (any(is.na(x))) { |
|
if (is.factor(x)) { |
|
freq <- table(x) |
|
x[is.na(x)] <- names(freq)[which.max(freq)] |
|
} |
|
else { |
|
x[is.na(x)] <- median(x, na.rm = TRUE) |
|
} |
|
} |
|
x |
|
} |
|
object[] <- lapply(object, roughfix) |
|
return(object) |
|
} |
|
else if(is.atomic(object)) { |
|
d <- dim(object) |
|
if (length(d) > 2) |
|
stop("vectorMedianImpute can't handle objects with more than two dimensions") |
|
if (all(!is.na(object))) |
|
return(object) |
|
if (!is.numeric(object)) |
|
stop("vectorMedianImpute can only deal with numeric data.") |
|
if (length(d) == 2) { |
|
hasNA <- which(apply(object, 2, function(x) any(is.na(x)))) |
|
for (j in hasNA) object[is.na(object[, j]), j] <- median(object[, |
|
j], na.rm = TRUE) |
|
} |
|
else { |
|
object[is.na(object)] <- median(object, na.rm = TRUE) |
|
} |
|
return(object) |
|
} |
|
else stop("Object is not a data frame or atomic vector") |
|
} |
|
|
|
## Makes a better scatterplot matrix. |
|
## Stolen from the PerformanceAnalytics package: http://cran.r-project.org/web/packages/PerformanceAnalytics/index.html |
|
## Also see http://moderntoolmaking.blogspot.com/2011/08/graphically-analyzing-variable.html |
|
## To color code points based on levels of a factor, use these args: |
|
## pairs.perfan(d, bg=c("red","blue")[d$factor], pch=21) |
|
betterpairs <- function (R, histogram = TRUE, ...) |
|
{ |
|
x=as.matrix(R) # in PerformanceAnalytics: x = checkData(R, method = "matrix") |
|
if (mode(x)!="numeric") stop("Must pass in only numeric values") |
|
panel.cor <- function(x, y, digits = 2, prefix = "", use = "pairwise.complete.obs", cex.cor, ...) { |
|
usr <- par("usr") |
|
on.exit(par(usr)) |
|
par(usr = c(0, 1, 0, 1)) |
|
r <- abs(cor(x, y, use = use)) |
|
txt <- format(c(r, 0.123456789), digits = digits)[1] |
|
txt <- paste(prefix, txt, sep = "") |
|
if (missing(cex.cor)) cex <- 0.8/strwidth(txt) |
|
test <- cor.test(x, y) |
|
Signif <- symnum(test$p.value, corr = FALSE, na = FALSE, cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1), symbols = c("***", "**", "*", ".", " ")) |
|
text(0.5, 0.5, txt, cex = cex * r) |
|
text(0.8, 0.8, Signif, cex = cex, col = 2) |
|
} |
|
f <- function(t) dnorm(t, mean = mean(x), sd = sd(x)) |
|
# Useful function for histogram showing density overlay and rug |
|
hist.panel = function(x, ...) { |
|
par(new = TRUE) |
|
hist(x, col = "light gray", probability = TRUE, axes = FALSE, main = "", breaks = "FD") |
|
lines(density(x, na.rm = TRUE), col = "red", lwd = 1) |
|
rug(x) |
|
} |
|
if (histogram) pairs(x, gap = 0, lower.panel = panel.smooth, upper.panel = panel.cor, diag.panel = hist.panel, ...) |
|
else pairs(x, gap = 0, lower.panel = panel.smooth, upper.panel = panel.cor, ...) |
|
} |
|
|
|
# Did you make it this far? |
|
message("\n******************************\nSuccessfully loaded Rprofile.r\n******************************") |