# Paul Pavlidis (c) 2003-2010 # Commands demonstrating the use of R for statistical analysis of gene # expression microarray data. This may be freely distributed, modified # and used, with attribution. # This code has been tested for R 1.6.2 under Windows 2000 and Unix # (Solaris, Linux), and updated for R 2.11 (November 2010) # Note that to avoid problems with invalid R variable names, in the # input data file all "_" have been replaced with ".". Thus in this # demo probe sets are named XXXXX.at instead of the usual Affymetrix # (tm) format XXXXX_at. # Under Windows, use the "File...change dir" menu command to go to the # directory where the sample file can be found. # Read in the data, set up variables sdata<-read.table("sandberg-sampledata.txt", header=T, row.names=1) strain <- gl(2,12,24, label=c("129","bl6")) region <- gl(6,2,24, label=c("ag", "cb", "cx", "ec", "hp", "mb")) # define ANOVA function aof <- function(x) { m<-data.frame(strain,region, x); anova(aov(x ~ strain + region + strain*region, m)) } # apply analysis to the data and get the pvalues. anovaresults <- apply(sdata, 1, aof) # Note the the following line works on the test data set, but fails # with a stack overflow on large data sets, due to an apparent bug in # R. pvalues<-data.frame( lapply(anovaresults, function(x) { x["Pr(>F)"][1:3,] }) ) # (Begin bug work-around section) # Workaround 1: The following can be used to replace the previous line. # Suggested by Tom Blackwell, U Michigan. temp.1 <- unlist(lapply(anovaresults, function(x) { x["Pr(>F)"][1:3,] })) temp.2 <- matrix(temp.1, length(anovaresults), 3, byrow=T) dimnames(temp.2) <- list(names(anovaresults), dimnames(anovaresults[[1]])[[1]][1:3]) pvalues <- data.frame(t(temp.2)) # Workaround 2 (also suggested by T. Blackwell). Replace the aof # function with the following: aof <- function(x) { m<-data.frame(strain,region, x); temp<-anova(aov(x ~ strain + region + strain*region, m)) temp["Pr(>F)"][1:3,] } # And then do the following: pvalues<-data.frame(apply(sdata, 1, aof)); # Then proceed as described below. The drawback of the preceding is # that other data in the aov objects (e.g., F statistics) are thrown # away. # (End bug work-around section) # Note that the results of all calls to write.table do not yield # correct headers for use in most other programs. This is because no # label is placed in the "upper left hand corner" of the matrix. When # programs like Excel read in such files, the line containing the # column labels is shifted to the left. This can be repaired by using # Excel or even a word processor to insert a heading such as "Probe" # at the start of the first row. write.table(t(pvalues), file="anova-results.txt", quote=F, sep='\t') # Get the genes with good region effect pvalues. # Note: removed sorting reg.hi.p <-t(data.frame(pvalues[2, pvalues[2,] < 0.0001 & pvalues[3,] > 0.1])) reg.hi.pdata <- sdata[ row.names(reg.hi.p), ] # Template matching cbtempl<-c(0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0) cor.test(t(reg.hi.pdata[14,]),cbtempl) # Run template on all the high region effect genes template.match <- function(x, template) { k<-cor.test(x,template) k$p.value } cbtempl.results <- apply(reg.hi.pdata, 1, template.match, cbtempl) write.table(cbtempl.results, file="cbtempl-results.txt", quote=F, sep='\t') # t-test demonstration. t.test(t(reg.hi.pdata[14,region!="cb"]), t(reg.hi.pdata[14,region=="cb"]), var.equal=T) # output the region genes in order of cerebellum match. write.table(reg.hi.pdata[order(cbtempl.results),], "cbtempl.pdata.txt", quote=F, sep='\t') # FDR bh <- function(x, fdr) { thresh <- F; crit<-0; len<-length(x) answer <- array(len); first <- T; for(i in c(len:0)) { crit<-fdr*i/len; if (x[i] < crit || thresh == T) { answer[i]<-T thresh <- T if (first) { cat(i ,"genes selected at FDR =", fdr ,"\n") first = F; } } else { answer[i]<-F } } answer } regionp <- sort(t(pvalues[2,])) # the p values must be sorted in increasing order! fdr.result <- bh(regionp, 0.05) # reports that 192 genes are selected bhthresh <- cbind(regionp, fdr.result) write.table(bhthresh, "bhthresh.txt", sep='\t', quote=F) # print to a file. # Figure 3a: Make a graph and save as postscript (these commands are not shown in the paper) barplot(as.matrix(reg.hi.pdata[order(cbtempl.results),][1,]), names.arg=as.vector(region), xlab="Region", ylab="Expression", col="grey", space=0) dev.copy2eps(file="example.ps") # Figure 3b, users of matrix2png can run the shell command (unix or # macosX) or use the matrix2png web interface at # http://microarray.cpmc.columbia.edu/matrix2png/ on # cbtempl.pdata.txt, choosing the 'normalize' options, choosing color # map 1 and setting the range to be -2 to 2. Note that to be used in # matrix2png, the header of cbtempl.pdata.txt must be corrected as # described above, and saved as text. For publication, the resulting # figure was relabeled in Adobe Illustrator to make figure 3b. In R # for unix, assuming matrix2png is installed on your system: system("matrix2png -data cbtempl.pdata.txt -map 1 -zrcs -size 8:18 -numr 10 -range -2:2 > cbtempl.pdata.png")