# Paul Pavlidis (c) 2003-2010

# Commands demonstrating the use of R for statistical analysis of gene
# expression microarray data. This may be freely distributed, modified
# and used, with attribution.

# This code has been tested for R 1.6.2 under Windows 2000 and Unix
# (Solaris, Linux), and updated for R 2.11 (November 2010)

# Note that to avoid problems with invalid R variable names, in the
# input data file all "_" have been replaced with ".". Thus in this
# demo probe sets are named XXXXX.at instead of the usual Affymetrix
# (tm) format XXXXX_at.

# Under Windows, use the "File...change dir" menu command to go to the
# directory where the sample file can be found.

# Read in the data, set up variables
sdata<-read.table("sandberg-sampledata.txt", header=T, row.names=1)
strain <- gl(2,12,24, label=c("129","bl6"))
region <- gl(6,2,24, label=c("ag", "cb", "cx", "ec", "hp", "mb"))

# define ANOVA function
aof <- function(x) { 
  m<-data.frame(strain,region, x); 
  anova(aov(x ~ strain + region + strain*region, m))
}

# apply analysis to the data and get the pvalues.
anovaresults <- apply(sdata, 1, aof)

# Note the the following line works on the test data set, but fails
# with a stack overflow on large data sets, due to an apparent bug in
# R.
pvalues<-data.frame( lapply(anovaresults, function(x) { x["Pr(>F)"][1:3,] }) )

# (Begin bug work-around section)
# Workaround 1: The following can be used to replace the previous line.
# Suggested by Tom Blackwell, U Michigan.
temp.1 <- unlist(lapply(anovaresults, function(x) { x["Pr(>F)"][1:3,] }))
temp.2 <- matrix(temp.1, length(anovaresults), 3, byrow=T)
dimnames(temp.2) <- list(names(anovaresults), dimnames(anovaresults[[1]])[[1]][1:3])
pvalues <- data.frame(t(temp.2))

# Workaround 2 (also suggested by T. Blackwell). Replace the aof
# function with the following:
aof <- function(x) { 
  m<-data.frame(strain,region, x); 
  temp<-anova(aov(x ~ strain + region + strain*region, m))
  temp["Pr(>F)"][1:3,]
}
# And then do the following:
pvalues<-data.frame(apply(sdata, 1, aof));
# Then proceed as described below.  The drawback of the preceding is
# that other data in the aov objects (e.g., F statistics) are thrown
# away.

# (End bug work-around section)

# Note that the results of all calls to write.table do not yield
# correct headers for use in most other programs. This is because no
# label is placed in the "upper left hand corner" of the matrix. When
# programs like Excel read in such files, the line containing the
# column labels is shifted to the left. This can be repaired by using
# Excel or even a word processor to insert a heading such as "Probe"
# at the start of the first row.
write.table(t(pvalues), file="anova-results.txt", quote=F, sep='\t')


# Get the genes with good region effect pvalues. # Note: removed sorting 
reg.hi.p <-t(data.frame(pvalues[2, pvalues[2,] < 0.0001 & pvalues[3,] > 0.1]))
reg.hi.pdata <- sdata[ row.names(reg.hi.p), ]

# Template matching
cbtempl<-c(0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0)
cor.test(t(reg.hi.pdata[14,]),cbtempl)

# Run template on all the high region effect genes
template.match <- function(x, template) {
  k<-cor.test(x,template)
  k$p.value
}
cbtempl.results <- apply(reg.hi.pdata, 1, template.match, cbtempl)
write.table(cbtempl.results, file="cbtempl-results.txt", quote=F, sep='\t')

# t-test demonstration.
t.test(t(reg.hi.pdata[14,region!="cb"]), t(reg.hi.pdata[14,region=="cb"]), var.equal=T)

# output the region genes in order of cerebellum match.
write.table(reg.hi.pdata[order(cbtempl.results),], "cbtempl.pdata.txt", quote=F, sep='\t')

# FDR
bh <- function(x, fdr) {
  thresh <- F;
  crit<-0;
  len<-length(x)
  answer <- array(len);
  first <- T;
  for(i in c(len:0)) {
    crit<-fdr*i/len;
    if (x[i] < crit || thresh == T) {
      answer[i]<-T
      thresh <- T
      if (first) {
        cat(i ,"genes selected at FDR =", fdr ,"\n")
        first = F;
      }
    } else {
      answer[i]<-F
    }
  }
  answer
}

regionp <- sort(t(pvalues[2,])) # the p values must be sorted in increasing order!
fdr.result <- bh(regionp, 0.05) # reports that 192 genes are selected
bhthresh <- cbind(regionp, fdr.result)
write.table(bhthresh, "bhthresh.txt", sep='\t', quote=F) # print to a file.

# Figure 3a: Make a graph and save as postscript (these commands are not shown in the paper)
barplot(as.matrix(reg.hi.pdata[order(cbtempl.results),][1,]), names.arg=as.vector(region), xlab="Region", ylab="Expression", col="grey", space=0)
dev.copy2eps(file="example.ps")

# Figure 3b, users of matrix2png can run the shell command (unix or
# macosX) or use the matrix2png web interface at
# http://microarray.cpmc.columbia.edu/matrix2png/ on
# cbtempl.pdata.txt, choosing the 'normalize' options, choosing color
# map 1 and setting the range to be -2 to 2. Note that to be used in
# matrix2png, the header of cbtempl.pdata.txt must be corrected as
# described above, and saved as text. For publication, the resulting
# figure was relabeled in Adobe Illustrator to make figure 3b. In R
# for unix, assuming matrix2png is installed on your system:
system("matrix2png -data cbtempl.pdata.txt -map 1 -zrcs -size 8:18 -numr 10 -range -2:2 > cbtempl.pdata.png")