hotpal<-c(rgb( 0 , 0 , 0 ), rgb( 0.058823529 , 0 , 0 ), rgb( 0.117647059 , 0 , 0 ), rgb( 0.176470588 , 0 , 0 ), rgb( 0.235294118 , 0 , 0 ), rgb( 0.294117647 , 0 , 0 ), rgb( 0.352941176 , 0 , 0 ), rgb( 0.411764706 , 0 , 0 ), rgb( 0.470588235 , 0 , 0 ), rgb( 0.529411765 , 0.058823529 , 0 ), rgb( 0.588235294 , 0.117647059 , 0 ), rgb( 0.647058824 , 0.176470588 , 0 ), rgb( 0.705882353 , 0.235294118 , 0 ), rgb( 0.764705882 , 0.294117647 , 0 ), rgb( 0.823529412 , 0.352941176 , 0 ), rgb( 0.882352941 , 0.411764706 , 0 ), rgb( 0.941176471 , 0.470588235 , 0 ), rgb( 1 , 0.529411765 , 0 ), rgb( 1 , 0.588235294 , 0 ), rgb( 1 , 0.647058824 , 0 ), rgb( 1 , 0.705882353 , 0 ), rgb( 1 , 0.764705882 , 0 ), rgb( 1 , 0.823529412 , 0 ), rgb( 1 , 0.882352941 , 0 ), rgb( 1 , 0.941176471 , 0 ), rgb( 1 , 1 , 0.141176471 ), rgb( 1 , 1 , 0.28627451 ), rgb( 1 , 1 , 0.42745098 ), rgb( 1 , 1 , 0.57254902 ), rgb( 1 , 1 , 0.71372549 ), rgb( 1 , 1 , 0.858823529 ), rgb( 1 , 1 , 1 )); greypal<-c( rgb(0,0,0), rgb(0.1,0.1,0.1), rgb(0.2,0.2,0.2), rgb(0.3,0.3,0.3), rgb(0.4,0.4,0.4), rgb(0.5,0.5,0.5), rgb(0.6,0.6,0.6), rgb(0.7,0.7,0.7), rgb(0.8,0.8,0.8), rgb(0.9,0.9,0.9), rgb(1,1,1)); # same as rs, but for probe comparisons within a data set for the same gene. cross.corr.within<-function(data, array, method="pearson", minpresent=1) { rprobes<-vector(length=25000); sprobes<-vector(length=25000); gene.names<-vector(length=25000); values<-vector(length=25000); morevalues<-vector(length=25000); explev.s<-vector(length=25000); explev.r<-vector(length=25000); te<-function(x,y){cor.test(x,y, method=method, alternative="greater", na.action="omit")$p.value; }; tm<-function(x,y){cor(x,y, use="pairwise.complete.obs", method=method)[1,1]}; j<-1; genes<-as.vector(as.character(array[,"gene"])); unique.genes<-unique(genes); for (gene in unique.genes) { # find probes for this gene. if (gene == "---" || gene == "") { next; } probes<-row.names(array)[which(genes == gene )]; if (length(probes) < 2) { next; } for(i in 1:length(probes)) { dat.a<-as.vector(data[probes[i],]); for (k in (i+1):length(probes)) { if (is.na(probes[k])) { break; } print(paste(gene, " ", probes[i], " ", probes[k])) dat.b<-as.vector(data[probes[k],]); try(silent=T, values[j]<-te(t(dat.a), t(dat.b))); morevalues[j]<-tm(t(dat.a), t(dat.b)); explev.s[j]<-median(t(dat.a), na.rm=TRUE); explev.r[j]<-median(t(dat.b), na.rm=TRUE); rprobes[j]<-probes[i]; sprobes[j]<-probes[k]; gene.names[j]<-gene; j<-j+1; } } } values<-values[1:j-1]; explev.s<-explev.s[1:j-1]; explev.r<-explev.r[1:j-1]; rprobes<-rprobes[1:j-1]; sprobes<-sprobes[1:j-1]; morevalues<-morevalues[1:j-1]; gene.names<-gene.names[1:j-1]; dm<-data.frame (cbind(gene.names, rprobes, sprobes), cbind(values, explev.r, explev.s, morevalues)); names(dm)<-c("gene", "rprobe", "sprobe", "pvalue", "explev.r", "explev.s", "correl"); dm; } # compare two data sets - the expression vectors are compared to each # other, gene by gene. This obviously only makes sense for special # cases (such as the same dat aranalyzed more than once, or the nci60 # data) rs<-function(rdataset, sdataset, rarray, sarray, method="pearson" , unroll=FALSE, test=FALSE, minpresent=1, f=NULL){ # initialize the data structure. Must be a better way... rprobes<-vector(length=25000); sprobes<-vector(length=25000); gene.names<-vector(length=25000); values<-vector(length=25000); morevalues<-vector(length=25000); explev.s<-vector(length=25000); explev.r<-vector(length=25000); if ( unroll) { values<-matrix(nrow=100000, ncol=2); } j<-1; te<-NULL; tm<-NULL; if (!is.null(f)) { te<-f; } else if (test ){ print("In pvalue mode"); te<-function(x,y){cor.test(x,y, method=method, alternative="greater", na.action="omit")$p.value; }; tm<-function(x,y){cor(x,y, use="pairwise.complete.obs", method=method)[1,1]}; } else { te<-function(x,y){cor(x,y, use="pairwise.complete.obs", method=method)[1,1]}; } totalpoints<-0; # we use the rarray genes as the "gold standard" list, arbitrarily. rgenes<-as.vector(as.character(rarray[,"gene"])); unique.genes<-unique(rgenes); sgenes<-as.vector(as.character(sarray[,"gene"])); ngenes.tested<-0 for (gene in unique.genes) { # find probes for this gene. if (gene == "---" || gene == "") { next; } r.probes<-row.names(rarray)[which(rgenes == gene )] s.probes<-row.names(sarray)[which(sgenes == gene )] if (length(s.probes) == 0 || length(r.probes) == 0) { next; } for (rname in r.probes) { if (!is.null(dim(rdataset))) { rdat<-as.vector(rdataset[rname,]); # data for this probe. } else { rdat<-rdataset[rname]; } if (length(which(!is.na(rdat) )) < minpresent) { # exclude data that is missing too much # print("skipping r"); next; } for (sname in s.probes) { if (!is.null(dim(sdataset))) { sdat<-as.vector(sdataset[sname,]); # data for this probe. } else { sdat<-sdataset[sname]; } if (length(which(!is.na(sdat) )) < minpresent) { # exclude data that is missing too much # print("skipping s"); next; } if ( test) { # pvalue for correlation try(silent=T, values[j]<-te(t(rdat), t(sdat))); morevalues[j]<-tm(t(rdat), t(sdat)); explev.s[j]<-median(t(sdat), na.rm=TRUE); explev.r[j]<-median(t(rdat), na.rm=TRUE); rprobes[j]<-rname; sprobes[j]<-sname; gene.names[j]<-gene; j<-j+1; } else if (unroll) { # the actual expression values. for(u in seq(along=rdat)) { if (is.na(rdat[u]) || is.na(sdat[u]) ) { next; } values[j,1] <- as.numeric(sdat[u]); values[j,2] <- as.numeric(rdat[u]); j<-j+1; totalpoints<-totalpoints+1; } } else { # the default. values[j]<-te(t(rdat), t(sdat)); explev.s[j]<-median(t(sdat), na.rm=TRUE); explev.r[j]<-median(t(rdat), na.rm=TRUE); rprobes[j]<-rname; sprobes[j]<-sname; gene.names[j]<-gene; j<-j+1; } } } ngenes.tested<-ngenes.tested+1; if ((ngenes.tested %% 500) == 0 ) { write(paste(ngenes.tested, " genes tested"), stderr()); } } # get rid of empty values. if (unroll) { values<-values[1:totalpoints,]; } else { values<-values[1:j-1]; explev.s<-explev.s[1:j-1]; explev.r<-explev.r[1:j-1]; rprobes<-rprobes[1:j-1]; sprobes<-sprobes[1:j-1]; morevalues<-morevalues[1:j-1]; gene.names<-gene.names[1:j-1]; } if (test) { print("making data frame with pvalues and correlations"); dm<-data.frame (cbind(gene.names, rprobes, sprobes), cbind(values, explev.r, explev.s, morevalues)); names(dm)<-c("gene", "rprobe", "sprobe", "pvalue", "explev.r", "explev.s", "correl"); } else { dm<-data.frame (cbind(gene.names, rprobes, sprobes), cbind(values, explev.r, explev.s)); names(dm)<-c("gene", "rprobe", "sprobe", "correl", "explev.r", "explev.s"); } dm; } # make graphs to show some randomly selected examples. Robust line fits using rlm. #par(mfrow=c(5,10)) library(MASS) rsplot<-function(rdataset=ross, sdataset, rarray=imr, sarray=u6k){ m<-vector(); j<-0; random<-round(runif(n=length(unique(as.character(rarray[,1]))), min=1,max=length(unique(as.character(rarray[,1])))), 0) for (i in unique(as.character(rarray[,1]))[random]) { rn<-row.names(rarray)[which(rarray[,1] == i)] sn<-row.names(sarray)[which(sarray[,1]==i)] if (length(sn) == 0) { next; } for (rname in rn) { rdat<-rdataset[rname,]; if (length(which(!is.na(rdat))) < 10) { next; } for (sname in sn) { sdat<-sdataset[sname,]; if (j >= 50) { break; } plot(t(rdat), t(sdat), pch=20, cex=1, main=i, xlab="", ylab=""); abline(rlm(t(sdat) ~ t(rdat)), col="blue") j<-j+1; } } if (j >= 50) { break; } } } plotdi<-function(dataset="rma", probe) { gene<-NULL; if (dataset == "rma") { gene<-affy.annots[probe,1]; } else { gene<-illu.annots[probe,1]; } data<-eval(parse(text=paste(sep='', dataset, "[\"", probe, "\",]"))) plot(dilution, data, pch=20, xlab="Dilution", ylab="Expression", main=paste(dataset, " ", probe, " : ", gene), cex=2) abline(lm(t(data) ~ dilution), col="blue") } # for 3' distance analysis bestdist<-function (dataset) { d<-NULL; for(probe in as.character(unique(dataset[,"probe"]))) { best<-min(dataset[dataset$probe==probe, "threeprimeDist"]); d<-rbind(d,list(probe,best)) } d<-data.frame(row.names=unlist(d[,1]), unlist(d[,2])) names(d)<-c("threeprimeDist") d } # Sun Mar 20 20:08:17 EST 2005, updated Mon Apr 11 14:35:14 EDT 2005 to include exon overlap. # Wed Apr 20 15:53:16 EDT 2005 updated to filter out weak hits and count as "hits" only good hits. # Mon Apr 25 16:58:20 EDT 2005 Improving performance. bestgenes<-function(k) { d<-NULL; v<-0; alp<-as.character(k[,"probe"]); up<-unique(as.character(k[,"probe"])); blatscores<-k[,"Blat.Score"] exonoverlaps<-as.vector(k[,"exonoverlap"]); probelengths<-as.vector(k[,"Probe.Length"]); sapply(up, function(i) { rrs<-match(i, alp); maxscore<-0; maxindex<--1; maxblatscore<-0; num.hits<-0; maxoverlap<-0; sapply(rrs, function(r) { v<<-v+1; blatscore<-blatscores[r]; if (blatscore >= 0.9) { overlap<<-exonoverlaps[r]/probelengths[r]; score <- blatscore * overlap; if (score >= maxscore) { maxscore<<-score; maxblatscore<<-blatscore; maxindex<<-r; maxoverlap<<-overlap; } num.hits<<-num.hits+1; } if (v %% 100 == 0 ) { print(v); } }); if (maxindex >= 0) { d<<-rbind(d,list(i, maxblatscore, as.character(k[maxindex,"Gene"]), as.character(k[maxindex,"Transcript"]), num.hits, k[maxindex,"threeprimeDist"], k[maxindex,"exonoverlap"]/k[maxindex,"Probe.Length"])); } }); # d<-data.frame(row.names=unlist(d[,1]), unlist(d[,2]), unlist(d[,4]), unlist(d[,5]), unlist(d[,3]), unlist(d[,6]), unlist(d[,7]) ) # names(d)<-c("Blat.Score", "Gene", "Transcript", "Num.hit.genes.or.transcripts", "three.prime.dist", "exonoverlap.fraction") d<-data.frame(row.names=unlist(d[,1]), unlist(d[,2]), unlist(d[,3]), unlist(d[,4]), unlist(d[,5]), unlist(d[,6]) ) names(d)<-c("Blat.Score", "Gene", "Num.hit.genes.or.transcripts", "three.prime.dist", "exonoverlap.fraction") d } # Function to create a density plot of data. - a 2d histogram. hist.2d<-function(x, y, xbreaks=100, ybreaks=100, countmax=-1, col=pal, xlab=NULL, ylab=NULL, main=NULL, ylim=NULL, xlim=NULL) { # create a grid grid<-matrix(0, nrow=ybreaks, ncol=xbreaks); labRow<-NULL; labCol<-NULL; # set up ranges. minx<-min(x, na.rm=T); miny<-min(y, na.rm=T); maxx<-max(x, na.rm=T); maxy<-max(y, na.rm=T); if (is.vector(ylim) && length(ylim) > 0) { miny<-ylim[1]; maxy<-ylim[2]; } if (is.vector(xlim) && length(xlim) > 0) { minx<-xlim[1]; maxx<-xlim[2]; } xbinsize<-(maxx - minx)/xbreaks; ybinsize<-(maxy - miny)/ybreaks; # names<-seq(minx, max(x)-xbinsize, by=xbinsize); # rownames<-seq(miny, max(y)-ybinsize, by=ybinsize); # grid<-data.frame(grid, row.names=as.character(rownames)); # names(grid)<-as.character(names); # iterate over the data, filling in the grid i<-1; for(a in as.vector(x)) { if (!is.numeric(a) || is.na(a)) { next; } if (a < minx) { a<-minx; } if (a > maxx) { a<-maxx; } b<-as.vector(y)[i]; if (!is.numeric(b) || is.na(b)) { next; } if (b < miny) { b<-miny; } if (b > maxy) { b<-maxy; } abin<- floor(( a - minx )/xbinsize); bbin<- floor(( b - miny)/ybinsize); grid[abin,bbin]<-grid[abin,bbin]+1; if (countmax >0) { grid[abin,bbin] <- min(grid[abin,bbin], countmax); } i<-i+1; } # grid <- grid / length(x); print(max(grid)) heatmap(grid, Colv=NA, Rowv=NA, scale="none", col=col, xlab=ylab, ylab=ylab, main=main, labRow=labRow, labCol=labCol); # grid; }