# Tue Apr 19 14:36:08 EDT 2005 setwd("//adtera/projects/pavlidis/grp/collaborations/cincinnati/results/0305/final-analysis") # load(".RData") source("../Functions.R") # load data affy.data<-read.delim("rma-dilution-data.txt", row.names=1, header=T) illu.data.nonorm<-read.delim("100CP-nonorm-data.txt", row.names=1, header=T) # load manufacturer's annotations illu.mfg.annots<-data.frame(read.delim("../100CP-barnes.annots.txt", row.names=1, header=T)) affy.mfg.annots<-read.delim("//adtera/users2/projects/pavlidis/grp/databases/arrays/HG-U133_Plus_2_annot.csv", sep=',', row.names=1, as.is=T, header=T) affy.mfg.annots<-data.frame(affy.mfg.annots[,"Gene.Symbol"], row.names=row.names(affy.mfg.annots)) names(affy.mfg.annots)<-c("gene") names(illu.mfg.annots)<-c("gene") # load annotations and set up we made illu.tp.annots.raw<-unique(read.delim("../seqan/100CP_target.19Apr2005-1246.blat-annots.txt", header=F)); names(illu.tp.annots.raw)<-c("probe", "array", "Probe.Length", "Blat.Matches", "len", "Blat.Score", "Gene", "Transcript", "threeprimeDist", "exonoverlap", "chromosome", "targetStart", "targetEnd"); affy.tp.annots.raw<-unique(read.delim("../seqan/HG-U133_Plus_2_probeCollapsed.fa.19Apr2005-1033.blat-annots.txt", header=F)); names(affy.tp.annots.raw)<-c("probe", "array", "Probe.Length", "Blat.Matches", "len", "Blat.Score", "Gene", "Transcript", "threeprimeDist", "exonoverlap", "chromosome", "targetStart", "targetEnd"); # Tue Apr 26 07:34:26 EDT 2005 The R code to do the 'best' selection was far too slow. I replaced it with java code, output to new files: affy.tp.annots<-read.delim("../seqan/HG-U133_Plus_2_probeCollapsed.best.fa.19Apr2005-1033.blat-annots.txt", header=F, row.names=1) names(affy.tp.annots)<-c("array", "gene", "transcript", "Num.hit.genes.or.transcripts", "Blat.score", "exonoverlap", "threeprimeDist", "alignLength", "chromosome", "targetStart", "targetEnd") illu.tp.annots<-read.delim("../seqan/100CP_target.best.19Apr2005-1246.blat-annots.txt", header=F, row.names=1) names(illu.tp.annots)<-c("array", "gene", "transcript", "Num.hit.genes.or.transcripts", "Blat.score", "exonoverlap", "threeprimeDist", "alignLength", "chromosome", "targetStart", "targetEnd") write.table(affy.tp.annots, "affy.tp.annots.txt", sep='\t', quote=F) write.table(illu.tp.annots, "illu.tp.annots.txt", sep='\t', quote=F) affy.tp.annots<-read.delim("affy.tp.annots.txt", header=T, row.names=1) illu.tp.annots<-read.delim("illu.tp.annots.txt", header=T, row.names=1) ############################################ # Information for Table 1 dim(illu.mfg.annots) dim(affy.mfg.annots) dim(illu.tp.annots) known.genes<-read.table("../known.genes", header=T, as.is=T) illu.known.mfg.probes<-length( which(illu.mfg.annots[,"gene"] %in% known.genes[,"geneName"])); illu.known.mfg.genes<-length(unique((illu.mfg.annots[which(illu.mfg.annots[,"gene"] %in% known.genes[,"geneName"]),]))) illu.nonknown.mfg.probes<-length(which(!( illu.mfg.annots[,"gene"] %in% known.genes[,"geneName"])) illu.known.tp.probes<-length( which(illu.tp.annots[,"gene"] %in% known.genes[,"geneName"])); illu.known.tp.genes<-length(unique((illu.tp.annots[which(illu.tp.annots[,"gene"] %in% known.genes[,"geneName"]),"gene"]))) illu.nonknown.tp.probes<-length(which(!( illu.tp.annots[,"gene"] %in% known.genes[,"geneName"]))); affy.known.mfg.probes<-length(which(affy.mfg.annots[,"gene"] %in% known.genes[,"geneName"])); affy.known.mfg.genes<-length(unique((affy.mfg.annots[which(affy.mfg.annots[,"gene"] %in% known.genes[,"geneName"]),]))) affy.nonknown.mfg.probes<-length(which(!( affy.mfg.annots[,"gene"] %in% known.genes[,"geneName"]))); affy.known.tp.probes<-length(which(affy.tp.annots[,"gene"] %in% known.genes[,"geneName"])); affy.known.tp.genes<-length(unique((affy.tp.annots[which(affy.tp.annots[,"gene"] %in% known.genes[,"geneName"]),"gene"]))) affy.nonknown.tp.probes<-length(which(!( affy.tp.annots[,"gene"] %in% known.genes[,"geneName"]))); length(which(as.character(affy.mfg.annots[,"gene"]) == "---")) length(unique(illu.tp.annots.raw[,"probe"])) length(unique(affy.tp.annots.raw[,"probe"])) length(unique(cross.corr[,"gene"])) length(unique(cross.corr.mfg[,"gene"])) ########### end table 1 ################# # how many probes are close to the 3' end. length(which(affy.tp.annots[,"threeprimeDist"] < 300)) / dim(affy.tp.annots)[1] length(which(illu.tp.annots[,"threeprimeDist"] < 300)) / dim(illu.tp.annots)[1] #### Normalize the Illumina data. library(affy) data(affybatch.example) normalize.methods(affybatch.example) illu.data<-normalize.quantiles(as.matrix(illu.data.nonorm)); illu.data<-data.frame(illu.data, row.names=row.names(illu.data.nonorm)) names(illu.data)<-nam; write.table(illu.data, "../data/100CP-quantilenorm-data.txt"); illu.data<-read.table( "../data/100CP-quantilenorm-data.txt", header=T, row.names=1) illu.data.cn<-normalize.constant(as.matrix(illu.data.nonorm), 500) names(illu.data.cn)<-nam; write.table(illu.data.cn, "../data/100CP-constantnorm-data.txt"); #################################################### # Mon May 9 13:43:45 EDT 2005 # combined data file for clustering. comb.data<-cbind(t(scale(t(log2(affy.data[as.character(cross.corr[,"rprobe"]),])))), t(scale(t(illu.data[as.character(cross.corr[,"sprobe"]),])))) comb.rownames<-cbind(as.character(cross.corr[,"rprobe"]), as.character(cross.corr[,"sprobe"]), as.character(cross.corr[,"gene"])) comb.rownames.scrunch<-apply(comb.rownames, 1, function(x) { paste(x[1], x[2], x[3], sep=".")}) row.names(comb.data)<-comb.rownames.scrunch write.table(file="combined-data.txt", round(comb.data, 3), quote=F, sep="\t", na="") #################################################### # Wed Apr 27 11:57:32 EDT 2005 Basic correlation with dilution profile. corr<-function(x) { cor(x, dilution, use="pairwise.complete.obs") } affy.corr<-apply(2**affy.data, 1, corr); illu.corr<-apply(illu.data, 1, corr); ################################### # Signfiicance of correlation. corr.test.all<-function(x) {cor.test(~ x + dilution)$p.value;} affy.corr.p<-apply(2**affy.data, 1, corr.test.all); illu.corr.p<-apply(illu.data, 1, corr.test.all); library(qvalue) summary(qvalue(affy.corr.p)) summary(qvalue(illu.corr.p)) good.rows<-affy.corr.p[which(qvalue(affy.corr.p)$qvalue<0.05)] min(abs(affy.corr[names(good.rows)])) good.rows<-illu.corr.p[which(qvalue(illu.corr.p)$qvalue<0.05)] min(abs(illu.corr[names(good.rows)])) ################### FIGURE 3 ####################### ps.options(); postscript("../../../doc/illumina-affy/figure-3.ps", family="Helvetica", horizontal=F, width=8, height=8); par(mfrow=c(2,2), cex=1.2) a<-hist(illu.corr, breaks=100, plot=F) plot(col="black", a$mids, a$density, type="s", xlab="Dilution correlation", main="Illumina", ylab="Density", lw=2) mtext("A", side=3, adj=0, cex=3) a<-hist(affy.corr, breaks=100, plot=F) plot(col="black", a$mids, a$density, type="s", xlab="Dilution correlation", main="Affymetrix", ylab="Density", lw=2) mtext("B", side=3, adj=0, cex=3) # refseq/nonreseq a<-hist(illu.corr[ unique(illu.tp.annots.raw[,"probe"] ) ], breaks=200, col="grey", border="grey", xlab="Dilution correlation", plot=F) b<-hist(illu.corr[ which(!(names(illu.corr) %in% unique(illu.tp.annots.raw[,"probe"]) )) ], breaks=200, col="grey", border="grey", xlab="Dilution correlation", main="Illumina non-Known genes", plot=F) plot(col="darkgrey", a$mids, a$density, type="s", xlab="Dilution correlation", ylab="Density", main="", xlim=c(-1,1), ylim=c(0,3), lw=2); par(new=T); plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,3), lw=2); legend(-0.8, 3, legend=c("Illumina known genes", "Illumina 'not-known'"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5) mtext("C", side=3, adj=0, cex=3) a<-hist(affy.corr[ unique(affy.tp.annots.raw[,"probe"] ) ], breaks=200, col="grey", border="grey", xlab="Dilution correlation", plot=F) b<-hist(affy.corr[ which(!(names(affy.corr) %in% unique(affy.tp.annots.raw[,"probe"]) )) ], breaks=200, col="grey", border="grey", xlab="Dilution correlation", main="Affy non-Known genes", plot=F) plot(col="darkgrey", a$mids, a$density, type="s", xlab="Dilution correlation", ylab="Density", main="", xlim=c(-1,1), ylim=c(0,4), lw=2); par(new=T); plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,4), lw=2); legend(-0.8, 3, legend=c("Affymetrix known genes", "Affymetrix 'not-known'"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5) mtext("D", side=3, adj=0, cex=3) dev.off(); ############################################ # effect of expression level on non-cross correlation affy.medians<-apply(2**affy.data, 1, median); illu.medians<-apply(illu.data, 1, median); ########### FIGURE 4 ################################# ps.options(); postscript("../../../doc/illumina-affy/figure-4.ps", family="Helvetica", horizontal=F, width=8, height=5); par(mfcol=c(1,2), cex=1.2) #### A ###### - illumina a<-hist(illu.corr[which(log2(illu.medians) < 8)], col="grey", border="grey", breaks=50, main="Illumina, Exp. < 8", xlab="Correlation", plot=F); b<-hist(illu.corr[which(log2(illu.medians) >= 8)], col="grey", border="grey", breaks=50, main="Illumina, Exp. > 8", xlab="Correlation", plot=F); plot(col="darkgrey", a$mids, a$density, type="s", xlab="Correlation", ylab="Density", main="", xlim=c(-1,1), ylim=c(0,5), lw=2); mtext("A", side=3, adj=0, cex=3) par(new=T); plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,5), lw=2); legend(-0.8, 3, legend=c("Illumina, Exp level < 8", "Illumina, Exp level > 8 "), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5) #### B ##### - affymetrix a<-hist(affy.corr[which(log2(affy.medians) < 5)], col="grey", border="grey", breaks=50, main="Affymetrix, Expression level < 5", xlab="Correlation", plot=F); b<-hist(affy.corr[which(log2(affy.medians) >= 5)], col="grey", border="grey", breaks=50, main="Affymetrix, Expression level > 5", xlab="Correlation", plot=F); plot(col="darkgrey", a$mids, a$density, type="s", xlab="Correlation", ylab="Density", main="", xlim=c(-1,1), ylim=c(0,5), lw=2); mtext("B", side=3, adj=0, cex=3) par(new=T); plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,5), lw=2); legend(-0.8, 3, legend=c("Affymetrix, Exp level < 5", "Affymetrix, Exp level > 5 "), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5) dev.off(); #### Supplementary figures ###### par(mfrow=c(1,2)) plot(illu.corr, log2(illu.medians), pch='.', xlab="Correlation", ylab="Median expression of probe", main="Illumina", ylim=c(5, 15)) plot(affy.corr, log2(affy.medians), pch='.', xlab="Correlation", ylab="Median expression of probe set", main="Affymetrix", ylim=c(0, 15)) # filtering out genes that don't show dilution (see below) ############################################ # Cross-correlation (log transformation is irrelevant if we use Spearman) cross.corr<-rs(rdataset=affy.data, sdataset=illu.data, sarray=illu.tp.annots, rarray=affy.tp.annots, method="spearman" ); write.table(cross.corr, quote=F, file="cross.corr.txt", sep='\t') cross.corr<-read.table("cross.corr.txt", header=T, sep='\t', row.names=1) # pearson correlation, using log-transformed data. #cross.corr.pearson<-rs(rdataset=affy.data, sdataset=log2(illu.data), sarray=illu.tp.annots, rarray=affy.tp.annots, method="pearson" ); #write.table(cross.corr.pearson, quote=F, file="cross.corr.pearson.txt", sep='\t') #cross.corr.pearson<-read.table("cross.corr.pearson.txt", header=T, sep='\t', row.names=1) # pearson correlation, using non-log-transformed data. affy.data.exp<-2^affy.data names(affy.data.exp)<-names(affy.data) row.names(affy.data.exp)<-row.names(affy.data) affy.data.exp<-data.frame(affy.data.exp) cross.corr.pearson.nolog<-rs(rdataset=affy.data.exp, sdataset=illu.data, sarray=illu.tp.annots, rarray=affy.tp.annots, method="pearson" ); write.table(cross.corr.pearson.nolog, quote=F, file="cross.corr.pearson.nolog.txt", sep='\t') cross.corr.pearson.nolog<-read.table("cross.corr.pearson.nolog.txt", header=T, sep='\t', row.names=1) # to get the pvalues cross.corr.pvalues<-rs(rdataset=affy.data, sdataset=illu.data, sarray=illu.tp.annots, rarray=affy.tp.annots, method="spearman", test=T ); write.table(cross.corr.pvalues, quote=F, file="cross.corr.pvalues.txt", sep='\t') cross.corr.pvalues<-read.table("cross.corr.pvalues.txt", header=T, row.names=1) # pvalues for pearson, using exponentiated affy data. cross.corr.pvalues.pearson<-rs(rdataset=affy.data.exp, sdataset=illu.data, sarray=illu.tp.annots, rarray=affy.tp.annots, method="pearson", test=T ); write.table(cross.corr.pvalues.pearson, quote=F, file="cross.corr.pvalues.pearson.txt", sep='\t') cross.corr.pvalues.pearson<-read.table("cross.corr.pvalues.pearson.txt", header=T, row.names=1) hist(cross.corr.pvalues.pearson[,"pvalue"]) plot(-log10(cross.corr.pvalues.pearson[,"pvalue"]), cross.corr.pvalues.pearson[,"correl"]) library(qvalue) summary(qvalue(cross.corr.pvalues.pearson[,"pvalue"])) # find the threshold correlation qvals<-qvalue(cross.corr.pvalues.pearson[,"pvalue"])$qvalues good.rows<-cross.corr.pvalues.pearson[which(qvals<0.05),] min(abs(good.rows[,"correl"])) # bonferroni: length(which ( (cross.corr.pvalues.pearson[,"pvalue"] * length(cross.corr.pvalues.pearson[,"pvalue"])) < 0.05)) / length(cross.corr.pvalues.pearson[,"pvalue"]) # Thu Aug 4 18:10:13 EDT 2005 # filter the results to remove probes which are not "significantly" diluted. Using non-log-transformed data. affy.corr.good.names<-names(affy.corr.p[ which(qvalue(affy.corr.p)$qvalue < 0.05)]) affy.corr.p.good<-affy.data.exp[names(affy.corr.p[ which(qvalue(affy.corr.p)$qvalue < 0.05)]),] illu.corr.good.names<-names(illu.corr.p[ which(qvalue(illu.corr.p)$qvalue < 0.05)]) illu.corr.p.good<-illu.data[names(illu.corr.p[ which(qvalue(illu.corr.p)$qvalue < 0.05)]),] cross.corr.pvalues.pearson.filtered<-rs(rdataset=affy.corr.p.good, sdataset=illu.corr.p.good, sarray=illu.tp.annots, rarray=affy.tp.annots, method="pearson", test=T ); write.table(cross.corr.pvalues.pearson.filtered, quote=F, file="cross.corr.pvalues.pearson.filtered.txt", sep='\t') cross.corr.pvalues.pearson.filtered<-read.table("cross.corr.pvalues.pearson.filtered.txt", header=T, row.names=1) cross.corr.pvalues.filtered<-rs(rdataset=affy.corr.p.good, sdataset=illu.corr.p.good, sarray=illu.tp.annots, rarray=affy.tp.annots, method="spearman", test=T ); write.table(cross.corr.pvalues.filtered, quote=F, file="cross.corr.pvalues.filtered.txt", sep='\t') cross.corr.pvalues.filtered<-read.table("cross.corr.pvalues.filtered.txt", header=T, row.names=1) hist(cross.corr.pvalues.pearson.filtered[,"pvalue"]) summary(qvalue(cross.corr.pvalues.pearson.filtered[,"pvalue"])) # bonferroni: length(which ( (cross.corr.pvalues.pearson.filtered[,"pvalue"] * length(cross.corr.pvalues.pearson.filtered[,"pvalue"])) < 0.05)) / length(cross.corr.pvalues.pearson.filtered[,"pvalue"]) hist(cross.corr.pvalues.pearson.filtered[,"correl"]) # Thu Aug 4 18:16:44 EDT 2005 # looking at alignment length. affy.alignment.lengths<-read.table("../seqan/HG-U133_Plus_2_alignment_lengths.txt", header=F) illu.alignment.lengths<-read.table("../seqan/100CP_alignment_lengths.txt", header=F) names(illu.alignment.lengths)<-c("probe", "length") names(affy.alignment.lengths)<-c("probe", "length") median(affy.alignment.lengths[,2]) median(illu.alignment.lengths[,2]) affy.puzzling.probes<-read.table("affy.puzzling.probes.txt", header=F) illu.puzzling.probes<-read.table("illu.puzzling.probes.txt", header=F) fisher.test(rbind(c( length(which(illu.alignment.lengths[illu.alignment.lengths[,"probe"] %in% illu.puzzling.probes[,1],"length"] > 50)) , length(illu.puzzling.probes[,1]) ), c( length(which(illu.alignment.lengths[,2] > 50)) , length(illu.alignment.lengths[,1]) ))) fisher.test(rbind(c( length(which(affy.alignment.lengths[affy.alignment.lengths[,"probe"] %in% affy.puzzling.probes[,1],"length"] > 450)) , length(affy.puzzling.probes[,1]) ) , c( length(which(affy.alignment.lengths[,2] > 481)) , length(affy.alignment.lengths[,1]) ) )) # Mon Jul 25 14:09:04 EDT 2005 # cross hybridization # Perl script extracts high-quality blat matches to more than one site in the genome. Files in seqan (file names misleading) - see README affy.multiple.hits<-read.delim("../seqan/affy.analyzedHits", header=F) illu.multiple.hits<-read.delim("../seqan/illu.analyzedHits", header=F) fisher.test(rbind( c( length(which (affy.multiple.hits[,1] %in% affy.puzzling.probes[,1]) ) , length(affy.puzzling.probes[,1]) ) ,c( length(affy.multiple.hits[,1]) , length(affy.alignment.lengths[,1]) ) )); fisher.test(rbind( c( length(which (illu.multiple.hits[,1] %in% illu.puzzling.probes[,1]) ) , length(illu.puzzling.probes[,1]) ) ,c( length(illu.multiple.hits[,1]) , length(illu.alignment.lengths[,1]) ) )); par(mfrow=c(1,2)) a<-hist(log10(affy.tp.annots[affy.puzzling.probes[,1],"Num.hit.genes.or.transcripts"]), plot=F, breaks=seq(from=0, to=5, by=0.2)) b<-hist(log10(affy.tp.annots[,"Num.hit.genes.or.transcripts"]), plot=F, breaks=seq(from=0, to=5, by=0.2)) plot(col="darkgrey", a$mids, a$density, type="s", xlab="log(Number of aligned genes or transcripts)", ylab="Density", main="", xlim=c(0,2), ylim=c(0,2), lw=2); par(new=T); plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(0,2), ylim=c(0,2), lw=2); legend(1., 1, legend=c("Poor agreement", "Population"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=1.0) ks.test(hist(affy.tp.annots[affy.puzzling.probes[,1],"Num.hit.genes.or.transcripts"], plot=F)$density, (hist(affy.tp.annots[,"Num.hit.genes.or.transcripts"], plot=F)$density)) mean(affy.tp.annots[illu.puzzling.probes[,1],"Num.hit.genes.or.transcripts"]) mean(affy.tp.annots[,"Num.hit.genes.or.transcripts"], na.rm=T) a<-hist(log10(illu.tp.annots[illu.puzzling.probes[,1],"Num.hit.genes.or.transcripts"]), plot=F, breaks=seq(from=0, to=5, by=0.2)) b<-hist(log10(illu.tp.annots[,"Num.hit.genes.or.transcripts"]), plot=F, breaks=seq(from=0, to=5, by=0.2)) plot(col="darkgrey", a$mids, a$density, type="s", xlab="Number of aligned genes or transcripts", ylab="Density", main="", xlim=c(0,2), ylim=c(0,2), lw=2); par(new=T); plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(0,2), ylim=c(0,2), lw=2); legend(1., 1, legend=c("Poor agreement", "Population"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=1.0) ks.test(hist(illu.tp.annots[affy.puzzling.probes[,1],"Num.hit.genes.or.transcripts"], plot=F)$density, (hist(illu.tp.annots[,"Num.hit.genes.or.transcripts"], plot=F)$density)) mean(illu.tp.annots[illu.puzzling.probes[,1],"Num.hit.genes.or.transcripts"]) mean(illu.tp.annots[,"Num.hit.genes.or.transcripts"], na.rm=T) # comparing across the platforms for the "puzzling" probes: a<-hist(log10(affy.tp.annots[,"Num.hit.genes.or.transcripts"]), plot=F, breaks=seq(from=0, to=5, by=0.2)) b<-hist(log10(illu.tp.annots[,"Num.hit.genes.or.transcripts"]), plot=F, breaks=seq(from=0, to=5, by=0.2)) plot(col="darkgrey", a$mids, a$density, type="s", xlab="Number of aligned genes or transcripts", ylab="Density", main="", xlim=c(0,2), ylim=c(0,2), lw=2); par(new=T); plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(0,2), ylim=c(0,2), lw=2); legend(1.5, 1, legend=c("Affymetrix", "Illumina"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=1) ks.test(hist(affy.tp.annots[affy.puzzling.probes[,1],"Num.hit.genes.or.transcripts"], plot=F)$density, (hist(illu.tp.annots[illu.puzzling.probes[,1],"Num.hit.genes.or.transcripts"], plot=F)$density)) hist(affy.tp.annots[affy.puzzling.probes[,1],"Num.hit.genes.or.transcripts"] - illu.tp.annots[illu.puzzling.probes[,1],"Num.hit.genes.or.transcripts"], breaks=200, xlim=c(-15,15)) # plot(affy.tp.annots[affy.puzzling.probes[,1],"Num.hit.genes.or.transcripts"] - illu.tp.annots[illu.puzzling.probes[,1],"Num.hit.genes.or.transcripts"], cross.corr[which(rprobe %in% affy.puzzling.probes[,1]),"correl"]) t.test(affy.tp.annots[affy.puzzling.probes[,1],"Num.hit.genes.or.transcripts"], illu.tp.annots[illu.puzzling.probes[,1],"Num.hit.genes.or.transcripts"], paired=T) cross.corr.pearson.nolog.pvalues<-rs(rdataset=affy.data.exp, sdataset=illu.data, sarray=illu.tp.annots, rarray=affy.tp.annots, method="pearson", test=T ); write.table(cross.corr.pearson.nolog.pvalues, quote=F, file="cross.corr.pearson.nolog.pvalues.txt", sep='\t') corrected.pval<-cross.corr.pearson.nolog.pvalues[,"pvalue"] * length(cross.corr.pearson.nolog.pvalues[,"pvalue"]) # # find threshold for filtering data. affy.mean<-apply(affy.data, 1, mean) quantile(affy.mean) quantile(apply(illu.data, 1, mean), na.rm=T) # To repeat analysis with pearson correlation: # cross.corr<-cross.corr.pearson.nolog # Relation between expression and 3' location everything<-cbind(cross.corr, affy.tp.annots[match(cross.corr[,"rprobe"], row.names(affy.tp.annots)),"threeprimeDist"], illu.tp.annots[match(cross.corr[,"sprobe"], row.names(illu.tp.annots)),"threeprimeDist"] , affy.tp.annots[match(cross.corr[,"rprobe"], row.names(affy.tp.annots)),"targetStart"] , affy.tp.annots[match(cross.corr[,"rprobe"], row.names(affy.tp.annots)),"targetEnd"] , illu.tp.annots[match(cross.corr[,"sprobe"], row.names(illu.tp.annots)),"targetStart"] , illu.tp.annots[match(cross.corr[,"sprobe"], row.names(illu.tp.annots)),"targetEnd"] ) #everything<-cbind(everything, min(abs(everything[,"affy.targ.start"] - #everything[,"illu.targ.start"]),abs(everything[,"affy.targ.end"] - #everything[,"illu.targ.end"]),abs(everything[,"affy.targ.start"] - #everything[,"illu.targ.end"]),abs(everything[,"affy.targ.start"] - #everything[,"illu.targ.end"] ))) names(everything)<-c("gene", "affyprobe", "illuprobe", "correl", "explev.affy", "explev.illu", "affy.dist", "illu.dist", "affy.targ.start", "affy.targ.end", "illu.targ.start", "illu.targ.end") attach(everything) diff<- abs(((everything[,"affy.targ.end"] - everything[,"affy.targ.start"])/2 + affy.targ.start) - ((everything[,"illu.targ.end"] - everything[,"illu.targ.start"])/2 + illu.targ.start)); everything<-cbind(everything,diff) names(everything)<-c("gene", "affyprobe", "illuprobe", "correl", "explev.affy", "explev.illu", "affy.dist", "illu.dist", "affy.targ.start", "affy.targ.end", "illu.targ.start", "illu.targ.end", "diff") # Thu May 12 10:42:39 EDT 2005 # in the cross-platform agreement, are the disagreement among genes # that do not look like the dilution profile in either platform? affy.corr.ev<-affy.corr[as.character(everything[,"affyprobe"])] illu.corr.ev<-illu.corr[as.character(everything[,"illuprobe"])] everything<-data.frame(everything, affy.corr.ev, illu.corr.ev, pmax(abs(affy.corr.ev), abs(illu.corr.ev))) names(everything)<-c("gene", "affyprobe", "illuprobe", "correl", "explev.affy", "explev.illu", "affy.dist", "illu.dist", "affy.targ.start", "affy.targ.end", "illu.targ.start", "illu.targ.end", "diff", "aff.corr", "illu.corr", "max.corr.single") attach(everything) write.table(file="everything.txt", everything, sep='\t', quote=F) everything<-read.delim("everything.txt", header=T, row.names=1) ############################################ ### FIGURE 5 ######### ps.options() postscript("../../../doc/illumina-affy/figure-5.ps", family="Helvetica", horizontal=T, width=10, height=4.5); par(mfrow=c(1, 3), cex=1.2) ### A ##### a<-hist(correl, col="grey", breaks=50, border="grey", main="Cross-platform comparison of profiles", xlab="Correlation" , xlim=c(-1,1), plot=F) plot(col="black", a$mids, a$density, type="s", xlab="Correlation cross-platform,\nall genes", ylab="Density", main="", lw=2); mtext("A", side=3, adj=0, cex=3) ### B #### highmax<-which(max.corr.single > 0.8) a<-hist(correl[highmax], breaks=50, plot=F) b<-hist(correl[-highmax], breaks=50, plot=F) plot(col="darkgrey", a$mids, a$density, type="s", xlab="Correlation cross-platform", ylab="Density", main="", xlim=c(-1,1), ylim=c(0,5), lw=2); par(new=T); plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,5), lw=2); legend(-0.8, 3, legend=c("High dilution effect (>0.8)", "Low dilution effect (<0.8)"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5) mtext("B", side=3, adj=0, cex=3) ### C #### psg<-read.table("../placenta-genes") a<-hist(cross.corr[which(cross.corr[,"gene"] %in% psg[,"V1"]),"correl"], col="grey", border="grey", breaks=50, main="Agreement cross-platform for placenta/blood genes", xlab="Correlation", xlim=c(-1,1), plot=F) plot(col="black", a$mids, a$density, type="s", xlab="Correlation cross-platform,\nplacenta/blood genes", ylab="Density", main="", xlim=c(-1,1), lw=2, cex=0.5); mtext("C", side=3, adj=0, cex=3) dev.off(); ### supplement plot(max.corr.single, correl, pch=20, cex=0.5, xlab="Best single-data set dilution profile correlation", ylab="Cross-platform correlation", main="All probes"); ############# FIGURE 6 - effect of expression level ##################3 attach(everything) ps.options() postscript("../../../doc/illumina-affy/figure-6.ps", family="Helvetica", horizontal=F, width=8, height=5); par(mfrow=c(1, 2), cex=1.2) #### A ##### a<-hist(correl[which(explev.affy<5)], breaks=50, plot=F); b<-hist(correl[which(explev.affy>=5)], breaks=50, plot=F); plot(col="darkgrey", a$mids, a$density, type="s", xlab="Cross-platform correlation", ylab="Density", main="", xlim=c(-1,1), ylim=c(0,3), lw=2); par(new=T); plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,3), lw=2); mtext("A", side=3, adj=0, cex=3) legend(-0.8, 3, legend=c("log2 Exp level < 5", "log2 Exp level > 5 "), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5) ### B ##### a<-hist(correl[which(log10(diff) <= 2.69)], breaks=50, plot=F); b<-hist(correl[which(log10(diff) > 2.69)], breaks=50, plot=F); plot(col="darkgrey", a$mids, a$density, type="s", xlab="Cross-platform correlation", ylab="Density", main="", xlim=c(-1,1), ylim=c(0,3), lw=2); par(new=T); plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,3), lw=2); mtext("B", side=3, adj=0, cex=3) legend(-0.8, 3, legend=c("Probe spacing < 300bp", "Probe spacing > 300bp"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5) dev.off(); # Supplement plot(explev.affy, jitter(correl), pch=20, cex=0.2, xlab="Expression level", ylab="Correlation", main="Effect of expression level on cross-platform agreement") plot(log10(diff), correl, pch=20, cex=0.2, ylab="Correlation of profiles across platforms", xlab="Abs(log10(distance between probes across platforms))") c<-hist(correl[explev.affy<5 & (illuprobe %in% illu.corr.good.names | affyprobe %in% affy.corr.good.names)], breaks=50, plot=F); d<-hist(correl[explev.affy>=5 & (illuprobe %in% illu.corr.good.names | affyprobe %in% affy.corr.good.names)], breaks=50, plot=F); plot(col="darkgrey", c$mids, c$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,3), lw=2); par(new=T); plot(col="black", d$mids, d$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,3), lw=2); legend(-0.8, 3, legend=c("Exp level < 5, sig. dilution effect", "Exp level > 5, sig dilution effect "), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5) a<-hist(correl[log10(diff) <= 2.69 & (illuprobe %in% illu.corr.good.names | affyprobe %in% affy.corr.good.names)], breaks=50, plot=F); b<-hist(correl[log10(diff) > 2.69 & (illuprobe %in% illu.corr.good.names | affyprobe %in% affy.corr.good.names)], breaks=50, plot=F); plot(col="darkgrey", a$mids, a$density, type="s", xlab="Cross-platform correlation", ylab="Density", main="", xlim=c(-1,1), ylim=c(0,3), lw=2); par(new=T); plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,3), lw=2); legend(-0.8, 3, legend=c("Probe spacing < 300bp, diluton effect", "Probe spacing > 300bp, dilution effect"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5) ############################################### # statistics for the text. # number of mappable affy probes length(unique(everything[,"affyprobe"])) # number of mappable illumina probes length(unique(everything[,"illuprobe"])) #number of mapped genes length(unique(everything[,"gene"])) # percent of probes with corrleations > 0.8 with dilution profile length(which(abs(affy.corr) > 0.8))/length(affy.corr) length(which(abs(illu.corr) > 0.8))/length(illu.corr) 30730/length(affy.corr) 12041/length(illu.corr) ################ MORE SUPPLEMENTARY DATA ###################################### #### pairwise plots of all the data plot(illu.data, pch='.') plot(log2(illu.data), pch='.') plot(affy.data, pch='.') plot(2**(affy.data), pch='.') # Tue May 3 22:09:41 EDT 2005 # using the manufacturer's annotations. cross.corr.mfg<-rs(rdataset=affy.data, sdataset=illu.data, sarray=illu.mfg.annots, rarray=affy.mfg.annots, method="spearman" ); write.table(cross.corr.mfg, quote=F, file="cross.corr.mfg.txt", sep='\t') attach(cross.corr.mfg) par(mfrow=c(1,2)) hist(correl, breaks=50, col="grey", border="grey", main="Cross-platform comparison of profiles \n(Mfg. annots)", xlab="Correlation" , xlim=c(-1,1)) hist(cross.corr[which(cross.corr[,"gene"] %in% psg[,"V1"]),"correl"], col="grey", border="grey", breaks=50, main="Agreement cross-platform for placenta/blood genes \n(Mfg. annots)", xlab="Correlation", xlim=c(-1,1)) # put all 4 on one graph par(mfrow=c(2,2)) attach(cross.corr) hist(correl, breaks=50, col="grey", border="grey", main="Cross-platform comparison of profiles", xlab="Correlation" , xlim=c(-1,1)) hist(cross.corr[which(cross.corr[,"gene"] %in% psg[,"V1"]),"correl"], col="grey", border="grey", breaks=50, main="Agreement cross-platform for placenta/blood genes", xlab="Correlation", xlim=c(-1,1)) attach(cross.corr.mfg) hist(correl, breaks=50, col="grey", border="grey", main="Cross-platform comparison of profiles \n(Mfg. annots)", xlab="Correlation" , xlim=c(-1,1)) hist(cross.corr[which(cross.corr[,"gene"] %in% psg[,"V1"]),"correl"], col="grey", border="grey", breaks=50, main="Agreement cross-platform for placenta/blood genes \n(Mfg. annots)", xlab="Correlation", xlim=c(-1,1)) attach(cross.corr) ### Three prime location of probes. (all) par(mfrow=c(1,2)) hist(log10(affy.tp.annots.raw[,"threeprimeDist"]), breaks=500, col="grey", border="grey", xlab="distance from 3' end of gene (log10 bases)", main="Affymetrix probe locations", xlim=c(0,8), freq=F) hist(log10(illu.tp.annots.raw[,"threeprimeDist"]), breaks=500, col="grey", border="grey", xlab="distance from 3' end of gene (log10 bases)", main="Illumina probe locations", xlim=c(0,8), freq=F) # Best par(mfrow=c(1,2)) hist(log10(affy.tp.annots[,"threeprimeDist"]), breaks=500, col="grey", border="grey", xlab="Best distance from 3' end of gene (log10 bases)", main="Affymetrix probe locations", xlim=c(0,7), freq=F) hist(log10(illu.tp.annots[,"threeprimeDist"]), breaks=500, col="grey", border="grey", xlab="Best distance from 3' end of gene (log10 bases)", main="Illumina probe locations", xlim=c(0,7), freq=F) # Exon overlaps (not very interesing) par(mfrow=c(1,2)) hist(illu.tp.annots[,"exonoverlap"], col="grey", border="grey", breaks=50, xlab="Exon overlap, fraction of probe length", main="Illumina"); hist(affy.tp.annots[,"exonoverlap"], col="grey", border="grey", breaks=50, xlab="Exon overlap, fraction of probe length", main="Affymetrix"); # Effect of overlap on signal par(mfcol=c(2,2)) hist(log2(apply(illu.data[row.names(illu.tp.annots)[which(illu.tp.annots[,"exonoverlap"] ==1.0)],], 1, mean)), breaks=200, border="grey", col="grey", main="Illumina, Exon overlap = 1.0", xlab="log2 Median expression") hist(log2(apply(illu.data[row.names(illu.tp.annots)[which(illu.tp.annots[,"exonoverlap"] <1.0)],], 1, mean)), breaks=200, border="grey", col="grey", main="Illumina, Exon overlap <1.0", xlab="log 2Median expression") hist(apply(affy.data[row.names(affy.tp.annots)[which(affy.tp.annots[,"exonoverlap"] ==1.0)],], 1, mean), breaks=200, border="grey", col="grey", main="Affy, Exon overlap = 1.0", xlab="Median expression") hist(apply(affy.data[row.names(affy.tp.annots)[which(affy.tp.annots[,"exonoverlap"] <1.0)],], 1, mean), breaks=200, border="grey", col="grey", main="Affy, Exon overlap <1.0", xlab="Median expression") # Relation between exon overlap and threeprime dist -- in the RAW annotations. par(mfcol=c(2,2)) hist(log10(illu.tp.annots.raw[,"threeprimeDist"][which(illu.tp.annots.raw[,"exonoverlap"]/illu.tp.annots.raw[,"Probe.Length"] == 1.0)]), breaks=200, border="grey", col="grey", main="Illumina, Exon overlap=1", xlim=c(0,6), ylim=c(0,2000), xlab="log10(3'distance)") hist(log10(illu.tp.annots.raw[,"threeprimeDist"][which(illu.tp.annots.raw[,"exonoverlap"]/illu.tp.annots.raw[,"Probe.Length"] < 1)]), breaks=200, border="grey", col="grey", main="Illumina, Exon overlap<1", xlim=c(0,6), ylim=c(0,2000), xlab="log10(3'distance)") hist(log10(affy.tp.annots.raw[,"threeprimeDist"][which(affy.tp.annots.raw[,"exonoverlap"]/affy.tp.annots.raw[,"Probe.Length"] == 1)]), breaks=200, border="grey", col="grey", main="Exon overlap=1", xlim=c(0,6), ylim=c(0,2000), xlab="log10(3'distance)") hist(log10(affy.tp.annots.raw[,"threeprimeDist"][which(affy.tp.annots.raw[,"exonoverlap"]/affy.tp.annots.raw[,"Probe.Length"] < 1)]), breaks=200, border="grey", col="grey", main="Exon overlap<1", xlim=c(0,6), ylim=c(0,2000), xlab="log10(3'distance)") # same, but for BEST par(mfcol=c(2,2)) hist(log10(illu.tp.annots[,"threeprimeDist"][which(illu.tp.annots[,"exonoverlap"] == 1.0)]), breaks=200, border="grey", col="grey", main="Illumina, Exon overlap=1", xlim=c(0,6), ylim=c(0,1500), xlab="log10(3'distance)") hist(log10(illu.tp.annots[,"threeprimeDist"][which(illu.tp.annots[,"exonoverlap"] < 1)]), breaks=200, border="grey", col="grey", main="Illumina, Exon overlap<1", xlim=c(0,6), ylim=c(0,1500), xlab="log10(3'distance)") hist(log10(affy.tp.annots[,"threeprimeDist"][which(affy.tp.annots[,"exonoverlap"] == 1)]), breaks=200, border="grey", col="grey", main="Affymetrix, Exon overlap=1 (best hits)", xlim=c(0,6), ylim=c(0,1500), xlab="log10(3'distance)") hist(log10(affy.tp.annots[,"threeprimeDist"][which(affy.tp.annots[,"exonoverlap"] < 1)]), breaks=200, border="grey", col="grey", main="Affymetrix, Exon overlap<1 (best hits)", xlim=c(0,6), ylim=c(0,1500), xlab="log10(3'distance)") # relation of threeprime distance with profile correlation - very little to see. plot(log10(diff), correl, pch=20, cex=0.2, ylab="Correlation of profiles across platforms", xlab="abs(log10(distance between probes across platforms))") # probe location effect, more # for genes expressed at high levels highexp<-which(explev.affy > 7); plot(log10(diff[highexp]), correl[highexp], pch=20, cex=0.2, ylab="Correlation of profiles across platforms", xlab="Abs(log10(distance between probes across platforms))") a<-hist(correl[which(log10(diff) <= 3 & explev.affy > 7)], breaks=50, col="grey", border="grey", xlab="Dilution correlation, probe spacing < 300bp", main="", plot=F); b<-hist(correl[which(log10(diff) > 3 & explev.affy > 7)], breaks=50, col="grey", border="grey", xlab="Dilution correlation, probe spacing > 300bp", main="", plot=F); plot(col="darkgrey", a$mids, a$density, type="s", xlab="Correlation", ylab="Density", main="", xlim=c(-1,1), ylim=c(0,8), lw=2); par(new=T); plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,8), lw=2); legend(-0.8, 3, legend=c("Dilution correlation, probe spacing < 300bp, high expression", "Dilution correlation, probe spacing > 300bp, high expression"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2) cor.test(log10(diff[highexp]), correl[highexp], method="spearman") # linear model -- not so good! meanexp<-(explev.affy + log2(explev.illu))/2 zcorrel<-(log(1+correl)-(log(1-correl)))/2 logdiff<-log10(diff+1); use<-which(!is.na(logdiff)) aov.res<-lm(correl ~ diff + meanexp) summary(aov.res) # Wed May 4 14:36:14 EDT 2005 # computing fold changes foldchange<-function(x, islog=FALSE) { low<-mean(x[1:2]); high<-mean(x[11:12]); foldchange<-0; if (islog) { foldchange<-low - high; } else { foldchange<-log2(low/high) } foldchange; } affy.foldchange<-apply(affy.data, 1, foldchange, islog=T) illu.foldchange<-apply(illu.data, 1, foldchange, islog=F); ############################################ #par(mfrow=c(2, 1)) a<-hist(illu.foldchange, breaks=50, freq=FALSE, plot=F ) b<-hist(affy.foldchange, breaks=50, freq=FALSE, plot=F ) plot(a$mids, log(a$density), type="s", ylab="log10(Density)", main="Fold changes compared", xlab="Fold change (log2 units)", xlim=c(-10,10), ylim=c(-10,1.5), col="darkgrey", lw=2) par(new=T) plot(b$mids, log(b$density), type="s", main="", xlab="", ylab="", xlim=c(-10,10), ylim=c(-10,1.5), col="black", lw=2) legend(-0.8, 3, legend=c("Illumina", "Affymetrix"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2) foldchange.diff<-rs(rdataset=affy.foldchange, sdataset=illu.foldchange, sarray=illu.tp.annots, rarray=affy.tp.annots, method="spearman", f=function(x,y){x-y;} ); hist(foldchange.diff[,4], col="grey", border="grey", breaks=50, main="Difference between platforms", xlab="difference in log2 fold change") # Wed May 11 21:32:40 EDT 2005 # Examine 3' distance using middle. affy.mid.annots<-read.delim("../seqan/HG-U133_Plus_2_probeCollapsed.best.fa.19Apr2005-1033.blat-annots.center.txt", header=F, row.names=1) names(affy.mid.annots)<-c("array", "gene", "transcript", "Num.hit.genes.or.transcripts", "Blat.score", "exonoverlap", "threeprimeDist") illu.mid.annots<-read.delim("../seqan/100CP_target.best.19Apr2005-1246.blat-annots.center.txt", header=F, row.names=1) names(illu.mid.annots)<-c("array", "gene", "transcript", "Num.hit.genes.or.transcripts", "Blat.score", "exonoverlap", "threeprimeDist") #write.table(affy.mid.annots, "affy.mid.annots.txt", sep='\t', quote=F) #write.table(illu.mid.annots, "illu.mid.annots.txt", sep='\t', quote=F) affy.mid.annots<-read.delim("affy.mid.annots.txt", header=T, row.names=1) illu.mid.annots<-read.delim("illu.mid.annots.txt", header=T, row.names=1) par(mfcol=c(2,2)) hist(log10(illu.mid.annots[,"threeprimeDist"][which(illu.mid.annots[,"exonoverlap"] == 1.0)]), breaks=200, border="grey", col="grey", main="Illumina, Exon overlap=1", xlim=c(0,6), ylim=c(0,1500), xlab="log10(3'distance)") hist(log10(illu.mid.annots[,"threeprimeDist"][which(illu.mid.annots[,"exonoverlap"] < 1)]), breaks=200, border="grey", col="grey", main="Illumina, Exon overlap<1", xlim=c(0,6), ylim=c(0,1500), xlab="log10(3'distance)") hist(log10(affy.mid.annots[,"threeprimeDist"][which(affy.mid.annots[,"exonoverlap"] == 1)]), breaks=200, border="grey", col="grey", main="Affymetrix, Exon overlap=1 (best hits)", xlim=c(0,6), ylim=c(0,1500), xlab="log10(3'distance)") hist(log10(affy.mid.annots[,"threeprimeDist"][which(affy.mid.annots[,"exonoverlap"] < 1)]), breaks=200, border="grey", col="grey", main="Affymetrix, Exon overlap<1 (best hits)", xlim=c(0,6), ylim=c(0,1500), xlab="log10(3'distance)") par(mfrow=c(1,2)) hist(log10(affy.mid.annots[,"threeprimeDist"]), breaks=500, col="grey", border="grey", xlab="Best center distance from 3' end of gene (log10 bases)", main="Affymetrix probe locations", xlim=c(0,7), freq=F) hist(log10(illu.mid.annots[,"threeprimeDist"]), breaks=500, col="grey", border="grey", xlab="Best center distance from 3' end of gene (log10 bases)", main="Illumina probe locations", xlim=c(0,7), freq=F) everything.mid<-cbind(cross.corr, affy.mid.annots[match(cross.corr[,"rprobe"], row.names(affy.mid.annots)),"threeprimeDist"], illu.mid.annots[match(cross.corr[,"sprobe"], row.names(illu.mid.annots)),"threeprimeDist"]) names(everything.mid)<-c("gene", "affyprobe", "illuprobe", "correl", "explev.affy", "explev.illu", "affy.dist", "illu.dist") everything.mid<-cbind(everything.mid, abs(everything.mid[,"affy.dist"] - everything.mid[,"illu.dist"])) names(everything.mid)<-c("gene", "affyprobe", "illuprobe", "correl", "explev.affy", "explev.illu", "affy.dist", "illu.dist", "diff") attach(everything.mid) plot(log10(diff), correl, pch=20, cex=0.2, ylab="Correlation of profiles across platforms", xlab="Abs(log10(distance between probes across platforms))") a<-hist(correl[which(log10(diff) <= 3)], breaks=50, col="grey", border="grey", xlab="Dilution correlation, probe spacing < 300bp", main="", plot=F); b<-hist(correl[which(log10(diff) > 3)], breaks=50, col="grey", border="grey", xlab="Dilution correlation, probe spacing > 300bp", main="", plot=F); plot(col="darkgrey", a$mids, a$density, type="s", xlab="Correlation", ylab="Density", main="", xlim=c(-1,1), ylim=c(0,5), lw=2); par(new=T); plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,5), lw=2); legend(-0.8, 3, legend=c("Dilution correlation, probe spacing < 300bp", "Dilution correlation, probe spacing > 300bp"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2) # Thu May 12 13:33:34 EDT 2005 combining selection criteria....not so successful. reallygood<-which((max.corr.single > 0.8) & (explev.affy > 5) ) a<-hist(correl[reallygood], breaks=103) b<-hist(correl[-reallygood], breaks=103) plot(col="darkgrey", a$mids, a$density, type="s", xlab="Correlation", ylab="Density", main="", xlim=c(-1,1), ylim=c(0,8), lw=2); par(new=T); plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,8), lw=2); legend(-0.8, 3, legend=c("Probes meeting all three criteria", "Everything else"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2) save.image(); # Mon Jul 25 16:48:28 EDT 2005 # extracting 3' probe. See README in ../../../data/0305/Affy. Trim and rearrange columns by hand. affy.3pdata<-read.table("../../../data/0305/Affy/affy.3pm.data.txt", header=T, row.names=1) affy.3p.corr<-apply(affy.3pdata, 1, corr); cross.3pa.corr<-rs(rdataset=affy.3pdata, sdataset=illu.data, sarray=illu.tp.annots, rarray=affy.tp.annots, method="spearman" ); write.table(cross.3pa.corr, quote=F, file="cross.3pa.corr.txt", sep='\t') cross.3pa.corr<-read.table("cross.3pa.corr.txt", header=T, row.names=1); # Fri Aug 5 09:24:38 EDT 2005 # within-dataset comparisons illu.cross.within<-cross.corr.within(illu.data, illu.tp.annots) write.table(illu.cross.within, "illu.cross.within.txt", quote=F, sep='\t') affy.cross.within<-cross.corr.within(affy.data, affy.tp.annots) write.table(affy.cross.within, "affy.cross.within.txt", quote=F, sep='\t') illu.cross.within<-read.table("illu.cross.within.txt", header=T, row.names=1) affy.cross.within<-read.delim("affy.cross.within.txt", header=T, row.names=1, sep='\t') # useful statistics: # fraction of agreement over 0.8 length(which(illu.cross.within[,"correl"] > 0.8)) / length(illu.cross.within[,"correl"]) length(which(affy.cross.within[,"correl"] > 0.8)) / length(affy.cross.within[,"correl"]) ## supplementary figures: hist(illu.cross.within[,"correl"] , breaks=50, col="grey", border="grey", main="Within-platform comparison of profiles (Illumina)", xlab="Correlation" , xlim=c(-1,1)) hist(affy.cross.within[,"correl"] , breaks=50, col="grey", border="grey", main="Within-platform comparison of profiles (Affymetrix)", xlab="Correlation" , xlim=c(-1,1)) # make combined data structure so we can complete analyses easily like for the full data. ## Illumina attach(illu.cross.within); everything.illu.within<-cbind(illu.cross.within, illu.tp.annots[match(illu.cross.within[,"rprobe"], row.names(illu.tp.annots)),"threeprimeDist"], illu.tp.annots[match(illu.cross.within[,"sprobe"], row.names(illu.tp.annots)),"threeprimeDist"] , illu.tp.annots[match(illu.cross.within[,"rprobe"], row.names(illu.tp.annots)),"targetStart"] , illu.tp.annots[match(illu.cross.within[,"rprobe"], row.names(illu.tp.annots)),"targetEnd"] , illu.tp.annots[match(illu.cross.within[,"sprobe"], row.names(illu.tp.annots)),"targetStart"] , illu.tp.annots[match(illu.cross.within[,"sprobe"], row.names(illu.tp.annots)),"targetEnd"] ) names(everything.illu.within)<-c("gene", "rprobe", "sprobe", "pvalue", "explev.r", "explev.s", "correl", "r.dist", "s.dist", "r.targ.start", "r.targ.end", "s.targ.start", "s.targ.end") attach(everything.illu.within) diff<- abs(((everything.illu.within[,"r.targ.end"] - everything.illu.within[,"r.targ.start"])/2 + r.targ.start) - ((everything.illu.within[,"s.targ.end"] - everything.illu.within[,"s.targ.start"])/2 + s.targ.start)); everything.illu.within<-cbind(everything.illu.within,diff) names(everything.illu.within)<-c("gene", "rprobe", "sprobe", "pvalue", "explev.r", "explev.s", "correl", "r.dist", "s.dist", "r.targ.start", "r.targ.end", "s.targ.start", "s.targ.end", "diff") r.corr.ev<-illu.corr[as.character(everything.illu.within[,"rprobe"])] s.corr.ev<-illu.corr[as.character(everything.illu.within[,"sprobe"])] everything.illu.within<-data.frame(everything.illu.within, r.corr.ev, s.corr.ev, pmax(abs(r.corr.ev), abs(s.corr.ev))) names(everything.illu.within)<-c("gene", "rprobe", "sprobe", "pvalue", "explev.r", "explev.s", "correl", "r.dist", "s.dist", "r.targ.start", "r.targ.end", "s.targ.start", "s.targ.end", "diff", "r.corr", "s.corr", "max.corr.single") attach(everything.illu.within) write.table(file="everything.illu.within.txt", everything.illu.within, sep='\t', quote=F) ### AFFY attach(affy.cross.within); everything.affy.within<-cbind(affy.cross.within, affy.tp.annots[match(affy.cross.within[,"rprobe"], row.names(affy.tp.annots)),"threeprimeDist"], affy.tp.annots[match(affy.cross.within[,"sprobe"], row.names(affy.tp.annots)),"threeprimeDist"] , affy.tp.annots[match(affy.cross.within[,"rprobe"], row.names(affy.tp.annots)),"targetStart"] , affy.tp.annots[match(affy.cross.within[,"rprobe"], row.names(affy.tp.annots)),"targetEnd"] , affy.tp.annots[match(affy.cross.within[,"sprobe"], row.names(affy.tp.annots)),"targetStart"] , affy.tp.annots[match(affy.cross.within[,"sprobe"], row.names(affy.tp.annots)),"targetEnd"] ) names(everything.affy.within)<-c("gene", "rprobe", "sprobe", "pvalue", "explev.r", "explev.s", "correl", "r.dist", "s.dist", "r.targ.start", "r.targ.end", "s.targ.start", "s.targ.end") attach(everything.affy.within) diff<- abs(((everything.affy.within[,"r.targ.end"] - everything.affy.within[,"r.targ.start"])/2 + r.targ.start) - ((everything.affy.within[,"s.targ.end"] - everything.affy.within[,"s.targ.start"])/2 + s.targ.start)); everything.affy.within<-cbind(everything.affy.within,diff) names(everything.affy.within)<-c("gene", "rprobe", "sprobe", "pvalue", "explev.r", "explev.s", "correl", "r.dist", "s.dist", "r.targ.start", "r.targ.end", "s.targ.start", "s.targ.end", "diff") r.corr.ev<-affy.corr[as.character(everything.affy.within[,"rprobe"])] s.corr.ev<-affy.corr[as.character(everything.affy.within[,"sprobe"])] everything.affy.within<-data.frame(everything.affy.within, r.corr.ev, s.corr.ev, pmax(abs(r.corr.ev), abs(s.corr.ev))) names(everything.affy.within)<-c("gene", "rprobe", "sprobe", "pvalue", "explev.r", "explev.s", "correl", "r.dist", "s.dist", "r.targ.start", "r.targ.end", "s.targ.start", "s.targ.end", "diff", "r.corr", "s.corr", "max.corr.single") attach(everything.affy.within) write.table(file="everything.affy.within.txt", everything.affy.within, sep='\t', quote=F) ## analysis of the above within-dataset agreements. attach(everything.illu.within) # full data par(mfrow=c(1, 2), cex=1.2) plot(log2(explev.s), jitter(correl), pch=20, cex=0.2, xlab="Expression level", ylab="Correlation") plot(log10(diff), correl, pch=20, cex=0.2, ylab="Correlation of profiles across platforms", xlab="Abs(log10(distance between probes across platforms))") par(mfrow=c(1, 2), cex=1.2) #### A ##### a<-hist(correl[which(explev.s<5)], breaks=50, plot=F); b<-hist(correl[which(explev.s>=5)], breaks=50, plot=F); plot(col="darkgrey", a$mids, a$density, type="s", xlab="Cross-platform correlation", ylab="Density", main="", xlim=c(-1,1), ylim=c(0,3), lw=2); par(new=T); plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,3), lw=2); mtext("A", side=3, adj=0, cex=3) legend(-0.8, 3, legend=c("log2 Exp level < 5", "log2 Exp level > 5 "), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5) ### B ##### a<-hist(correl[which(log10(diff) <= 2.69)], breaks=50, plot=F); b<-hist(correl[which(log10(diff) > 2.69)], breaks=50, plot=F); plot(col="darkgrey", a$mids, a$density, type="s", xlab="Within-platform correlation", ylab="Density", main="", xlim=c(-1,1), ylim=c(0,3), lw=2); par(new=T); plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,3), lw=2); mtext("B", side=3, adj=0, cex=3) legend(-0.8, 3, legend=c("Probe spacing < 300bp", "Probe spacing > 300bp"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5) attach(everything.affy.within) # full data par(mfrow=c(1, 2), cex=1.2) plot(explev.s, jitter(correl), pch=20, cex=0.2, xlab="Expression level", ylab="Correlation") plot(log10(diff), correl, pch=20, cex=0.2, ylab="Correlation of profiles across platforms", xlab="Abs(log10(distance between probes across platforms))") par(mfrow=c(1, 2), cex=1.2) #### A ##### a<-hist(correl[which(explev.s<5)], breaks=50, plot=F); b<-hist(correl[which(explev.s>=5)], breaks=50, plot=F); plot(col="darkgrey", a$mids, a$density, type="s", xlab="Within-platform correlation", ylab="Density", main="", xlim=c(-1,1), ylim=c(0,3), lw=2); par(new=T); plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,3), lw=2); mtext("A", side=3, adj=0, cex=3) legend(-0.8, 3, legend=c("log2 Exp level < 5", "log2 Exp level > 5 "), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5) ### B ##### a<-hist(correl[which(log10(diff) <= 2.69)], breaks=50, plot=F); b<-hist(correl[which(log10(diff) > 2.69)], breaks=50, plot=F); plot(col="darkgrey", a$mids, a$density, type="s", xlab="Within-platform correlation", ylab="Density", main="", xlim=c(-1,1), ylim=c(0,3), lw=2); par(new=T); plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,3), lw=2); mtext("B", side=3, adj=0, cex=3) legend(-0.8, 3, legend=c("Probe spacing < 300bp", "Probe spacing > 300bp"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5) affy.puzzling.within.probes<-read.table("affy.puzzling.within.probes.txt", header=F) illu.puzzling.within.probes<-read.table("illu.puzzling.within.probes.txt", header=F) length(which(as.character(affy.puzzling.within.probes[,1]) %in% as.character(affy.puzzling.probes[,1]))) length(which(as.character(illu.puzzling.within.probes[,1]) %in% as.character(illu.puzzling.probes[,1]))) fisher.test(rbind(c(3,10),c(940,36023))) fisher.test(rbind(c(37,103),c(940,36023)))