# Tue Apr 19 14:36:08 EDT 2005
setwd("//adtera/projects/pavlidis/grp/collaborations/cincinnati/results/0305/final-analysis")
# load(".RData")
source("../Functions.R")

# load data
affy.data<-read.delim("rma-dilution-data.txt", row.names=1, header=T)
illu.data.nonorm<-read.delim("100CP-nonorm-data.txt", row.names=1, header=T)

# load manufacturer's annotations
illu.mfg.annots<-data.frame(read.delim("../100CP-barnes.annots.txt", row.names=1, header=T))
affy.mfg.annots<-read.delim("//adtera/users2/projects/pavlidis/grp/databases/arrays/HG-U133_Plus_2_annot.csv", sep=',', row.names=1, as.is=T, header=T)
affy.mfg.annots<-data.frame(affy.mfg.annots[,"Gene.Symbol"], row.names=row.names(affy.mfg.annots))
names(affy.mfg.annots)<-c("gene")
names(illu.mfg.annots)<-c("gene")


# load annotations and set up we made
illu.tp.annots.raw<-unique(read.delim("../seqan/100CP_target.19Apr2005-1246.blat-annots.txt", header=F));
names(illu.tp.annots.raw)<-c("probe", "array", "Probe.Length", "Blat.Matches", "len", "Blat.Score", "Gene", "Transcript", "threeprimeDist", "exonoverlap", "chromosome", "targetStart", "targetEnd");
affy.tp.annots.raw<-unique(read.delim("../seqan/HG-U133_Plus_2_probeCollapsed.fa.19Apr2005-1033.blat-annots.txt", header=F));
names(affy.tp.annots.raw)<-c("probe", "array", "Probe.Length", "Blat.Matches", "len", "Blat.Score", "Gene", "Transcript", "threeprimeDist", "exonoverlap", "chromosome", "targetStart", "targetEnd");

# Tue Apr 26 07:34:26 EDT 2005 The R code to do the 'best' selection was far too slow. I replaced it with java code, output to new files:
affy.tp.annots<-read.delim("../seqan/HG-U133_Plus_2_probeCollapsed.best.fa.19Apr2005-1033.blat-annots.txt", header=F, row.names=1)
names(affy.tp.annots)<-c("array", "gene", "transcript", "Num.hit.genes.or.transcripts", "Blat.score", "exonoverlap", "threeprimeDist", "alignLength", "chromosome", "targetStart", "targetEnd")
illu.tp.annots<-read.delim("../seqan/100CP_target.best.19Apr2005-1246.blat-annots.txt", header=F, row.names=1)
names(illu.tp.annots)<-c("array", "gene", "transcript", "Num.hit.genes.or.transcripts", "Blat.score", "exonoverlap", "threeprimeDist", "alignLength", "chromosome", "targetStart", "targetEnd")
write.table(affy.tp.annots, "affy.tp.annots.txt", sep='\t', quote=F)
write.table(illu.tp.annots, "illu.tp.annots.txt", sep='\t', quote=F)
affy.tp.annots<-read.delim("affy.tp.annots.txt", header=T, row.names=1)
illu.tp.annots<-read.delim("illu.tp.annots.txt", header=T, row.names=1)
############################################
# Information for Table 1
dim(illu.mfg.annots)
dim(affy.mfg.annots)
dim(illu.tp.annots)
known.genes<-read.table("../known.genes", header=T, as.is=T)
illu.known.mfg.probes<-length( which(illu.mfg.annots[,"gene"] %in% known.genes[,"geneName"]));
illu.known.mfg.genes<-length(unique((illu.mfg.annots[which(illu.mfg.annots[,"gene"] %in% known.genes[,"geneName"]),])))
illu.nonknown.mfg.probes<-length(which(!( illu.mfg.annots[,"gene"] %in% known.genes[,"geneName"]))

illu.known.tp.probes<-length( which(illu.tp.annots[,"gene"] %in% known.genes[,"geneName"]));
illu.known.tp.genes<-length(unique((illu.tp.annots[which(illu.tp.annots[,"gene"] %in% known.genes[,"geneName"]),"gene"])))
illu.nonknown.tp.probes<-length(which(!( illu.tp.annots[,"gene"] %in% known.genes[,"geneName"])));

affy.known.mfg.probes<-length(which(affy.mfg.annots[,"gene"] %in% known.genes[,"geneName"]));
affy.known.mfg.genes<-length(unique((affy.mfg.annots[which(affy.mfg.annots[,"gene"] %in% known.genes[,"geneName"]),])))
affy.nonknown.mfg.probes<-length(which(!( affy.mfg.annots[,"gene"] %in% known.genes[,"geneName"])));

affy.known.tp.probes<-length(which(affy.tp.annots[,"gene"] %in% known.genes[,"geneName"]));
affy.known.tp.genes<-length(unique((affy.tp.annots[which(affy.tp.annots[,"gene"] %in% known.genes[,"geneName"]),"gene"])))
affy.nonknown.tp.probes<-length(which(!( affy.tp.annots[,"gene"] %in% known.genes[,"geneName"])));

length(which(as.character(affy.mfg.annots[,"gene"]) == "---"))

length(unique(illu.tp.annots.raw[,"probe"]))
length(unique(affy.tp.annots.raw[,"probe"]))                                 
                                 
length(unique(cross.corr[,"gene"]))
length(unique(cross.corr.mfg[,"gene"]))                                 
########### end table 1 #################
# how many probes are close to the 3' end.                                 
length(which(affy.tp.annots[,"threeprimeDist"] < 300)) / dim(affy.tp.annots)[1]
length(which(illu.tp.annots[,"threeprimeDist"] < 300)) / dim(illu.tp.annots)[1]
                                 
#### Normalize the Illumina data.
library(affy)
data(affybatch.example)
normalize.methods(affybatch.example)
illu.data<-normalize.quantiles(as.matrix(illu.data.nonorm));
illu.data<-data.frame(illu.data, row.names=row.names(illu.data.nonorm))
names(illu.data)<-nam;
                                 write.table(illu.data, "../data/100CP-quantilenorm-data.txt");
                                 illu.data<-read.table( "../data/100CP-quantilenorm-data.txt", header=T, row.names=1)
                                 
illu.data.cn<-normalize.constant(as.matrix(illu.data.nonorm), 500)
names(illu.data.cn)<-nam;
write.table(illu.data.cn, "../data/100CP-constantnorm-data.txt");

                                 
####################################################
# Mon May  9 13:43:45 EDT 2005
# combined data file for clustering.
comb.data<-cbind(t(scale(t(log2(affy.data[as.character(cross.corr[,"rprobe"]),])))), t(scale(t(illu.data[as.character(cross.corr[,"sprobe"]),]))))
comb.rownames<-cbind(as.character(cross.corr[,"rprobe"]), as.character(cross.corr[,"sprobe"]), as.character(cross.corr[,"gene"]))
comb.rownames.scrunch<-apply(comb.rownames, 1, function(x) { paste(x[1], x[2], x[3], sep=".")})
row.names(comb.data)<-comb.rownames.scrunch
write.table(file="combined-data.txt", round(comb.data, 3), quote=F, sep="\t", na="")

####################################################
# Wed Apr 27 11:57:32 EDT 2005 Basic correlation with dilution profile.
corr<-function(x) { cor(x, dilution, use="pairwise.complete.obs") }
affy.corr<-apply(2**affy.data, 1, corr);
illu.corr<-apply(illu.data, 1, corr);

###################################
# Signfiicance of correlation.
corr.test.all<-function(x) {cor.test(~ x + dilution)$p.value;}
affy.corr.p<-apply(2**affy.data, 1, corr.test.all);
illu.corr.p<-apply(illu.data, 1, corr.test.all);
library(qvalue)
summary(qvalue(affy.corr.p))
summary(qvalue(illu.corr.p))
good.rows<-affy.corr.p[which(qvalue(affy.corr.p)$qvalue<0.05)]
min(abs(affy.corr[names(good.rows)]))
good.rows<-illu.corr.p[which(qvalue(illu.corr.p)$qvalue<0.05)]                                 
min(abs(illu.corr[names(good.rows)]))                                 

                                 
################### FIGURE 3 #######################
ps.options();                                 
postscript("../../../doc/illumina-affy/figure-3.ps", family="Helvetica", horizontal=F, width=8, height=8);                     
par(mfrow=c(2,2), cex=1.2)
a<-hist(illu.corr, breaks=100, plot=F)
plot(col="black", a$mids, a$density, type="s", xlab="Dilution correlation", main="Illumina", ylab="Density", lw=2)
mtext("A", side=3, adj=0, cex=3)
a<-hist(affy.corr, breaks=100, plot=F)
plot(col="black", a$mids, a$density, type="s", xlab="Dilution correlation", main="Affymetrix", ylab="Density", lw=2)
mtext("B", side=3, adj=0, cex=3)
# refseq/nonreseq
a<-hist(illu.corr[ unique(illu.tp.annots.raw[,"probe"] ) ], breaks=200, col="grey", border="grey", xlab="Dilution correlation", plot=F)
b<-hist(illu.corr[ which(!(names(illu.corr) %in% unique(illu.tp.annots.raw[,"probe"]) )) ], breaks=200, col="grey", border="grey", xlab="Dilution correlation", main="Illumina non-Known genes", plot=F)
plot(col="darkgrey", a$mids, a$density, type="s", xlab="Dilution correlation", ylab="Density",  main="", xlim=c(-1,1), ylim=c(0,3), lw=2);
par(new=T);
plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,3), lw=2);
legend(-0.8, 3, legend=c("Illumina known genes", "Illumina 'not-known'"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5)
mtext("C", side=3, adj=0, cex=3)
a<-hist(affy.corr[ unique(affy.tp.annots.raw[,"probe"] ) ], breaks=200, col="grey", border="grey", xlab="Dilution correlation", plot=F)
b<-hist(affy.corr[ which(!(names(affy.corr) %in% unique(affy.tp.annots.raw[,"probe"]) )) ], breaks=200, col="grey", border="grey", xlab="Dilution correlation", main="Affy non-Known genes", plot=F)
plot(col="darkgrey", a$mids, a$density, type="s", xlab="Dilution correlation", ylab="Density",  main="", xlim=c(-1,1), ylim=c(0,4), lw=2);
par(new=T);
plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,4), lw=2);
legend(-0.8, 3, legend=c("Affymetrix known genes", "Affymetrix 'not-known'"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5)
mtext("D", side=3, adj=0, cex=3)
dev.off();


############################################
# effect of expression level on non-cross correlation
affy.medians<-apply(2**affy.data, 1, median);
illu.medians<-apply(illu.data, 1, median);

########### FIGURE 4  #################################
ps.options();                                 
postscript("../../../doc/illumina-affy/figure-4.ps", family="Helvetica", horizontal=F, width=8, height=5);                                       
par(mfcol=c(1,2), cex=1.2)
#### A ######  - illumina                     
a<-hist(illu.corr[which(log2(illu.medians) < 8)], col="grey", border="grey", breaks=50, main="Illumina, Exp. < 8", xlab="Correlation", plot=F);
b<-hist(illu.corr[which(log2(illu.medians) >= 8)], col="grey", border="grey", breaks=50, main="Illumina, Exp. > 8", xlab="Correlation", plot=F);
plot(col="darkgrey", a$mids, a$density, type="s", xlab="Correlation", ylab="Density",  main="", xlim=c(-1,1), ylim=c(0,5), lw=2);
mtext("A", side=3, adj=0, cex=3)
par(new=T);
plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,5), lw=2);
legend(-0.8, 3, legend=c("Illumina, Exp level < 8", "Illumina, Exp level > 8 "), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5)
                                 
#### B ##### - affymetrix
a<-hist(affy.corr[which(log2(affy.medians) < 5)], col="grey", border="grey", breaks=50, main="Affymetrix, Expression level < 5", xlab="Correlation", plot=F);
b<-hist(affy.corr[which(log2(affy.medians) >= 5)], col="grey", border="grey", breaks=50, main="Affymetrix, Expression level > 5", xlab="Correlation", plot=F);
plot(col="darkgrey", a$mids, a$density, type="s", xlab="Correlation", ylab="Density",  main="", xlim=c(-1,1), ylim=c(0,5), lw=2);                                 
mtext("B", side=3, adj=0, cex=3)
par(new=T);
plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,5), lw=2);
legend(-0.8, 3, legend=c("Affymetrix, Exp level < 5", "Affymetrix, Exp level > 5 "), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5)
dev.off();
#### Supplementary figures ######
par(mfrow=c(1,2))
plot(illu.corr, log2(illu.medians), pch='.', xlab="Correlation", ylab="Median expression of probe", main="Illumina", ylim=c(5, 15))
plot(affy.corr, log2(affy.medians), pch='.', xlab="Correlation", ylab="Median expression of probe set", main="Affymetrix", ylim=c(0, 15))

# filtering out genes that don't show dilution (see below)


############################################
# Cross-correlation (log transformation is irrelevant if we use Spearman)
cross.corr<-rs(rdataset=affy.data, sdataset=illu.data, sarray=illu.tp.annots, rarray=affy.tp.annots, method="spearman" );
write.table(cross.corr, quote=F, file="cross.corr.txt", sep='\t')
cross.corr<-read.table("cross.corr.txt", header=T, sep='\t', row.names=1)

# pearson correlation, using log-transformed data.
#cross.corr.pearson<-rs(rdataset=affy.data, sdataset=log2(illu.data), sarray=illu.tp.annots, rarray=affy.tp.annots, method="pearson" );                                                               
#write.table(cross.corr.pearson, quote=F, file="cross.corr.pearson.txt", sep='\t')
#cross.corr.pearson<-read.table("cross.corr.pearson.txt", header=T, sep='\t', row.names=1)


# pearson correlation, using non-log-transformed data.
affy.data.exp<-2^affy.data
names(affy.data.exp)<-names(affy.data)
row.names(affy.data.exp)<-row.names(affy.data)
affy.data.exp<-data.frame(affy.data.exp)                                 
cross.corr.pearson.nolog<-rs(rdataset=affy.data.exp, sdataset=illu.data, sarray=illu.tp.annots, rarray=affy.tp.annots, method="pearson" );
write.table(cross.corr.pearson.nolog, quote=F, file="cross.corr.pearson.nolog.txt", sep='\t')
cross.corr.pearson.nolog<-read.table("cross.corr.pearson.nolog.txt", header=T, sep='\t', row.names=1)

# to get the pvalues
cross.corr.pvalues<-rs(rdataset=affy.data, sdataset=illu.data, sarray=illu.tp.annots, rarray=affy.tp.annots, method="spearman", test=T );
write.table(cross.corr.pvalues, quote=F, file="cross.corr.pvalues.txt", sep='\t')
cross.corr.pvalues<-read.table("cross.corr.pvalues.txt", header=T, row.names=1)
# pvalues for pearson, using exponentiated affy data.
cross.corr.pvalues.pearson<-rs(rdataset=affy.data.exp, sdataset=illu.data, sarray=illu.tp.annots, rarray=affy.tp.annots, method="pearson", test=T );
write.table(cross.corr.pvalues.pearson, quote=F, file="cross.corr.pvalues.pearson.txt", sep='\t')
cross.corr.pvalues.pearson<-read.table("cross.corr.pvalues.pearson.txt", header=T, row.names=1)

                                 
hist(cross.corr.pvalues.pearson[,"pvalue"])                                 
plot(-log10(cross.corr.pvalues.pearson[,"pvalue"]), cross.corr.pvalues.pearson[,"correl"])
library(qvalue)
summary(qvalue(cross.corr.pvalues.pearson[,"pvalue"]))
                                 
# find the threshold correlation
qvals<-qvalue(cross.corr.pvalues.pearson[,"pvalue"])$qvalues
good.rows<-cross.corr.pvalues.pearson[which(qvals<0.05),]
min(abs(good.rows[,"correl"]))
# bonferroni:                                 
length(which ( (cross.corr.pvalues.pearson[,"pvalue"] * length(cross.corr.pvalues.pearson[,"pvalue"])) < 0.05)) / length(cross.corr.pvalues.pearson[,"pvalue"])


# Thu Aug  4 18:10:13 EDT 2005
# filter the results to remove probes which are not "significantly" diluted.                   Using non-log-transformed data.
affy.corr.good.names<-names(affy.corr.p[ which(qvalue(affy.corr.p)$qvalue < 0.05)])
affy.corr.p.good<-affy.data.exp[names(affy.corr.p[ which(qvalue(affy.corr.p)$qvalue < 0.05)]),]
illu.corr.good.names<-names(illu.corr.p[ which(qvalue(illu.corr.p)$qvalue < 0.05)])                                 
illu.corr.p.good<-illu.data[names(illu.corr.p[ which(qvalue(illu.corr.p)$qvalue < 0.05)]),]
                                 
cross.corr.pvalues.pearson.filtered<-rs(rdataset=affy.corr.p.good, sdataset=illu.corr.p.good, sarray=illu.tp.annots, rarray=affy.tp.annots, method="pearson", test=T );
write.table(cross.corr.pvalues.pearson.filtered, quote=F, file="cross.corr.pvalues.pearson.filtered.txt", sep='\t')
cross.corr.pvalues.pearson.filtered<-read.table("cross.corr.pvalues.pearson.filtered.txt", header=T, row.names=1)
                                 
cross.corr.pvalues.filtered<-rs(rdataset=affy.corr.p.good, sdataset=illu.corr.p.good, sarray=illu.tp.annots, rarray=affy.tp.annots, method="spearman", test=T );                                 
write.table(cross.corr.pvalues.filtered, quote=F, file="cross.corr.pvalues.filtered.txt", sep='\t')                                 
cross.corr.pvalues.filtered<-read.table("cross.corr.pvalues.filtered.txt", header=T, row.names=1)

hist(cross.corr.pvalues.pearson.filtered[,"pvalue"])                                 
summary(qvalue(cross.corr.pvalues.pearson.filtered[,"pvalue"]))
# bonferroni:                                 
length(which ( (cross.corr.pvalues.pearson.filtered[,"pvalue"] * length(cross.corr.pvalues.pearson.filtered[,"pvalue"])) < 0.05)) / length(cross.corr.pvalues.pearson.filtered[,"pvalue"])

hist(cross.corr.pvalues.pearson.filtered[,"correl"])
                                 
                                 
# Thu Aug  4 18:16:44 EDT 2005
# looking at alignment length.                                 
affy.alignment.lengths<-read.table("../seqan/HG-U133_Plus_2_alignment_lengths.txt", header=F)
illu.alignment.lengths<-read.table("../seqan/100CP_alignment_lengths.txt", header=F)
names(illu.alignment.lengths)<-c("probe", "length")
names(affy.alignment.lengths)<-c("probe", "length")
median(affy.alignment.lengths[,2])
median(illu.alignment.lengths[,2])                                 
                                 
affy.puzzling.probes<-read.table("affy.puzzling.probes.txt", header=F)
illu.puzzling.probes<-read.table("illu.puzzling.probes.txt", header=F)

fisher.test(rbind(c(
length(which(illu.alignment.lengths[illu.alignment.lengths[,"probe"] %in% illu.puzzling.probes[,1],"length"] > 50))
,
length(illu.puzzling.probes[,1])
),
c(
length(which(illu.alignment.lengths[,2] > 50))
,
length(illu.alignment.lengths[,1])
  )))


fisher.test(rbind(c(
length(which(affy.alignment.lengths[affy.alignment.lengths[,"probe"] %in% affy.puzzling.probes[,1],"length"] > 450))
                    ,
length(affy.puzzling.probes[,1])
                    )
                  , c(
length(which(affy.alignment.lengths[,2] > 481))
                      ,
length(affy.alignment.lengths[,1])
                      )                  ))
                                 
# Mon Jul 25 14:09:04 EDT 2005
# cross hybridization
# Perl script extracts high-quality blat matches to more than one site in the genome. Files in seqan (file names misleading) - see README
affy.multiple.hits<-read.delim("../seqan/affy.analyzedHits", header=F)
illu.multiple.hits<-read.delim("../seqan/illu.analyzedHits", header=F)

fisher.test(rbind(                                 
c(                                 
length(which (affy.multiple.hits[,1] %in% affy.puzzling.probes[,1]) )
  ,
length(affy.puzzling.probes[,1])
                                ) ,c(
length(affy.multiple.hits[,1])
                                     ,
length(affy.alignment.lengths[,1])                                 
                  )
                  ));               


fisher.test(rbind(                                 
c(                                 
length(which (illu.multiple.hits[,1] %in% illu.puzzling.probes[,1]) )
  ,
length(illu.puzzling.probes[,1])
                                ) ,c(
length(illu.multiple.hits[,1])
                                     ,
length(illu.alignment.lengths[,1])                                 
                  )
                  ));               


par(mfrow=c(1,2))
                                 
a<-hist(log10(affy.tp.annots[affy.puzzling.probes[,1],"Num.hit.genes.or.transcripts"]), plot=F, breaks=seq(from=0, to=5, by=0.2))
b<-hist(log10(affy.tp.annots[,"Num.hit.genes.or.transcripts"]), plot=F, breaks=seq(from=0, to=5, by=0.2))
plot(col="darkgrey", a$mids, a$density, type="s", xlab="log(Number of aligned genes or transcripts)", ylab="Density",  main="", xlim=c(0,2), ylim=c(0,2), lw=2);
par(new=T);
plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(0,2), ylim=c(0,2), lw=2);
legend(1., 1, legend=c("Poor agreement", "Population"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=1.0)
ks.test(hist(affy.tp.annots[affy.puzzling.probes[,1],"Num.hit.genes.or.transcripts"], plot=F)$density, (hist(affy.tp.annots[,"Num.hit.genes.or.transcripts"], plot=F)$density))
mean(affy.tp.annots[illu.puzzling.probes[,1],"Num.hit.genes.or.transcripts"])
mean(affy.tp.annots[,"Num.hit.genes.or.transcripts"], na.rm=T)                                 


a<-hist(log10(illu.tp.annots[illu.puzzling.probes[,1],"Num.hit.genes.or.transcripts"]), plot=F, breaks=seq(from=0, to=5, by=0.2))
b<-hist(log10(illu.tp.annots[,"Num.hit.genes.or.transcripts"]), plot=F, breaks=seq(from=0, to=5, by=0.2))
plot(col="darkgrey", a$mids, a$density, type="s", xlab="Number of aligned genes or transcripts", ylab="Density",  main="", xlim=c(0,2), ylim=c(0,2), lw=2);
par(new=T);
plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(0,2), ylim=c(0,2), lw=2);
legend(1., 1, legend=c("Poor agreement", "Population"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=1.0)
        ks.test(hist(illu.tp.annots[affy.puzzling.probes[,1],"Num.hit.genes.or.transcripts"], plot=F)$density, (hist(illu.tp.annots[,"Num.hit.genes.or.transcripts"], plot=F)$density))
mean(illu.tp.annots[illu.puzzling.probes[,1],"Num.hit.genes.or.transcripts"])
mean(illu.tp.annots[,"Num.hit.genes.or.transcripts"], na.rm=T)                                 

# comparing across the platforms for the "puzzling" probes:
a<-hist(log10(affy.tp.annots[,"Num.hit.genes.or.transcripts"]), plot=F, breaks=seq(from=0, to=5, by=0.2))
b<-hist(log10(illu.tp.annots[,"Num.hit.genes.or.transcripts"]), plot=F, breaks=seq(from=0, to=5, by=0.2))
plot(col="darkgrey", a$mids, a$density, type="s", xlab="Number of aligned genes or transcripts", ylab="Density",  main="", xlim=c(0,2), ylim=c(0,2), lw=2);
par(new=T);
plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(0,2), ylim=c(0,2), lw=2);
legend(1.5, 1, legend=c("Affymetrix", "Illumina"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=1)
ks.test(hist(affy.tp.annots[affy.puzzling.probes[,1],"Num.hit.genes.or.transcripts"], plot=F)$density, (hist(illu.tp.annots[illu.puzzling.probes[,1],"Num.hit.genes.or.transcripts"], plot=F)$density))
hist(affy.tp.annots[affy.puzzling.probes[,1],"Num.hit.genes.or.transcripts"] - illu.tp.annots[illu.puzzling.probes[,1],"Num.hit.genes.or.transcripts"], breaks=200, xlim=c(-15,15))
# plot(affy.tp.annots[affy.puzzling.probes[,1],"Num.hit.genes.or.transcripts"] - illu.tp.annots[illu.puzzling.probes[,1],"Num.hit.genes.or.transcripts"], cross.corr[which(rprobe %in% affy.puzzling.probes[,1]),"correl"])                                 
t.test(affy.tp.annots[affy.puzzling.probes[,1],"Num.hit.genes.or.transcripts"], illu.tp.annots[illu.puzzling.probes[,1],"Num.hit.genes.or.transcripts"], paired=T)
                                 
cross.corr.pearson.nolog.pvalues<-rs(rdataset=affy.data.exp, sdataset=illu.data, sarray=illu.tp.annots, rarray=affy.tp.annots, method="pearson", test=T );
write.table(cross.corr.pearson.nolog.pvalues, quote=F, file="cross.corr.pearson.nolog.pvalues.txt", sep='\t')
corrected.pval<-cross.corr.pearson.nolog.pvalues[,"pvalue"] * length(cross.corr.pearson.nolog.pvalues[,"pvalue"])
#                                  


# find threshold for filtering data.
affy.mean<-apply(affy.data, 1, mean)
quantile(affy.mean)
quantile(apply(illu.data, 1, mean), na.rm=T)                                 
                                 
# To repeat analysis with pearson correlation:
# cross.corr<-cross.corr.pearson.nolog                                 
                                 
# Relation between expression and 3' location
everything<-cbind(cross.corr,
                  affy.tp.annots[match(cross.corr[,"rprobe"],
                  row.names(affy.tp.annots)),"threeprimeDist"],
                  illu.tp.annots[match(cross.corr[,"sprobe"],
                  row.names(illu.tp.annots)),"threeprimeDist"] ,
                  affy.tp.annots[match(cross.corr[,"rprobe"],
                  row.names(affy.tp.annots)),"targetStart"] ,
                  affy.tp.annots[match(cross.corr[,"rprobe"],
                  row.names(affy.tp.annots)),"targetEnd"] ,
                  illu.tp.annots[match(cross.corr[,"sprobe"],
                  row.names(illu.tp.annots)),"targetStart"] ,
                  illu.tp.annots[match(cross.corr[,"sprobe"],
                  row.names(illu.tp.annots)),"targetEnd"] )
                                 
#everything<-cbind(everything, min(abs(everything[,"affy.targ.start"] -
#everything[,"illu.targ.start"]),abs(everything[,"affy.targ.end"] -
#everything[,"illu.targ.end"]),abs(everything[,"affy.targ.start"] -
#everything[,"illu.targ.end"]),abs(everything[,"affy.targ.start"] -
#everything[,"illu.targ.end"] )))
names(everything)<-c("gene", "affyprobe", "illuprobe", "correl", "explev.affy", "explev.illu", "affy.dist", "illu.dist", "affy.targ.start", "affy.targ.end", "illu.targ.start", "illu.targ.end")
                                 attach(everything)
diff<- abs(((everything[,"affy.targ.end"] - everything[,"affy.targ.start"])/2 + affy.targ.start) - ((everything[,"illu.targ.end"] - everything[,"illu.targ.start"])/2 + illu.targ.start));
everything<-cbind(everything,diff)
names(everything)<-c("gene", "affyprobe", "illuprobe", "correl", "explev.affy", "explev.illu", "affy.dist", "illu.dist", "affy.targ.start", "affy.targ.end", "illu.targ.start", "illu.targ.end", "diff")
# Thu May 12 10:42:39 EDT 2005
# in the cross-platform agreement, are the disagreement among genes
# that do not look like the dilution profile in either platform?
affy.corr.ev<-affy.corr[as.character(everything[,"affyprobe"])]
illu.corr.ev<-illu.corr[as.character(everything[,"illuprobe"])]
everything<-data.frame(everything, affy.corr.ev, illu.corr.ev, pmax(abs(affy.corr.ev), abs(illu.corr.ev)))
names(everything)<-c("gene", "affyprobe", "illuprobe", "correl", "explev.affy", "explev.illu", "affy.dist", "illu.dist", "affy.targ.start", "affy.targ.end", "illu.targ.start", "illu.targ.end", "diff", "aff.corr", "illu.corr", "max.corr.single")
attach(everything)
write.table(file="everything.txt", everything, sep='\t', quote=F)
everything<-read.delim("everything.txt", header=T, row.names=1)                  
############################################
### FIGURE 5 #########
ps.options()
postscript("../../../doc/illumina-affy/figure-5.ps", family="Helvetica", horizontal=T, width=10, height=4.5);                                       
par(mfrow=c(1, 3), cex=1.2)
### A #####
a<-hist(correl, col="grey", breaks=50, border="grey", main="Cross-platform comparison of profiles", xlab="Correlation" , xlim=c(-1,1), plot=F)
plot(col="black", a$mids, a$density, type="s", xlab="Correlation cross-platform,\nall genes", ylab="Density",  main="", lw=2);
mtext("A", side=3, adj=0, cex=3)
### B ####
highmax<-which(max.corr.single > 0.8)
a<-hist(correl[highmax], breaks=50, plot=F)
b<-hist(correl[-highmax], breaks=50, plot=F)                                 
plot(col="darkgrey", a$mids, a$density, type="s", xlab="Correlation cross-platform", ylab="Density",  main="", xlim=c(-1,1), ylim=c(0,5), lw=2);
par(new=T);
plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,5), lw=2);
legend(-0.8, 3, legend=c("High dilution effect (>0.8)", "Low dilution effect (<0.8)"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5)
mtext("B", side=3, adj=0, cex=3)
### C ####
psg<-read.table("../placenta-genes")
a<-hist(cross.corr[which(cross.corr[,"gene"] %in% psg[,"V1"]),"correl"], col="grey", border="grey", breaks=50, main="Agreement cross-platform for placenta/blood genes", xlab="Correlation", xlim=c(-1,1), plot=F)
plot(col="black", a$mids, a$density, type="s", xlab="Correlation cross-platform,\nplacenta/blood genes", ylab="Density",  main="", xlim=c(-1,1), lw=2, cex=0.5);
mtext("C", side=3, adj=0, cex=3)
dev.off();                                 
### supplement
plot(max.corr.single, correl, pch=20, cex=0.5, xlab="Best single-data set dilution profile correlation", ylab="Cross-platform correlation", main="All probes");


############# FIGURE 6 - effect of expression level ##################3
attach(everything)
ps.options()
postscript("../../../doc/illumina-affy/figure-6.ps", family="Helvetica", horizontal=F, width=8, height=5);                                       
par(mfrow=c(1, 2), cex=1.2)
#### A #####
a<-hist(correl[which(explev.affy<5)], breaks=50, plot=F);
b<-hist(correl[which(explev.affy>=5)], breaks=50, plot=F);
plot(col="darkgrey", a$mids, a$density, type="s", xlab="Cross-platform correlation", ylab="Density",  main="", xlim=c(-1,1), ylim=c(0,3), lw=2);
par(new=T);
plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,3), lw=2);
mtext("A", side=3, adj=0, cex=3)
legend(-0.8, 3, legend=c("log2 Exp level < 5", "log2 Exp level > 5 "), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5)
### B #####
a<-hist(correl[which(log10(diff) <= 2.69)], breaks=50, plot=F);
b<-hist(correl[which(log10(diff) > 2.69)], breaks=50, plot=F);
plot(col="darkgrey", a$mids, a$density, type="s", xlab="Cross-platform correlation", ylab="Density",  main="", xlim=c(-1,1), ylim=c(0,3), lw=2);
par(new=T);
plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,3), lw=2);
mtext("B", side=3, adj=0, cex=3)
legend(-0.8, 3, legend=c("Probe spacing < 300bp", "Probe spacing > 300bp"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5)
dev.off();
# Supplement
plot(explev.affy, jitter(correl), pch=20, cex=0.2, xlab="Expression level", ylab="Correlation", main="Effect of expression level on cross-platform agreement")
plot(log10(diff), correl, pch=20, cex=0.2, ylab="Correlation of profiles across platforms", xlab="Abs(log10(distance between probes across platforms))")
c<-hist(correl[explev.affy<5 & (illuprobe %in% illu.corr.good.names  | affyprobe %in% affy.corr.good.names)], breaks=50, plot=F);
d<-hist(correl[explev.affy>=5 & (illuprobe %in% illu.corr.good.names  | affyprobe %in% affy.corr.good.names)], breaks=50, plot=F);
plot(col="darkgrey", c$mids, c$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,3), lw=2);
par(new=T);
plot(col="black", d$mids, d$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,3), lw=2);                             
legend(-0.8, 3, legend=c("Exp level < 5, sig. dilution effect", "Exp level > 5, sig dilution effect "), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5)

a<-hist(correl[log10(diff) <= 2.69 & (illuprobe %in% illu.corr.good.names  | affyprobe %in% affy.corr.good.names)], breaks=50, plot=F);
b<-hist(correl[log10(diff) > 2.69 & (illuprobe %in% illu.corr.good.names  | affyprobe %in% affy.corr.good.names)], breaks=50, plot=F);
plot(col="darkgrey", a$mids, a$density, type="s", xlab="Cross-platform correlation", ylab="Density",  main="", xlim=c(-1,1), ylim=c(0,3), lw=2);
par(new=T);
plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,3), lw=2);
legend(-0.8, 3, legend=c("Probe spacing < 300bp, diluton effect", "Probe spacing > 300bp, dilution effect"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5)
                                 
###############################################
# statistics for the text.                                 

# number of mappable affy probes                                 
length(unique(everything[,"affyprobe"]))

# number of mappable illumina probes
length(unique(everything[,"illuprobe"]))

#number of mapped genes                                 
length(unique(everything[,"gene"]))

# percent of probes with corrleations > 0.8 with dilution profile
length(which(abs(affy.corr) > 0.8))/length(affy.corr)
length(which(abs(illu.corr) > 0.8))/length(illu.corr)
                                 
30730/length(affy.corr)
12041/length(illu.corr)


################ MORE SUPPLEMENTARY DATA ######################################

#### pairwise plots of all the data                                 
plot(illu.data, pch='.')
plot(log2(illu.data), pch='.')
plot(affy.data, pch='.')
plot(2**(affy.data), pch='.')

                                 
# Tue May  3 22:09:41 EDT 2005
# using the manufacturer's annotations. 
cross.corr.mfg<-rs(rdataset=affy.data, sdataset=illu.data, sarray=illu.mfg.annots, rarray=affy.mfg.annots, method="spearman" );
write.table(cross.corr.mfg, quote=F, file="cross.corr.mfg.txt", sep='\t')
attach(cross.corr.mfg)
par(mfrow=c(1,2))
hist(correl, breaks=50, col="grey", border="grey", main="Cross-platform comparison of profiles \n(Mfg. annots)", xlab="Correlation" , xlim=c(-1,1))
hist(cross.corr[which(cross.corr[,"gene"] %in% psg[,"V1"]),"correl"], col="grey", border="grey", breaks=50, main="Agreement cross-platform for placenta/blood genes \n(Mfg. annots)", xlab="Correlation", xlim=c(-1,1))

# put all 4 on one graph
par(mfrow=c(2,2))
attach(cross.corr)
hist(correl, breaks=50, col="grey", border="grey", main="Cross-platform comparison of profiles", xlab="Correlation" , xlim=c(-1,1))
hist(cross.corr[which(cross.corr[,"gene"] %in% psg[,"V1"]),"correl"], col="grey", border="grey", breaks=50, main="Agreement cross-platform for placenta/blood genes", xlab="Correlation", xlim=c(-1,1))
attach(cross.corr.mfg)
hist(correl, breaks=50, col="grey", border="grey", main="Cross-platform comparison of profiles \n(Mfg. annots)", xlab="Correlation" , xlim=c(-1,1))
hist(cross.corr[which(cross.corr[,"gene"] %in% psg[,"V1"]),"correl"], col="grey", border="grey", breaks=50, main="Agreement cross-platform for placenta/blood genes \n(Mfg. annots)", xlab="Correlation", xlim=c(-1,1))
attach(cross.corr)


### Three prime location of probes. (all)
par(mfrow=c(1,2))
hist(log10(affy.tp.annots.raw[,"threeprimeDist"]), breaks=500, col="grey", border="grey", xlab="distance from 3' end of gene (log10 bases)", main="Affymetrix probe locations", xlim=c(0,8), freq=F)
hist(log10(illu.tp.annots.raw[,"threeprimeDist"]), breaks=500, col="grey", border="grey", xlab="distance from 3' end of gene (log10 bases)", main="Illumina probe locations", xlim=c(0,8), freq=F)

# Best
par(mfrow=c(1,2))
hist(log10(affy.tp.annots[,"threeprimeDist"]), breaks=500, col="grey", border="grey", xlab="Best distance from 3' end of gene (log10 bases)", main="Affymetrix probe locations", xlim=c(0,7), freq=F)
hist(log10(illu.tp.annots[,"threeprimeDist"]), breaks=500, col="grey", border="grey", xlab="Best distance from 3' end of gene (log10 bases)", main="Illumina probe locations", xlim=c(0,7), freq=F)


# Exon overlaps (not very interesing)
par(mfrow=c(1,2))
hist(illu.tp.annots[,"exonoverlap"], col="grey", border="grey", breaks=50, xlab="Exon overlap, fraction of probe length", main="Illumina");
hist(affy.tp.annots[,"exonoverlap"], col="grey", border="grey", breaks=50, xlab="Exon overlap, fraction of probe length", main="Affymetrix");

# Effect of overlap on signal
par(mfcol=c(2,2))
hist(log2(apply(illu.data[row.names(illu.tp.annots)[which(illu.tp.annots[,"exonoverlap"] ==1.0)],], 1, mean)), breaks=200, border="grey", col="grey", main="Illumina, Exon overlap = 1.0", xlab="log2 Median expression")
hist(log2(apply(illu.data[row.names(illu.tp.annots)[which(illu.tp.annots[,"exonoverlap"] <1.0)],], 1, mean)), breaks=200, border="grey", col="grey", main="Illumina, Exon overlap <1.0", xlab="log 2Median expression")
hist(apply(affy.data[row.names(affy.tp.annots)[which(affy.tp.annots[,"exonoverlap"] ==1.0)],], 1, mean), breaks=200, border="grey", col="grey", main="Affy, Exon overlap = 1.0", xlab="Median expression")
hist(apply(affy.data[row.names(affy.tp.annots)[which(affy.tp.annots[,"exonoverlap"] <1.0)],], 1, mean), breaks=200, border="grey", col="grey", main="Affy, Exon overlap <1.0", xlab="Median expression")

# Relation between exon overlap and threeprime dist -- in the RAW annotations.
par(mfcol=c(2,2))
hist(log10(illu.tp.annots.raw[,"threeprimeDist"][which(illu.tp.annots.raw[,"exonoverlap"]/illu.tp.annots.raw[,"Probe.Length"] == 1.0)]), breaks=200, border="grey", col="grey", main="Illumina, Exon overlap=1", xlim=c(0,6), ylim=c(0,2000), xlab="log10(3'distance)")
hist(log10(illu.tp.annots.raw[,"threeprimeDist"][which(illu.tp.annots.raw[,"exonoverlap"]/illu.tp.annots.raw[,"Probe.Length"] < 1)]), breaks=200, border="grey", col="grey", main="Illumina, Exon overlap<1", xlim=c(0,6), ylim=c(0,2000), xlab="log10(3'distance)")
hist(log10(affy.tp.annots.raw[,"threeprimeDist"][which(affy.tp.annots.raw[,"exonoverlap"]/affy.tp.annots.raw[,"Probe.Length"] == 1)]), breaks=200, border="grey", col="grey", main="Exon overlap=1", xlim=c(0,6), ylim=c(0,2000), xlab="log10(3'distance)")
hist(log10(affy.tp.annots.raw[,"threeprimeDist"][which(affy.tp.annots.raw[,"exonoverlap"]/affy.tp.annots.raw[,"Probe.Length"] < 1)]), breaks=200, border="grey", col="grey", main="Exon overlap<1", xlim=c(0,6), ylim=c(0,2000), xlab="log10(3'distance)")

# same, but for BEST 
par(mfcol=c(2,2))
hist(log10(illu.tp.annots[,"threeprimeDist"][which(illu.tp.annots[,"exonoverlap"] == 1.0)]), breaks=200, border="grey", col="grey", main="Illumina, Exon overlap=1", xlim=c(0,6), ylim=c(0,1500), xlab="log10(3'distance)")
hist(log10(illu.tp.annots[,"threeprimeDist"][which(illu.tp.annots[,"exonoverlap"] < 1)]), breaks=200, border="grey", col="grey", main="Illumina, Exon overlap<1", xlim=c(0,6), ylim=c(0,1500), xlab="log10(3'distance)")
hist(log10(affy.tp.annots[,"threeprimeDist"][which(affy.tp.annots[,"exonoverlap"] == 1)]), breaks=200, border="grey", col="grey", main="Affymetrix, Exon overlap=1 (best hits)", xlim=c(0,6), ylim=c(0,1500), xlab="log10(3'distance)")
hist(log10(affy.tp.annots[,"threeprimeDist"][which(affy.tp.annots[,"exonoverlap"] < 1)]), breaks=200, border="grey", col="grey", main="Affymetrix, Exon overlap<1 (best hits)", xlim=c(0,6), ylim=c(0,1500), xlab="log10(3'distance)")


# relation of threeprime distance with profile correlation - very little to see.
plot(log10(diff), correl, pch=20, cex=0.2, ylab="Correlation of profiles across platforms", xlab="abs(log10(distance between probes across platforms))")

# probe location effect, more
# for genes expressed at high levels
highexp<-which(explev.affy > 7);
plot(log10(diff[highexp]), correl[highexp], pch=20, cex=0.2, ylab="Correlation of profiles across platforms", xlab="Abs(log10(distance between probes across platforms))")
a<-hist(correl[which(log10(diff) <= 3 & explev.affy > 7)],  breaks=50, col="grey", border="grey", xlab="Dilution correlation, probe spacing < 300bp", main="", plot=F);
b<-hist(correl[which(log10(diff) > 3 & explev.affy > 7)],  breaks=50, col="grey", border="grey", xlab="Dilution correlation, probe spacing > 300bp", main="", plot=F);
plot(col="darkgrey", a$mids, a$density, type="s", xlab="Correlation", ylab="Density",  main="", xlim=c(-1,1), ylim=c(0,8), lw=2);
par(new=T);
plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,8), lw=2);
legend(-0.8, 3, legend=c("Dilution correlation, probe spacing < 300bp, high expression", "Dilution correlation, probe spacing > 300bp, high expression"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2)
cor.test(log10(diff[highexp]), correl[highexp], method="spearman")

# linear model -- not so good!
meanexp<-(explev.affy + log2(explev.illu))/2
zcorrel<-(log(1+correl)-(log(1-correl)))/2
logdiff<-log10(diff+1);
use<-which(!is.na(logdiff))
aov.res<-lm(correl ~ diff + meanexp)
summary(aov.res)


# Wed May  4 14:36:14 EDT 2005
# computing fold changes
foldchange<-function(x, islog=FALSE) {
  low<-mean(x[1:2]);
  high<-mean(x[11:12]);
  foldchange<-0;
  if (islog) {
    foldchange<-low - high;
  } else {
    foldchange<-log2(low/high)
  }
  foldchange;
}

affy.foldchange<-apply(affy.data, 1, foldchange, islog=T)
illu.foldchange<-apply(illu.data, 1, foldchange, islog=F);
############################################
#par(mfrow=c(2, 1))
a<-hist(illu.foldchange, breaks=50, freq=FALSE, plot=F  )
b<-hist(affy.foldchange, breaks=50, freq=FALSE, plot=F  )
plot(a$mids, log(a$density), type="s",  ylab="log10(Density)", main="Fold changes compared", xlab="Fold change (log2 units)", xlim=c(-10,10), ylim=c(-10,1.5), col="darkgrey", lw=2)
par(new=T)
plot(b$mids, log(b$density), type="s", main="", xlab="", ylab="", xlim=c(-10,10), ylim=c(-10,1.5), col="black", lw=2)                                 
legend(-0.8, 3, legend=c("Illumina", "Affymetrix"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2)


foldchange.diff<-rs(rdataset=affy.foldchange, sdataset=illu.foldchange, sarray=illu.tp.annots, rarray=affy.tp.annots, method="spearman", f=function(x,y){x-y;} );
hist(foldchange.diff[,4], col="grey", border="grey", breaks=50, main="Difference between platforms", xlab="difference in log2 fold change")


# Wed May 11 21:32:40 EDT 2005
# Examine 3' distance using middle.
affy.mid.annots<-read.delim("../seqan/HG-U133_Plus_2_probeCollapsed.best.fa.19Apr2005-1033.blat-annots.center.txt", header=F, row.names=1)
names(affy.mid.annots)<-c("array", "gene", "transcript", "Num.hit.genes.or.transcripts", "Blat.score", "exonoverlap", "threeprimeDist")
illu.mid.annots<-read.delim("../seqan/100CP_target.best.19Apr2005-1246.blat-annots.center.txt", header=F, row.names=1)
names(illu.mid.annots)<-c("array", "gene", "transcript", "Num.hit.genes.or.transcripts", "Blat.score", "exonoverlap", "threeprimeDist")
#write.table(affy.mid.annots, "affy.mid.annots.txt", sep='\t', quote=F)
#write.table(illu.mid.annots, "illu.mid.annots.txt", sep='\t', quote=F)
affy.mid.annots<-read.delim("affy.mid.annots.txt", header=T, row.names=1)
illu.mid.annots<-read.delim("illu.mid.annots.txt", header=T, row.names=1)                                 
par(mfcol=c(2,2))
hist(log10(illu.mid.annots[,"threeprimeDist"][which(illu.mid.annots[,"exonoverlap"] == 1.0)]), breaks=200, border="grey", col="grey", main="Illumina, Exon overlap=1", xlim=c(0,6), ylim=c(0,1500), xlab="log10(3'distance)")
hist(log10(illu.mid.annots[,"threeprimeDist"][which(illu.mid.annots[,"exonoverlap"] < 1)]), breaks=200, border="grey", col="grey", main="Illumina, Exon overlap<1", xlim=c(0,6), ylim=c(0,1500), xlab="log10(3'distance)")
hist(log10(affy.mid.annots[,"threeprimeDist"][which(affy.mid.annots[,"exonoverlap"] == 1)]), breaks=200, border="grey", col="grey", main="Affymetrix, Exon overlap=1 (best hits)", xlim=c(0,6), ylim=c(0,1500), xlab="log10(3'distance)")
hist(log10(affy.mid.annots[,"threeprimeDist"][which(affy.mid.annots[,"exonoverlap"] < 1)]), breaks=200, border="grey", col="grey", main="Affymetrix, Exon overlap<1 (best hits)", xlim=c(0,6), ylim=c(0,1500), xlab="log10(3'distance)")
par(mfrow=c(1,2))
hist(log10(affy.mid.annots[,"threeprimeDist"]), breaks=500, col="grey", border="grey", xlab="Best center distance from 3' end of gene (log10 bases)", main="Affymetrix probe locations", xlim=c(0,7), freq=F)
hist(log10(illu.mid.annots[,"threeprimeDist"]), breaks=500, col="grey", border="grey", xlab="Best center distance from 3' end of gene (log10 bases)", main="Illumina probe locations", xlim=c(0,7), freq=F)
everything.mid<-cbind(cross.corr, affy.mid.annots[match(cross.corr[,"rprobe"], row.names(affy.mid.annots)),"threeprimeDist"], illu.mid.annots[match(cross.corr[,"sprobe"], row.names(illu.mid.annots)),"threeprimeDist"])
names(everything.mid)<-c("gene", "affyprobe", "illuprobe", "correl", "explev.affy", "explev.illu", "affy.dist", "illu.dist")
everything.mid<-cbind(everything.mid, abs(everything.mid[,"affy.dist"] - everything.mid[,"illu.dist"]))
names(everything.mid)<-c("gene", "affyprobe", "illuprobe", "correl", "explev.affy", "explev.illu", "affy.dist", "illu.dist", "diff")
attach(everything.mid)
plot(log10(diff), correl, pch=20, cex=0.2, ylab="Correlation of profiles across platforms", xlab="Abs(log10(distance between probes across platforms))")
a<-hist(correl[which(log10(diff) <= 3)],  breaks=50, col="grey", border="grey", xlab="Dilution correlation, probe spacing < 300bp", main="", plot=F);
b<-hist(correl[which(log10(diff) > 3)],  breaks=50, col="grey", border="grey", xlab="Dilution correlation, probe spacing > 300bp", main="", plot=F);
plot(col="darkgrey", a$mids, a$density, type="s", xlab="Correlation", ylab="Density",  main="", xlim=c(-1,1), ylim=c(0,5), lw=2);
par(new=T);
plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,5), lw=2);
legend(-0.8, 3, legend=c("Dilution correlation, probe spacing < 300bp", "Dilution correlation, probe spacing > 300bp"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2)


# Thu May 12 13:33:34 EDT 2005 combining selection criteria....not so successful.
reallygood<-which((max.corr.single > 0.8) & (explev.affy > 5)  )
a<-hist(correl[reallygood], breaks=103)
b<-hist(correl[-reallygood], breaks=103)                                 
plot(col="darkgrey", a$mids, a$density, type="s", xlab="Correlation", ylab="Density",  main="", xlim=c(-1,1), ylim=c(0,8), lw=2);
par(new=T);
plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,8), lw=2);
legend(-0.8, 3, legend=c("Probes meeting all three criteria", "Everything else"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2)


save.image();


# Mon Jul 25 16:48:28 EDT 2005
# extracting 3' probe. See README in ../../../data/0305/Affy. Trim and rearrange columns by hand.
affy.3pdata<-read.table("../../../data/0305/Affy/affy.3pm.data.txt", header=T, row.names=1)
affy.3p.corr<-apply(affy.3pdata, 1, corr);
                                 
cross.3pa.corr<-rs(rdataset=affy.3pdata, sdataset=illu.data, sarray=illu.tp.annots, rarray=affy.tp.annots, method="spearman" );
write.table(cross.3pa.corr, quote=F, file="cross.3pa.corr.txt", sep='\t')
cross.3pa.corr<-read.table("cross.3pa.corr.txt", header=T, row.names=1);


# Fri Aug  5 09:24:38 EDT 2005
# within-dataset comparisons
illu.cross.within<-cross.corr.within(illu.data, illu.tp.annots)
write.table(illu.cross.within, "illu.cross.within.txt", quote=F, sep='\t')
affy.cross.within<-cross.corr.within(affy.data, affy.tp.annots)                                                                                                   
write.table(affy.cross.within, "affy.cross.within.txt", quote=F, sep='\t')
illu.cross.within<-read.table("illu.cross.within.txt", header=T, row.names=1)
affy.cross.within<-read.delim("affy.cross.within.txt", header=T, row.names=1, sep='\t')                                 


# useful statistics:
# fraction of agreement over 0.8                                 
length(which(illu.cross.within[,"correl"] > 0.8)) / length(illu.cross.within[,"correl"])
length(which(affy.cross.within[,"correl"] > 0.8)) / length(affy.cross.within[,"correl"])                                 
## supplementary figures:                                 
hist(illu.cross.within[,"correl"] , breaks=50, col="grey", border="grey", main="Within-platform comparison of profiles (Illumina)", xlab="Correlation" , xlim=c(-1,1))
hist(affy.cross.within[,"correl"] , breaks=50, col="grey", border="grey", main="Within-platform comparison of profiles (Affymetrix)", xlab="Correlation" , xlim=c(-1,1))                                 

# make combined data structure so we can complete analyses easily like for the full data.
## Illumina                                
attach(illu.cross.within);                                 
everything.illu.within<-cbind(illu.cross.within,
                  illu.tp.annots[match(illu.cross.within[,"rprobe"],
                  row.names(illu.tp.annots)),"threeprimeDist"],
                  illu.tp.annots[match(illu.cross.within[,"sprobe"],
                  row.names(illu.tp.annots)),"threeprimeDist"] ,
                  illu.tp.annots[match(illu.cross.within[,"rprobe"],
                  row.names(illu.tp.annots)),"targetStart"] ,
                  illu.tp.annots[match(illu.cross.within[,"rprobe"],
                  row.names(illu.tp.annots)),"targetEnd"] ,
                  illu.tp.annots[match(illu.cross.within[,"sprobe"],
                  row.names(illu.tp.annots)),"targetStart"] ,
                  illu.tp.annots[match(illu.cross.within[,"sprobe"],
                  row.names(illu.tp.annots)),"targetEnd"] )
                                 
names(everything.illu.within)<-c("gene", "rprobe", "sprobe", "pvalue", "explev.r", "explev.s", "correl", "r.dist", "s.dist", "r.targ.start", "r.targ.end", "s.targ.start", "s.targ.end")
attach(everything.illu.within)
diff<- abs(((everything.illu.within[,"r.targ.end"] - everything.illu.within[,"r.targ.start"])/2 + r.targ.start) - ((everything.illu.within[,"s.targ.end"] - everything.illu.within[,"s.targ.start"])/2 + s.targ.start));
everything.illu.within<-cbind(everything.illu.within,diff)
names(everything.illu.within)<-c("gene", "rprobe", "sprobe", "pvalue", "explev.r", "explev.s", "correl", "r.dist", "s.dist", "r.targ.start", "r.targ.end", "s.targ.start", "s.targ.end", "diff")
r.corr.ev<-illu.corr[as.character(everything.illu.within[,"rprobe"])]
s.corr.ev<-illu.corr[as.character(everything.illu.within[,"sprobe"])]
everything.illu.within<-data.frame(everything.illu.within, r.corr.ev, s.corr.ev, pmax(abs(r.corr.ev), abs(s.corr.ev)))
names(everything.illu.within)<-c("gene", "rprobe", "sprobe", "pvalue", "explev.r", "explev.s", "correl", "r.dist", "s.dist", "r.targ.start", "r.targ.end", "s.targ.start", "s.targ.end", "diff", "r.corr", "s.corr", "max.corr.single")
attach(everything.illu.within)
write.table(file="everything.illu.within.txt", everything.illu.within, sep='\t', quote=F)

### AFFY                                 
attach(affy.cross.within);                                 
everything.affy.within<-cbind(affy.cross.within,
                  affy.tp.annots[match(affy.cross.within[,"rprobe"],
                  row.names(affy.tp.annots)),"threeprimeDist"],
                  affy.tp.annots[match(affy.cross.within[,"sprobe"],
                  row.names(affy.tp.annots)),"threeprimeDist"] ,
                  affy.tp.annots[match(affy.cross.within[,"rprobe"],
                  row.names(affy.tp.annots)),"targetStart"] ,
                  affy.tp.annots[match(affy.cross.within[,"rprobe"],
                  row.names(affy.tp.annots)),"targetEnd"] ,
                  affy.tp.annots[match(affy.cross.within[,"sprobe"],
                  row.names(affy.tp.annots)),"targetStart"] ,
                  affy.tp.annots[match(affy.cross.within[,"sprobe"],
                  row.names(affy.tp.annots)),"targetEnd"] )
                                 
names(everything.affy.within)<-c("gene", "rprobe", "sprobe", "pvalue", "explev.r", "explev.s", "correl", "r.dist", "s.dist", "r.targ.start", "r.targ.end", "s.targ.start", "s.targ.end")
attach(everything.affy.within)
diff<- abs(((everything.affy.within[,"r.targ.end"] - everything.affy.within[,"r.targ.start"])/2 + r.targ.start) - ((everything.affy.within[,"s.targ.end"] - everything.affy.within[,"s.targ.start"])/2 + s.targ.start));
everything.affy.within<-cbind(everything.affy.within,diff)
names(everything.affy.within)<-c("gene", "rprobe", "sprobe", "pvalue", "explev.r", "explev.s", "correl", "r.dist", "s.dist", "r.targ.start", "r.targ.end", "s.targ.start", "s.targ.end", "diff")
r.corr.ev<-affy.corr[as.character(everything.affy.within[,"rprobe"])]
s.corr.ev<-affy.corr[as.character(everything.affy.within[,"sprobe"])]
everything.affy.within<-data.frame(everything.affy.within, r.corr.ev, s.corr.ev, pmax(abs(r.corr.ev), abs(s.corr.ev)))
names(everything.affy.within)<-c("gene", "rprobe", "sprobe", "pvalue", "explev.r", "explev.s", "correl", "r.dist", "s.dist", "r.targ.start", "r.targ.end", "s.targ.start", "s.targ.end", "diff", "r.corr", "s.corr", "max.corr.single")
attach(everything.affy.within)
write.table(file="everything.affy.within.txt", everything.affy.within, sep='\t', quote=F)
                                 
                                 
## analysis of the above within-dataset agreements.
attach(everything.illu.within)
# full data
par(mfrow=c(1, 2), cex=1.2)                                 
plot(log2(explev.s), jitter(correl), pch=20, cex=0.2, xlab="Expression level", ylab="Correlation")
plot(log10(diff), correl, pch=20, cex=0.2, ylab="Correlation of profiles across platforms", xlab="Abs(log10(distance between probes across platforms))")
par(mfrow=c(1, 2), cex=1.2)
#### A #####
a<-hist(correl[which(explev.s<5)], breaks=50, plot=F);
b<-hist(correl[which(explev.s>=5)], breaks=50, plot=F);
plot(col="darkgrey", a$mids, a$density, type="s", xlab="Cross-platform correlation", ylab="Density",  main="", xlim=c(-1,1), ylim=c(0,3), lw=2);
par(new=T);
plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,3), lw=2);
mtext("A", side=3, adj=0, cex=3)
legend(-0.8, 3, legend=c("log2 Exp level < 5", "log2 Exp level > 5 "), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5)
### B #####
a<-hist(correl[which(log10(diff) <= 2.69)], breaks=50, plot=F);
b<-hist(correl[which(log10(diff) > 2.69)], breaks=50, plot=F);
plot(col="darkgrey", a$mids, a$density, type="s", xlab="Within-platform correlation", ylab="Density",  main="", xlim=c(-1,1), ylim=c(0,3), lw=2);
par(new=T);
plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,3), lw=2);
mtext("B", side=3, adj=0, cex=3)
legend(-0.8, 3, legend=c("Probe spacing < 300bp", "Probe spacing > 300bp"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5)


attach(everything.affy.within)
# full data
par(mfrow=c(1, 2), cex=1.2)                                 
plot(explev.s, jitter(correl), pch=20, cex=0.2, xlab="Expression level", ylab="Correlation")
plot(log10(diff), correl, pch=20, cex=0.2, ylab="Correlation of profiles across platforms", xlab="Abs(log10(distance between probes across platforms))")

par(mfrow=c(1, 2), cex=1.2)
#### A #####
a<-hist(correl[which(explev.s<5)], breaks=50, plot=F);
b<-hist(correl[which(explev.s>=5)], breaks=50, plot=F);
plot(col="darkgrey", a$mids, a$density, type="s", xlab="Within-platform correlation", ylab="Density",  main="", xlim=c(-1,1), ylim=c(0,3), lw=2);
par(new=T);
plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,3), lw=2);
mtext("A", side=3, adj=0, cex=3)
legend(-0.8, 3, legend=c("log2 Exp level < 5", "log2 Exp level > 5 "), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5)
### B #####
a<-hist(correl[which(log10(diff) <= 2.69)], breaks=50, plot=F);
b<-hist(correl[which(log10(diff) > 2.69)], breaks=50, plot=F);
plot(col="darkgrey", a$mids, a$density, type="s", xlab="Within-platform correlation", ylab="Density",  main="", xlim=c(-1,1), ylim=c(0,3), lw=2);
par(new=T);
plot(col="black", b$mids, b$density, type="s", xlab="", ylab="", main="", xlim=c(-1,1), ylim=c(0,3), lw=2);
mtext("B", side=3, adj=0, cex=3)
legend(-0.8, 3, legend=c("Probe spacing < 300bp", "Probe spacing > 300bp"), bty="n", col=c("darkgrey", "black"), lty=1, lw=2, cex=0.5)
                                 
affy.puzzling.within.probes<-read.table("affy.puzzling.within.probes.txt", header=F)
illu.puzzling.within.probes<-read.table("illu.puzzling.within.probes.txt", header=F)
length(which(as.character(affy.puzzling.within.probes[,1]) %in% as.character(affy.puzzling.probes[,1])))
length(which(as.character(illu.puzzling.within.probes[,1]) %in% as.character(illu.puzzling.probes[,1])))
fisher.test(rbind(c(3,10),c(940,36023)))
fisher.test(rbind(c(37,103),c(940,36023)))