###############################################3
## To check if there is batch effect in your affy gene expression data
## input: CEL files in wk.dir
## maximum 12 colors (12 different days)
## by Zhengdeng Lei (research fellow at Duke-NUS)
## April 2th 2011
## install the packages if you don't have them in R
# source("http://bioconductor.org/biocLite.R")
# biocLite("affy")
# install.packages("scatterplot3d")
rm(list=ls())
wk.dir <- "E:\\CEL\\GEMINI\\GEMINI37"
setwd(wk.dir)
read.from.txt = 1
genes <- read.table('GEMINI37.rma.txt', header=T)
# read.from.txt = 0
# library(affy)
# data <- ReadAffy()
# genes <- as.matrix(exprs(rma(data)))
# ## Or you can try mas5
# #genes <- log2(as.matrix(exprs(mas5(data))))
genes<-t(genes)
pcs<-prcomp(genes)
summary(pcs)
library(scatterplot3d)
PC1<-pcs$x[,1]
PC2<-pcs$x[,2]
PC3<-pcs$x[,3]
files.info <- data.frame()
cel.files <- list.files(wk.dir, pattern = ".CEL", ignore.case = T)
for (i in 1:length(cel.files))
{
cel.file <- paste(wk.dir, cel.files[i], sep="\\")
cel.header <- readLines(cel.file, 20)
datheader.line <- 0
for (line in 1:length(cel.header))
{
if ( substr(cel.header[line], 1, 9)=="DatHeader" )
{
datheader.line <- line
break
}
}
datheader.line
header.fields <- strsplit(cel.header[datheader.line], " ")
cel.filename <- strsplit(header.fields[[1]][3], ":")[[1]][1]
exp.date <- header.fields[[1]][18]
exp.time <- header.fields[[1]][19]
if (read.from.txt) {
cel.files[i] <- gsub("-", ".", cel.files[i])
cel.files[i] <- gsub("\\+", ".", cel.files[i])
cel.files[i] <- gsub("\\(", ".", cel.files[i])
cel.files[i] <- gsub("\\)", ".", cel.files[i])
if(length(grep("^[0-9]", cel.files[i]))) cel.files[i]<- paste("X", cel.files[i], sep="")
}
files.info[cel.files[i],1] <- cel.filename
files.info[cel.files[i],2] <- exp.date
files.info[cel.files[i],3] <- exp.time
files.info[cel.files[i],4] <- "#000000"
}
colnames(files.info) <- c("SAMPLE_ID", "EXP_DATE", "EXP_TIME", "COLOR")
files.info
#files.info[with(files.info, order(EXP_DATE, EXP_TIME)), ]
#files.info[order(chron(names(files.info)))]
date.factor <- as.factor(files.info$EXP_DATE)
date.levels <- levels(date.factor)
nlevels(date.factor)
###########################
# Generate a vector of colors that are as different from one another as possible
barvy<-rainbow(12)
vyber<-c(1,7,3,9,5,11,2,8,4,10,6,12)
color.list <- c("red", "yellow", "green", "cyan", "blue", "purple")
num.of.colors <- nlevels(date.factor)
for (c in 1:num.of.colors)
{
files.info$COLOR[which(files.info$EXP_DATE==date.levels[c])] <- color.list[c]
num.in.batch <- length(which(files.info$EXP_DATE==date.levels[c]))
date.levels[c] <- paste(date.levels[c], num.in.batch, sep=" n=")
}
files.info
#rownames(genes)
write.table(files.info, file="files.info.user.batch.txt",sep="\t")
group.colors <- rep("#000000", length(PC1))
group.colors <- files.info[rownames(genes), "COLOR"]
par(mfrow=c(2,2), mar=c(0,0,0,0))
scatterplot3d(PC1,PC2,PC3, main="PCA 1", color=group.colors, pch=16)
scatterplot3d(PC2,PC3,PC1, main="PCA 2", color=group.colors, pch=16)
scatterplot3d(PC3,PC2,PC1, main="PCA 3", color=group.colors, pch=16)
#plot(1, type="n", axes=F, xlab="", ylab="", mar=c(0,0,0,0))
plot.new()
if (num.of.colors < 6)
{
legend.txt <- date.levels[1:num.of.colors]
legend.col <- color.list[1:num.of.colors]
legend(0, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
} else {
legend.txt <- date.levels[1:12]
legend.col <- color.list[1:12]
legend(0, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
legend.txt <- date.levels[13:num.of.colors]
legend.col <- color.list[13:num.of.colors]
legend(0.5, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
}
text(0.75, 0.1, "By Zhengdeng Lei \nleizhengdeng@hotmail.com", cex=0.8)
pdf(file="Batch_Effect_Date.pdf", width=10, height=10)
par(mfrow=c(2,2), mar=c(0,0,0,0))
scatterplot3d(PC1,PC2,PC3, main="PCA 1", color=group.colors, pch=16)
scatterplot3d(PC2,PC3,PC1, main="PCA 2", color=group.colors, pch=16)
scatterplot3d(PC3,PC2,PC1, main="PCA 3", color=group.colors, pch=16)
#plot(1, type="n", axes=F, xlab="", ylab="", mar=c(0,0,0,0))
plot.new()
if (num.of.colors < 12)
{
legend.txt <- date.levels[1:num.of.colors]
legend.col <- color.list[1:num.of.colors]
legend(0, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
} else {
legend.txt <- date.levels[1:12]
legend.col <- color.list[1:12]
legend(0, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
legend.txt <- date.levels[13:num.of.colors]
legend.col <- color.list[13:num.of.colors]
legend(0.5, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
}
text(0.75, 0.1, "By Zhengdeng Lei \nleizhengdeng@hotmail.com", cex=0.8)
dev.off()
print(paste("The figure is saved as ", wk.dir, "Batch_Effect_Date.pdf ", sep="\\"))
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
###############################################3
## To check if there is batch effect in your affy gene expression data
## input: CEL files in wk.dir
## maximum 7 colors (red, yellow, green, cyn, blue, purple, black)
## by Zhengdeng Lei (research fellow at Duke-NUS)
## April 2th 2011
## install the packages if you don't have them in R
# source("http://bioconductor.org/biocLite.R")
# biocLite("affy")
# install.packages("scatterplot3d")
rm(list=ls())
wk.dir <- "E:\\CEL\\GEMINI\\GEMINI37"
setwd(wk.dir)
genes <- read.table('GEMINI37.rma.txt', header=T)
genes<-t(genes)
pcs<-prcomp(genes)
summary(pcs)
files.info <- data.frame()
files.info <- read.table("files.info.user.batch.txt", header=T, sep="\t")
rownames(files.info) <- files.info[,1]
head(files.info)
library(scatterplot3d)
PC1<-pcs$x[,1]
PC2<-pcs$x[,2]
PC3<-pcs$x[,3]
#rownames(genes)
group.colors <- rep("#000000", length(PC1))
group.colors <- files.info[rownames(genes), "COLOR"]
par(mfrow=c(2,2), mar=c(0,0,0,0))
scatterplot3d(PC1,PC2,PC3, main="PCA 1", color=group.colors, pch=16)
scatterplot3d(PC2,PC3,PC1, main="PCA 2", color=group.colors, pch=16)
scatterplot3d(PC3,PC2,PC1, main="PCA 3", color=group.colors, pch=16)
pdf(file="Batch_Effect_Xbatches.pdf", width=10, height=10)
par(mfrow=c(2,2), mar=c(0,0,0,0))
scatterplot3d(PC1,PC2,PC3, main="PCA 1", color=group.colors, pch=16)
scatterplot3d(PC2,PC3,PC1, main="PCA 2", color=group.colors, pch=16)
scatterplot3d(PC3,PC2,PC1, main="PCA 3", color=group.colors, pch=16)
dev.off()
No comments:
Post a Comment