Zhengdeng Lei, PhD

Zhengdeng Lei, PhD

2009 - Present Research Fellow at Duke-NUS, Singapore
2007 - 2009 High Throughput Computational Analyst, Memorial Sloan-Kettering Cancer Center, New York
2003 - 2007 PhD, Bioinformatics, University of Illinois at Chicago

Saturday, April 2, 2011

Check batch effect in your affy gene expression data

###############################################3
## To check if there is batch effect in your affy gene expression data
## input: CEL files in wk.dir
## maximum 12 colors (12 different days)
## by Zhengdeng Lei (research fellow at Duke-NUS)
## April 2th 2011

## install the packages if you don't have them in R
# source("http://bioconductor.org/biocLite.R")
# biocLite("affy")
# install.packages("scatterplot3d")


rm(list=ls())

wk.dir <- "E:\\CEL\\GEMINI\\GEMINI37"
setwd(wk.dir)

 read.from.txt = 1
 genes <- read.table('GEMINI37.rma.txt', header=T)
# read.from.txt = 0
#  library(affy)
#  data <- ReadAffy()
#  genes <- as.matrix(exprs(rma(data)))
#  ## Or you can try mas5
#  #genes <- log2(as.matrix(exprs(mas5(data))))


genes<-t(genes)
pcs<-prcomp(genes)
summary(pcs)


library(scatterplot3d)
PC1<-pcs$x[,1]
PC2<-pcs$x[,2]
PC3<-pcs$x[,3]




files.info <- data.frame()
cel.files <- list.files(wk.dir, pattern = ".CEL", ignore.case = T)
for (i in 1:length(cel.files))
{
 cel.file <- paste(wk.dir, cel.files[i], sep="\\")
 cel.header <- readLines(cel.file, 20)

 datheader.line <- 0
 for (line in 1:length(cel.header))
 {
  if ( substr(cel.header[line], 1, 9)=="DatHeader" )
  {
   datheader.line <- line
   break
  }
 }

 datheader.line

 header.fields <- strsplit(cel.header[datheader.line], " ")

 cel.filename <- strsplit(header.fields[[1]][3], ":")[[1]][1]
 exp.date <- header.fields[[1]][18]
 exp.time <- header.fields[[1]][19]

if (read.from.txt) {
cel.files[i] <- gsub("-", ".", cel.files[i])
cel.files[i] <- gsub("\\+", ".", cel.files[i])
cel.files[i] <- gsub("\\(", ".", cel.files[i])
cel.files[i] <- gsub("\\)", ".", cel.files[i])

if(length(grep("^[0-9]", cel.files[i]))) cel.files[i]<- paste("X", cel.files[i], sep="")
}

 files.info[cel.files[i],1] <- cel.filename
 files.info[cel.files[i],2] <- exp.date
 files.info[cel.files[i],3] <- exp.time
 files.info[cel.files[i],4] <- "#000000"

}
colnames(files.info) <- c("SAMPLE_ID", "EXP_DATE", "EXP_TIME", "COLOR")
files.info






#files.info[with(files.info, order(EXP_DATE, EXP_TIME)), ]
#files.info[order(chron(names(files.info)))]





date.factor <- as.factor(files.info$EXP_DATE)
date.levels <- levels(date.factor)
nlevels(date.factor)
###########################
# Generate a vector of colors that are as different from one another as possible
barvy<-rainbow(12)
vyber<-c(1,7,3,9,5,11,2,8,4,10,6,12)
color.list <- c("red", "yellow", "green", "cyan", "blue", "purple")

num.of.colors <- nlevels(date.factor)
for (c in 1:num.of.colors)
{
 files.info$COLOR[which(files.info$EXP_DATE==date.levels[c])] <- color.list[c]
 num.in.batch <- length(which(files.info$EXP_DATE==date.levels[c]))
 date.levels[c] <- paste(date.levels[c], num.in.batch, sep="   n=")
}

files.info

#rownames(genes)
write.table(files.info, file="files.info.user.batch.txt",sep="\t")




group.colors <- rep("#000000", length(PC1))
group.colors <- files.info[rownames(genes), "COLOR"]




par(mfrow=c(2,2), mar=c(0,0,0,0))
scatterplot3d(PC1,PC2,PC3, main="PCA 1", color=group.colors, pch=16)
scatterplot3d(PC2,PC3,PC1, main="PCA 2", color=group.colors, pch=16)
scatterplot3d(PC3,PC2,PC1, main="PCA 3", color=group.colors, pch=16)

#plot(1, type="n", axes=F, xlab="", ylab="", mar=c(0,0,0,0))
plot.new()
if (num.of.colors < 6)
{
legend.txt <- date.levels[1:num.of.colors]
legend.col <- color.list[1:num.of.colors]
legend(0, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
} else {
legend.txt <- date.levels[1:12]
legend.col <- color.list[1:12]
legend(0, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)

legend.txt <- date.levels[13:num.of.colors]
legend.col <- color.list[13:num.of.colors]
legend(0.5, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)

}

text(0.75, 0.1, "By Zhengdeng Lei \nleizhengdeng@hotmail.com", cex=0.8)





pdf(file="Batch_Effect_Date.pdf", width=10, height=10)
par(mfrow=c(2,2), mar=c(0,0,0,0))
scatterplot3d(PC1,PC2,PC3, main="PCA 1", color=group.colors, pch=16)
scatterplot3d(PC2,PC3,PC1, main="PCA 2", color=group.colors, pch=16)
scatterplot3d(PC3,PC2,PC1, main="PCA 3", color=group.colors, pch=16)
#plot(1, type="n", axes=F, xlab="", ylab="", mar=c(0,0,0,0))
plot.new()
if (num.of.colors < 12)
{
legend.txt <- date.levels[1:num.of.colors]
legend.col <- color.list[1:num.of.colors]
legend(0, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
} else {
legend.txt <- date.levels[1:12]
legend.col <- color.list[1:12]
legend(0, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)

legend.txt <- date.levels[13:num.of.colors]
legend.col <- color.list[13:num.of.colors]
legend(0.5, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)

}

text(0.75, 0.1, "By Zhengdeng Lei \nleizhengdeng@hotmail.com", cex=0.8)


dev.off()
print(paste("The figure is saved as ", wk.dir, "Batch_Effect_Date.pdf ", sep="\\"))

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////



###############################################3
## To check if there is batch effect in your affy gene expression data
## input: CEL files in wk.dir
## maximum 7 colors (red, yellow, green, cyn, blue, purple, black)
## by Zhengdeng Lei (research fellow at Duke-NUS)
## April 2th 2011

## install the packages if you don't have them in R
# source("http://bioconductor.org/biocLite.R")
# biocLite("affy")
# install.packages("scatterplot3d")


rm(list=ls())
wk.dir <- "E:\\CEL\\GEMINI\\GEMINI37"
setwd(wk.dir)
genes <- read.table('GEMINI37.rma.txt', header=T)


genes<-t(genes)
pcs<-prcomp(genes)
summary(pcs)





files.info <- data.frame()
files.info <- read.table("files.info.user.batch.txt", header=T, sep="\t")
rownames(files.info) <- files.info[,1]
head(files.info)


library(scatterplot3d)
PC1<-pcs$x[,1]
PC2<-pcs$x[,2]
PC3<-pcs$x[,3]

#rownames(genes)



group.colors <- rep("#000000", length(PC1))
group.colors <- files.info[rownames(genes), "COLOR"]



par(mfrow=c(2,2), mar=c(0,0,0,0))
scatterplot3d(PC1,PC2,PC3, main="PCA 1", color=group.colors, pch=16)
scatterplot3d(PC2,PC3,PC1, main="PCA 2", color=group.colors, pch=16)
scatterplot3d(PC3,PC2,PC1, main="PCA 3", color=group.colors, pch=16)



pdf(file="Batch_Effect_Xbatches.pdf", width=10, height=10)
par(mfrow=c(2,2), mar=c(0,0,0,0))
scatterplot3d(PC1,PC2,PC3, main="PCA 1", color=group.colors, pch=16)
scatterplot3d(PC2,PC3,PC1, main="PCA 2", color=group.colors, pch=16)
scatterplot3d(PC3,PC2,PC1, main="PCA 3", color=group.colors, pch=16)
dev.off()

No comments:

Post a Comment