Zhengdeng Lei, PhD

Zhengdeng Lei, PhD

2009 - Present Research Fellow at Duke-NUS, Singapore
2007 - 2009 High Throughput Computational Analyst, Memorial Sloan-Kettering Cancer Center, New York
2003 - 2007 PhD, Bioinformatics, University of Illinois at Chicago

Tuesday, April 26, 2011

Median Centered

data.log10.MeanCentered <- sweep(data.log10, 2, apply(data.log10, 2, median))

Tuesday, April 19, 2011

Enrichment test in cancer subtypes





Invasive
Proliferative
Metabolic
Hypermethylated  
4110       
1536         
988   
Hypomethylated   
625        
1263         
284

Q1: Is the Invasive subtype enriched with Hypermethylated CpGs?
q <- 4110 # number of success (white balls) drawn
m <- 4110+1536+988 # number of success (white) in the urn
n <- 625+1263+284 # number of fail (black) in the urn
k <- 4110+625 # number of balls drawn out
p.value <- phyper(q-1,m,n,k, lower.tail=F)
# why q-1, because we want to have Pr(X>=x) instead of Pr(X>x)
p.value
= 2.975069e-162
YES
Conclusion: The Invasive subtype is significantly enriched with Hypermethylated CpGs.

Q2: Is the Invasive subtype enriched with Hypomethylated CpGs?
q <- 625 # number of success (white balls) drawn
m <- 625+1263+284 # number of fail (black) in the urn
n <- 4110+1536+988 # number of success (white) in the urn
k <- 4110+625 # number of balls drawn out
p.value <- phyper(q-1,m,n,k, lower.tail=F)
p.value
=1
No

Q3: Is the Proliferative subtype enriched with Hypermethylated CpGs?
q <- 1536
m <- 4110+1536+988
n <- 625+1263+284
k <- 1263+1536
p.value <- phyper(q-1,m,n,k, lower.tail=F)
p.value
 = 1
No



Q4: Is the Proliferative subtype enriched with Hypomethylated CpGs?
q <- 1263
m <- 625+1263+284
n <- 4110+1536+988
k <- 1263+1536
p.value <- phyper(q-1,m,n,k, lower.tail=F)
p.value
= 3.765696e-193
YES
Conclusion: The Proliferative subtype is significantly enriched with Hypomethylated CpGs.





Q5: Is the Metabolic subtype enriched with Hypermethylated CpGs?
q <- 988 # number of success (white balls) drawn
m <- 4110+1536+988 # number of success (white) in the urn
n <- 625+1263+284 # number of fail (black) in the urn
k <- 988+284 # number of balls drawn out
p.value <- phyper(q-1,m,n,k, lower.tail=F)
p.value
= 0.01919936
YES
Conclusion: The Metabolic subtype is significantly enriched with Hypermethylated CpGs.




Q6: Is the Metabolic subtype enriched with Hypomethylated CpGs?
q <- 284 #
m <- 625+1263+284
n <- 4110+1536+988
k <- 988+284
p.value <- phyper(q-1,m,n,k, lower.tail=F)
p.value

= 0.9839128
No






See also:
http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2724271/
An integrative genomics approach identifies Hypoxia Inducible Factor-1 (HIF-1)-target genes that form the core response to hypoxia

Friday, April 15, 2011

Median survival time

smed <- function(x) {

        ox <- capture.output(print(x))
        n <- length(ox)
        tmp <- t(sapply(ox[4:n],        
                function(l) strsplit(l, split=' +')[[1]]))
        nres <- strsplit(ox[3],split=' +')[[1]][2:6]
        res <- matrix(as.numeric(tmp[,2:6]), ncol=5,
                dimnames=list(tmp[,1], nres))
        res

}

# example:
#library(survival)
#fit1 <- survfit(Surv(time, status) ~ 1, data=aml)
#sf1 <- smed(fit1)
#sf1
#fit <- survfit(Surv(time, status) ~ x, data=aml)
#sf <- smed(fit)
#sf

http://tolstoy.newcastle.edu.au/R/help/06/05/26713.html

By Xiaochun Li

Wednesday, April 13, 2011

Change the $PATH

[leiz@136-25 ~]$ echo $PATH
/usr/lib64/qt-3.3/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/usr/NX/bin:/home/leiz/bin:/home/leiz/NGS/bowtie:/home/leiz/NGS/bowtie/indexes:/home/leiz/NGS/tophat:/home/leiz/NGS/samtools:/home/leiz/NGS/cufflinks



[leiz@136-25 ~]$ more .bash_profile
# .bash_profile

# Get the aliases and functions
if [ -f ~/.bashrc ]; then
        . ~/.bashrc
fi

# User specific environment and startup programs

PATH=$PATH:$HOME/bin:/home/leiz/NGS/bowtie:/home/leiz/NGS/bowtie/indexes:/home/leiz/NGS/tophat:/home/leiz/NGS/samtools:$HOME/NGS/cufflinks


export PATH

vi
source .bash_profile

Tuesday, April 5, 2011

wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz >out.txt &

tar -zxvf chromFa.tar.gz
//remove random, unmapped, haplotype, chrM
rm *random*
rm *Un*
rm *hap*
rm chrM.fa

cat chr*.fa > hg19.fa
To check what chromosome inside:
grep ">" hg19.fa

//build ref genome index
cd /home/leiz/NGS/hg19
bwa index -a bwtsw /home/leiz/NGS/hg19/hg19.fa &
// use top -u leiz to check progress

//get RNA-seq raw data for AGS cell line
cd /home/leiz/NGS/GEMINI
scp NUSSTF\\gmslz@172.25.138.12:/data/public/RNA-seq_illumina_raw_data/YMM/AGS*  ./

//align by bwa
cd /home/leiz/NGS/GEMINI
bwa aln ../hg19/hg19.fa AGS_1_sequence.fastq >AGS_1.sai &
bwa aln ../hg19/hg19.fa AGS_2_sequence.fastq >AGS_2.sai &
./bwa sampe ../hg19/hg19.fa AGS_1.sai AGS_2.sai  AGS_1_sequence.fastq  AGS_2_sequence.fastq > AGS.pe.bwa.sam

//index hg19
cd /home/leiz/NGS/hg19
samtools faidx hg19.fa &


//sed 's/[ \t]*$//g' samfile
// import BWA alignment
cd /home/leiz/NGS/GEMINI

//sed 's/[ \t]*$//g' AGS.pe.bwa.sam >AGS.pe.bwa.sed.sam








//ref: http://seqanswers.com/forums/showthread.php?t=9650
//difference with samtools view -bt?

samtools import ../hg19/hg19.fa.fai AGS.pe.bwa.sed.sam AGS.pe.bwa.bam

//sort bam

samtools sort AGS.pe.bwa.bam AGS.pe.bwa.sort &

//remove PRC duplicates by picard, use AS=true so that picard knows the bam is sorted.

java -jar ../picard-tools-1.44/MarkDuplicates.jar I=AGS.pe.bwa.sort.bam O=AGS.pe.bwa.sort.rmdup.bam M=PCR_duplicates.M.txt REMOVE_DUPLICATES=true AS=true



//index sorted bam
samtools index AGS.pe.bwa.sort.rmdup.bam

//consensus calling
samtools pileup -cf ../hg19/hg19.fa AGS.pe.bwa.sort.rmdup.bam > AGS.pe.bwa.sort.rmdup.bam.pileup &


samtools pileup -cvf ../hg19/hg19.fa AGS.pe.bwa.sort.rmdup.bam > AGS.variants.pileup &


perl ../samtools/misc/samtools.pl varFilter AGS.variants.pileup | awk '$6>=20' > AGS.final.pileup





http://sourceforge.net/apps/mediawiki/samtools/index.php?title=SAM_FAQ

I want to call SNPs and short indels.



samtools pileup -vcf ref.fa aln.bam | tee raw.txt | samtools.pl varFilter -D100 > flt.txt
awk '($3=="*"&&$6>=50)||($3!="*"&&$6>=20)' flt.txt > final.txt





http://plindenbaum.blogspot.com/2010/04/bam-bwa-and-samtools-my-notebook.html

http://seqanswers.com/forums/showthread.php?t=8364

#reference is yeast.nt
#query is yeast.fasta

bwa index /home/jill/bwa-0.5.8c/Project/yeast.fasta 

bwa index /home/jill/bwa-0.5.8c/Project/yeast.nt

bwa aln /home/jill/bwa-0.5.8c/Project/yeast.nt /home/jill/bwa-0.5.8c/Project/yeast.fasta  > /home/jill/bwa-0.5.8c/Project/yeastbwaout.sai

#[bwa_aln] 17bp reads: max_diff = 2
#[bwa_aln] 38bp reads: max_diff = 3
#[bwa_aln] 64bp reads: max_diff = 4
#[bwa_aln] 93bp reads: max_diff = 5
#[bwa_aln] 124bp reads: max_diff = 6
#[bwa_aln] 157bp reads: max_diff = 7
#[bwa_aln] 190bp reads: max_diff = 8
#[bwa_aln] 225bp reads: max_diff = 9
#[bwa_aln_core] calculate SA coordinate... 0.15 sec
#[bwa_aln_core] write to the disk... 0.00 sec
#[bwa_aln_core] 1 sequences have been processed.

bwa samse /home/jill/bwa-0.5.8c/Project/yeast.nt  /home/jill/bwa-0.5.8c/Project/yeastbwaout.sai /home/jill/bwa-0.5.8c/Project/yeast.fasta  > /home/jill/bwa-0.5.8c/Project/yeastbwaout.sam 

#[bwa_aln_core] convert to sequence coordinate... 0.01 sec
#[bwa_aln_core] refine gapped alignments... 0.00 sec
#[bwa_aln_core] print alignments... 0.01 sec
#[bwa_aln_core] 1 sequences have been processed.

samtools faidx /home/jill/bwa-0.5.8c/Project/yeast.nt

samtools view -bt /home/jill/bwa-0.5.8c/Project/yeast.nt.fai /home/jill/bwa-0.5.8c/Project/yeastbwaout.sam > /home/jill/bwa-0.5.8c/Project/yeastbwaout.bam

#[samopen] SAM header is present: 17 sequences.

#index bam
samtools index /home/jill/bwa-0.5.8c/Project/yeastbwaout.bam

#Sorts bam
samtools sort /home/jill/bwa-0.5.8c/Project/yeastbwaout.bam /home/jill/bwa-0.5.8c/Project/yeastbwaoutsort 

samtools index /home/jill/bwa-0.5.8c/Project/yeastbwaoutsort.bam

samtools pileup -vcf /home/jill/bwa-0.5.8c/Project/yeast.nt.fai /home/jill/bwa-0.5.8c/Project/yeastbwaoutsort.bam | tee /home/jill/bwa-0.5.8c/Project/raw.txt | /home/jill/samtools-0.1.11/misc/samtools.pl varFilter -D100 > /home/jill/bwa-0.5.8c/Project/flt.txt
awk '($3=="*"&&$6>=50)||($3!="*"&&$6>=20)' /home/jill/bwa-0.5.8c/Project/flt.txt > /home/jill/bwa-0.5.8c/Project/final.txt

Monday, April 4, 2011

Batch effect

Several methods have been proposed that can adjust for batch effects provided a large number of samples (> 25) are included in each batch[1,2] .

ComBat: an empirical Bayes method has been described [3] that adjusts for batch effects even when the number of samples in each batch is small (< 10)

The aforementioned methods can adjust for batch effects provided that samples from "each biological group" are represented in every batch.

"each biological group" means that in each batch, each biological phenotype must have at least one sample in each batch, for example, each batch must have both disease and control, or each batch must have cancer subtypes “invasive”, “proliferative”, and “metabolic”

Suppose "metabolic" subtype consist of 1/3 in all cancer patients.
e.g. 1 million in 3 million cancer patients.
Then for each batch, in general, you have at least 6 tumors, thus you have 0.91 probability to have at least one metabolic sample in it.



use lib "./";
use Stat;
my $p = Stat::phyper_enrichment(3E6, 1E6, 6, 1);
print $p;
------

6 -> 91%
7 -> 94%
8 -> 96%



Reference:

Empirical Bayes accomodation of batch-effects in microarray data using identical replicate reference samples: application to RNA expression profiling of blood from Duchenne muscular dystrophy patients
[1] Alter O, Brown PO, Botstein D. Singular value decomposition for genome-wide expression data processing and modeling. Proc Natl Acad Sci USA. 2000;97:10101–10106. doi: 10.1073/pnas.97.18.10101. [PMC free article] [PubMed] [Cross Ref]
[2] Benito M, Parker J, Du Q, Wu J, Xiang D, Perou CM, Marron JS. Adjustment of systematic microarray data biases. Bioinformatics. 2004;20:105–114. doi: 10.1093/bioinformatics/btg385. [PubMed] [Cross Ref]
[3] Johnson WE, Li C, Rabinovic A. Adjusting batch effects in microarray expression data using empirical Bayes methods. Biostatistics. 2007;8:118–127. doi: 10.1093/biostatistics/kxj037.[PubMed] [Cross Ref]

Saturday, April 2, 2011

Set up NGS aligner


1. MAQ











cd /home/leiz/NGS/downloads
wget http://sourceforge.net/projects/maq/files/maq/0.7.1/maq-0.7.1.tar.bz2/download
tar -jxvf maq-0.7.1.tar.bz2
cd maq-0.7.1
./configure --prefix=/home/leiz/NGS/maq-0.7.1
make
make install



2. BWA
source: http://bio-bwa.sourceforge.net/

cd /home/leiz/NGS
wget http://sourceforge.net/projects/bio-bwa/files/bwa-0.5.9.tar.bz2/download
tar -jxvf bwa-0.5.9.tar.bz2
cd bwa-0.5.9
make



3. bowtie
source: http://bowtie-bio.sourceforge.net/index.shtml


cd /home/leiz/NGS
wget http://sourceforge.net/projects/bowtie-bio/files/bowtie/0.12.7/bowtie-0.12.7-linux-x86_64.zip/download
unzip bowtie-0.12.7-linux-x86_64.zip 
bowtie-0.12.7
./bowtie


#copy hg19.fa to bowtie index folder
cp ./hg19.fa ../bowtie-0.12.7/indexes/
 nohup bowtie-build ./hg19.fa ../bowtie-0.12.7/indexes/hg19 >screen.txt &
cd ../bowtie-0.12.7/indexes/hg19


bowtie-inspect --summary hg19
vi $HOME/.bashrc 
# User specific aliases and functions
export BOWTIE_INDEXES=/home/leiz/NGS/bowtie-0.12.7/indexes/


source $HOME/.bashrc 
 bowtie -c hg19 ATGACGGCGGCCGAGAAGTTGGATTTCTTGGT

nohup tophat -r 200  --num-threads=4 hg19 AGS_1_sequence.fastq  AGS_2_sequence.fastq >screen.txt &


4. samtools










cd /home/leiz/NGS/downloads
wget http://sourceforge.net/projects/samtools/files/samtools/0.1.16/samtools-0.1.16.tar.bz2/download
tar -jxvf samtools-0.1.16.tar.bz2
cd  samtools-0.1.16
mv samtools-0.1.16 .. 
make


5. picard
java 1.6 need: http://www.java.com/en/download/manual.jsp
cd /home/leiz/java

wget http://javadl.sun.com/webapps/download/AutoDL?BundleId=48338 -O jre-6u25-linux-x64.bin






chmod a+x jre-6u25-linux-x64.bin
./





jre-6u25-linux-x64.bin



add java path /home/leiz/java/jre1.6.0_25/bin to $PATH in .bash_profile
source .bash_profile
java -version










cd /home/leiz/NGS/downloads
wget http://sourceforge.net/projects/picard/files/picard-tools/1.44/picard-tools-1.44.zip/download
unzip picard-tools-1.44.zipmv picard-tools-1.44 .. 


6. cufflinks

http://sourceforge.net/projects/boost/files/boost/1.46.1/boost_1_46_1.zip/download
wget http://cufflinks.cbcb.umd.edu/downloads/cufflinks-0.9.3.Linux_x86_64.tar.gz
http://cufflinks.cbcb.umd.edu/downloads/test_data.sam
cufflinks ./test_data.sam

Check batch effect in your affy gene expression data

###############################################3
## To check if there is batch effect in your affy gene expression data
## input: CEL files in wk.dir
## maximum 12 colors (12 different days)
## by Zhengdeng Lei (research fellow at Duke-NUS)
## April 2th 2011

## install the packages if you don't have them in R
# source("http://bioconductor.org/biocLite.R")
# biocLite("affy")
# install.packages("scatterplot3d")


rm(list=ls())

wk.dir <- "E:\\CEL\\GEMINI\\GEMINI37"
setwd(wk.dir)

 read.from.txt = 1
 genes <- read.table('GEMINI37.rma.txt', header=T)
# read.from.txt = 0
#  library(affy)
#  data <- ReadAffy()
#  genes <- as.matrix(exprs(rma(data)))
#  ## Or you can try mas5
#  #genes <- log2(as.matrix(exprs(mas5(data))))


genes<-t(genes)
pcs<-prcomp(genes)
summary(pcs)


library(scatterplot3d)
PC1<-pcs$x[,1]
PC2<-pcs$x[,2]
PC3<-pcs$x[,3]




files.info <- data.frame()
cel.files <- list.files(wk.dir, pattern = ".CEL", ignore.case = T)
for (i in 1:length(cel.files))
{
 cel.file <- paste(wk.dir, cel.files[i], sep="\\")
 cel.header <- readLines(cel.file, 20)

 datheader.line <- 0
 for (line in 1:length(cel.header))
 {
  if ( substr(cel.header[line], 1, 9)=="DatHeader" )
  {
   datheader.line <- line
   break
  }
 }

 datheader.line

 header.fields <- strsplit(cel.header[datheader.line], " ")

 cel.filename <- strsplit(header.fields[[1]][3], ":")[[1]][1]
 exp.date <- header.fields[[1]][18]
 exp.time <- header.fields[[1]][19]

if (read.from.txt) {
cel.files[i] <- gsub("-", ".", cel.files[i])
cel.files[i] <- gsub("\\+", ".", cel.files[i])
cel.files[i] <- gsub("\\(", ".", cel.files[i])
cel.files[i] <- gsub("\\)", ".", cel.files[i])

if(length(grep("^[0-9]", cel.files[i]))) cel.files[i]<- paste("X", cel.files[i], sep="")
}

 files.info[cel.files[i],1] <- cel.filename
 files.info[cel.files[i],2] <- exp.date
 files.info[cel.files[i],3] <- exp.time
 files.info[cel.files[i],4] <- "#000000"

}
colnames(files.info) <- c("SAMPLE_ID", "EXP_DATE", "EXP_TIME", "COLOR")
files.info






#files.info[with(files.info, order(EXP_DATE, EXP_TIME)), ]
#files.info[order(chron(names(files.info)))]





date.factor <- as.factor(files.info$EXP_DATE)
date.levels <- levels(date.factor)
nlevels(date.factor)
###########################
# Generate a vector of colors that are as different from one another as possible
barvy<-rainbow(12)
vyber<-c(1,7,3,9,5,11,2,8,4,10,6,12)
color.list <- c("red", "yellow", "green", "cyan", "blue", "purple")

num.of.colors <- nlevels(date.factor)
for (c in 1:num.of.colors)
{
 files.info$COLOR[which(files.info$EXP_DATE==date.levels[c])] <- color.list[c]
 num.in.batch <- length(which(files.info$EXP_DATE==date.levels[c]))
 date.levels[c] <- paste(date.levels[c], num.in.batch, sep="   n=")
}

files.info

#rownames(genes)
write.table(files.info, file="files.info.user.batch.txt",sep="\t")




group.colors <- rep("#000000", length(PC1))
group.colors <- files.info[rownames(genes), "COLOR"]




par(mfrow=c(2,2), mar=c(0,0,0,0))
scatterplot3d(PC1,PC2,PC3, main="PCA 1", color=group.colors, pch=16)
scatterplot3d(PC2,PC3,PC1, main="PCA 2", color=group.colors, pch=16)
scatterplot3d(PC3,PC2,PC1, main="PCA 3", color=group.colors, pch=16)

#plot(1, type="n", axes=F, xlab="", ylab="", mar=c(0,0,0,0))
plot.new()
if (num.of.colors < 6)
{
legend.txt <- date.levels[1:num.of.colors]
legend.col <- color.list[1:num.of.colors]
legend(0, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
} else {
legend.txt <- date.levels[1:12]
legend.col <- color.list[1:12]
legend(0, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)

legend.txt <- date.levels[13:num.of.colors]
legend.col <- color.list[13:num.of.colors]
legend(0.5, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)

}

text(0.75, 0.1, "By Zhengdeng Lei \nleizhengdeng@hotmail.com", cex=0.8)





pdf(file="Batch_Effect_Date.pdf", width=10, height=10)
par(mfrow=c(2,2), mar=c(0,0,0,0))
scatterplot3d(PC1,PC2,PC3, main="PCA 1", color=group.colors, pch=16)
scatterplot3d(PC2,PC3,PC1, main="PCA 2", color=group.colors, pch=16)
scatterplot3d(PC3,PC2,PC1, main="PCA 3", color=group.colors, pch=16)
#plot(1, type="n", axes=F, xlab="", ylab="", mar=c(0,0,0,0))
plot.new()
if (num.of.colors < 12)
{
legend.txt <- date.levels[1:num.of.colors]
legend.col <- color.list[1:num.of.colors]
legend(0, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
} else {
legend.txt <- date.levels[1:12]
legend.col <- color.list[1:12]
legend(0, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)

legend.txt <- date.levels[13:num.of.colors]
legend.col <- color.list[13:num.of.colors]
legend(0.5, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)

}

text(0.75, 0.1, "By Zhengdeng Lei \nleizhengdeng@hotmail.com", cex=0.8)


dev.off()
print(paste("The figure is saved as ", wk.dir, "Batch_Effect_Date.pdf ", sep="\\"))

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////



###############################################3
## To check if there is batch effect in your affy gene expression data
## input: CEL files in wk.dir
## maximum 7 colors (red, yellow, green, cyn, blue, purple, black)
## by Zhengdeng Lei (research fellow at Duke-NUS)
## April 2th 2011

## install the packages if you don't have them in R
# source("http://bioconductor.org/biocLite.R")
# biocLite("affy")
# install.packages("scatterplot3d")


rm(list=ls())
wk.dir <- "E:\\CEL\\GEMINI\\GEMINI37"
setwd(wk.dir)
genes <- read.table('GEMINI37.rma.txt', header=T)


genes<-t(genes)
pcs<-prcomp(genes)
summary(pcs)





files.info <- data.frame()
files.info <- read.table("files.info.user.batch.txt", header=T, sep="\t")
rownames(files.info) <- files.info[,1]
head(files.info)


library(scatterplot3d)
PC1<-pcs$x[,1]
PC2<-pcs$x[,2]
PC3<-pcs$x[,3]

#rownames(genes)



group.colors <- rep("#000000", length(PC1))
group.colors <- files.info[rownames(genes), "COLOR"]



par(mfrow=c(2,2), mar=c(0,0,0,0))
scatterplot3d(PC1,PC2,PC3, main="PCA 1", color=group.colors, pch=16)
scatterplot3d(PC2,PC3,PC1, main="PCA 2", color=group.colors, pch=16)
scatterplot3d(PC3,PC2,PC1, main="PCA 3", color=group.colors, pch=16)



pdf(file="Batch_Effect_Xbatches.pdf", width=10, height=10)
par(mfrow=c(2,2), mar=c(0,0,0,0))
scatterplot3d(PC1,PC2,PC3, main="PCA 1", color=group.colors, pch=16)
scatterplot3d(PC2,PC3,PC1, main="PCA 2", color=group.colors, pch=16)
scatterplot3d(PC3,PC2,PC1, main="PCA 3", color=group.colors, pch=16)
dev.off()