Zhengdeng Lei, PhD
Zhengdeng Lei, PhD
2007 - 2009 High Throughput Computational Analyst, Memorial Sloan-Kettering Cancer Center, New York
2003 - 2007 PhD, Bioinformatics, University of Illinois at Chicago
Tuesday, April 26, 2011
Median Centered
data.log10.MeanCentered <- sweep(data.log10, 2, apply(data.log10, 2, median))
Tuesday, April 19, 2011
Enrichment test in cancer subtypes
Invasive | Proliferative | Metabolic | |
Hypermethylated | 4110 | 1536 | 988 |
Hypomethylated | 625 | 1263 | 284 |
Q1: Is the Invasive subtype enriched with Hypermethylated CpGs?
q <- 4110 # number of success (white balls) drawn
m <- 4110+1536+988 # number of success (white) in the urn
n <- 625+1263+284 # number of fail (black) in the urn
k <- 4110+625 # number of balls drawn out
p.value <- phyper(q-1,m,n,k, lower.tail=F)
# why q-1, because we want to have Pr(X>=x) instead of Pr(X>x)
p.value
= 2.975069e-162
YES
Conclusion: The Invasive subtype is significantly enriched with Hypermethylated CpGs.
Q2: Is the Invasive subtype enriched with Hypomethylated CpGs?
q <- 625 # number of success (white balls) drawn
m <- 625+1263+284 # number of fail (black) in the urn
n <- 4110+1536+988 # number of success (white) in the urn
k <- 4110+625 # number of balls drawn out
p.value <- phyper(q-1,m,n,k, lower.tail=F)
p.value
=1
No
Q3: Is the Proliferative subtype enriched with Hypermethylated CpGs?
q <- 1536
m <- 4110+1536+988
n <- 625+1263+284
k <- 1263+1536
p.value <- phyper(q-1,m,n,k, lower.tail=F)
p.value
= 1
No
Q4: Is the Proliferative subtype enriched with Hypomethylated CpGs?
q <- 1263
m <- 625+1263+284
n <- 4110+1536+988
k <- 1263+1536
p.value <- phyper(q-1,m,n,k, lower.tail=F)
p.value
= 3.765696e-193
YES
Conclusion: The Proliferative subtype is significantly enriched with Hypomethylated CpGs.
Q5: Is the Metabolic subtype enriched with Hypermethylated CpGs?
q <- 988 # number of success (white balls) drawn
m <- 4110+1536+988 # number of success (white) in the urn
n <- 625+1263+284 # number of fail (black) in the urn
k <- 988+284 # number of balls drawn out
p.value <- phyper(q-1,m,n,k, lower.tail=F)
p.value
= 0.01919936
YES
Conclusion: The Metabolic subtype is significantly enriched with Hypermethylated CpGs.
Q6: Is the Metabolic subtype enriched with Hypomethylated CpGs?
q <- 284 #
m <- 625+1263+284
n <- 4110+1536+988
k <- 988+284
p.value <- phyper(q-1,m,n,k, lower.tail=F)
p.value
= 0.9839128
No
See also:
http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2724271/
An integrative genomics approach identifies Hypoxia Inducible Factor-1 (HIF-1)-target genes that form the core response to hypoxia
Friday, April 15, 2011
Median survival time
smed <- function(x) {
ox <- capture.output(print(x))
n <- length(ox)
tmp <- t(sapply(ox[4:n],
function(l) strsplit(l, split=' +')[[1]]))
nres <- strsplit(ox[3],split=' +')[[1]][2:6]
res <- matrix(as.numeric(tmp[,2:6]), ncol=5,
dimnames=list(tmp[,1], nres))
res
}
# example:
#library(survival)
#fit1 <- survfit(Surv(time, status) ~ 1, data=aml)
#sf1 <- smed(fit1)
#sf1
#fit <- survfit(Surv(time, status) ~ x, data=aml)
#sf <- smed(fit)
#sf
http://tolstoy.newcastle.edu.au/R/help/06/05/26713.html
By Xiaochun Li
ox <- capture.output(print(x))
n <- length(ox)
tmp <- t(sapply(ox[4:n],
function(l) strsplit(l, split=' +')[[1]]))
nres <- strsplit(ox[3],split=' +')[[1]][2:6]
res <- matrix(as.numeric(tmp[,2:6]), ncol=5,
dimnames=list(tmp[,1], nres))
res
}
# example:
#library(survival)
#fit1 <- survfit(Surv(time, status) ~ 1, data=aml)
#sf1 <- smed(fit1)
#sf1
#fit <- survfit(Surv(time, status) ~ x, data=aml)
#sf <- smed(fit)
#sf
http://tolstoy.newcastle.edu.au/R/help/06/05/26713.html
By Xiaochun Li
Wednesday, April 13, 2011
Change the $PATH
[leiz@136-25 ~]$ echo $PATH
/usr/lib64/qt-3.3/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/usr/NX/bin:/home/leiz/bin:/home/leiz/NGS/bowtie:/home/leiz/NGS/bowtie/indexes:/home/leiz/NGS/tophat:/home/leiz/NGS/samtools:/home/leiz/NGS/cufflinks
[leiz@136-25 ~]$ more .bash_profile
# .bash_profile
# Get the aliases and functions
if [ -f ~/.bashrc ]; then
. ~/.bashrc
fi
# User specific environment and startup programs
PATH=$PATH:$HOME/bin:/home/leiz/NGS/bowtie:/home/leiz/NGS/bowtie/indexes:/home/leiz/NGS/tophat:/home/leiz/NGS/samtools:$HOME/NGS/cufflinks
export PATH
vi
source .bash_profile
/usr/lib64/qt-3.3/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/usr/NX/bin:/home/leiz/bin:/home/leiz/NGS/bowtie:/home/leiz/NGS/bowtie/indexes:/home/leiz/NGS/tophat:/home/leiz/NGS/samtools:/home/leiz/NGS/cufflinks
[leiz@136-25 ~]$ more .bash_profile
# .bash_profile
# Get the aliases and functions
if [ -f ~/.bashrc ]; then
. ~/.bashrc
fi
# User specific environment and startup programs
PATH=$PATH:$HOME/bin:/home/leiz/NGS/bowtie:/home/leiz/NGS/bowtie/indexes:/home/leiz/NGS/tophat:/home/leiz/NGS/samtools:$HOME/NGS/cufflinks
export PATH
vi
source .bash_profile
Thursday, April 7, 2011
Tuesday, April 5, 2011
wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz >out.txt &
tar -zxvf chromFa.tar.gz
//remove random, unmapped, haplotype, chrM
rm *random*
rm *Un*
rm *hap*
rm chrM.fa
cat chr*.fa > hg19.fa
To check what chromosome inside:
grep ">" hg19.fa
//build ref genome index
cd /home/leiz/NGS/hg19
bwa index -a bwtsw /home/leiz/NGS/hg19/hg19.fa &
// use top -u leiz to check progress
//get RNA-seq raw data for AGS cell line
cd /home/leiz/NGS/GEMINI
scp NUSSTF\\gmslz@172.25.138.12:/data/public/RNA-seq_illumina_raw_data/YMM/AGS* ./
//align by bwa
cd /home/leiz/NGS/GEMINI
bwa aln ../hg19/hg19.fa AGS_1_sequence.fastq >AGS_1.sai &
//sed 's/[ \t]*$//g' samfile
// import BWA alignment
cd /home/leiz/NGS/GEMINI
//sed 's/[ \t]*$//g' AGS.pe.bwa.sam >AGS.pe.bwa.sed.sam
samtools import ../hg19/hg19.fa.fai AGS.pe.bwa.sed.sam AGS.pe.bwa.bam
//sort bam
samtools sort AGS.pe.bwa.bam AGS.pe.bwa.sort &
//remove PRC duplicates by picard, use AS=true so that picard knows the bam is sorted.
java -jar ../picard-tools-1.44/MarkDuplicates.jar I=AGS.pe.bwa.sort.bam O=AGS.pe.bwa.sort.rmdup.bam M=PCR_duplicates.M.txt REMOVE_DUPLICATES=true AS=true
//index sorted bam
samtools index AGS.pe.bwa.sort.rmdup.bam
//consensus calling
samtools pileup -cf ../hg19/hg19.fa AGS.pe.bwa.sort.rmdup.bam > AGS.pe.bwa.sort.rmdup.bam.pileup &
perl ../samtools/misc/samtools.pl varFilter AGS.variants.pileup | awk '$6>=20' > AGS.final.pileup
http://plindenbaum.blogspot.com/2010/04/bam-bwa-and-samtools-my-notebook.html
http://seqanswers.com/forums/showthread.php?t=8364
tar -zxvf chromFa.tar.gz
//remove random, unmapped, haplotype, chrM
rm *random*
rm *Un*
rm *hap*
rm chrM.fa
cat chr*.fa > hg19.fa
To check what chromosome inside:
grep ">" hg19.fa
//build ref genome index
cd /home/leiz/NGS/hg19
bwa index -a bwtsw /home/leiz/NGS/hg19/hg19.fa &
// use top -u leiz to check progress
//get RNA-seq raw data for AGS cell line
cd /home/leiz/NGS/GEMINI
scp NUSSTF\\gmslz@172.25.138.12:/data/public/RNA-seq_illumina_raw_data/YMM/AGS* ./
//align by bwa
cd /home/leiz/NGS/GEMINI
bwa aln ../hg19/hg19.fa AGS_1_sequence.fastq >AGS_1.sai &
bwa aln ../hg19/hg19.fa AGS_2_sequence.fastq >AGS_2.sai &
./bwa sampe ../hg19/hg19.fa AGS_1.sai AGS_2.sai AGS_1_sequence.fastq AGS_2_sequence.fastq > AGS.pe.bwa.sam
//index hg19
cd /home/leiz/NGS/hg19
samtools faidx hg19.fa &
//index hg19
cd /home/leiz/NGS/hg19
samtools faidx hg19.fa &
//sed 's/[ \t]*$//g' samfile
// import BWA alignment
cd /home/leiz/NGS/GEMINI
//sed 's/[ \t]*$//g' AGS.pe.bwa.sam >AGS.pe.bwa.sed.sam
//ref: http://seqanswers.com/forums/showthread.php?t=9650
//difference with samtools view -bt?
//difference with samtools view -bt?
samtools import ../hg19/hg19.fa.fai AGS.pe.bwa.sed.sam AGS.pe.bwa.bam
//sort bam
samtools sort AGS.pe.bwa.bam AGS.pe.bwa.sort &
//remove PRC duplicates by picard, use AS=true so that picard knows the bam is sorted.
java -jar ../picard-tools-1.44/MarkDuplicates.jar I=AGS.pe.bwa.sort.bam O=AGS.pe.bwa.sort.rmdup.bam M=PCR_duplicates.M.txt REMOVE_DUPLICATES=true AS=true
//index sorted bam
samtools index AGS.pe.bwa.sort.rmdup.bam
//consensus calling
samtools pileup -cf ../hg19/hg19.fa AGS.pe.bwa.sort.rmdup.bam > AGS.pe.bwa.sort.rmdup.bam.pileup &
samtools pileup -cvf ../hg19/hg19.fa AGS.pe.bwa.sort.rmdup.bam > AGS.variants.pileup &
perl ../samtools/misc/samtools.pl varFilter AGS.variants.pileup | awk '$6>=20' > AGS.final.pileup
http://sourceforge.net/apps/mediawiki/samtools/index.php?title=SAM_FAQ
I want to call SNPs and short indels.
samtools pileup -vcf ref.fa aln.bam | tee raw.txt | samtools.pl varFilter -D100 > flt.txt awk '($3=="*"&&$6>=50)||($3!="*"&&$6>=20)' flt.txt > final.txt
http://plindenbaum.blogspot.com/2010/04/bam-bwa-and-samtools-my-notebook.html
http://seqanswers.com/forums/showthread.php?t=8364
#reference is yeast.nt #query is yeast.fasta bwa index /home/jill/bwa-0.5.8c/Project/yeast.fasta bwa index /home/jill/bwa-0.5.8c/Project/yeast.nt bwa aln /home/jill/bwa-0.5.8c/Project/yeast.nt /home/jill/bwa-0.5.8c/Project/yeast.fasta > /home/jill/bwa-0.5.8c/Project/yeastbwaout.sai #[bwa_aln] 17bp reads: max_diff = 2 #[bwa_aln] 38bp reads: max_diff = 3 #[bwa_aln] 64bp reads: max_diff = 4 #[bwa_aln] 93bp reads: max_diff = 5 #[bwa_aln] 124bp reads: max_diff = 6 #[bwa_aln] 157bp reads: max_diff = 7 #[bwa_aln] 190bp reads: max_diff = 8 #[bwa_aln] 225bp reads: max_diff = 9 #[bwa_aln_core] calculate SA coordinate... 0.15 sec #[bwa_aln_core] write to the disk... 0.00 sec #[bwa_aln_core] 1 sequences have been processed. bwa samse /home/jill/bwa-0.5.8c/Project/yeast.nt /home/jill/bwa-0.5.8c/Project/yeastbwaout.sai /home/jill/bwa-0.5.8c/Project/yeast.fasta > /home/jill/bwa-0.5.8c/Project/yeastbwaout.sam #[bwa_aln_core] convert to sequence coordinate... 0.01 sec #[bwa_aln_core] refine gapped alignments... 0.00 sec #[bwa_aln_core] print alignments... 0.01 sec #[bwa_aln_core] 1 sequences have been processed. samtools faidx /home/jill/bwa-0.5.8c/Project/yeast.nt samtools view -bt /home/jill/bwa-0.5.8c/Project/yeast.nt.fai /home/jill/bwa-0.5.8c/Project/yeastbwaout.sam > /home/jill/bwa-0.5.8c/Project/yeastbwaout.bam #[samopen] SAM header is present: 17 sequences. #index bam samtools index /home/jill/bwa-0.5.8c/Project/yeastbwaout.bam #Sorts bam samtools sort /home/jill/bwa-0.5.8c/Project/yeastbwaout.bam /home/jill/bwa-0.5.8c/Project/yeastbwaoutsort samtools index /home/jill/bwa-0.5.8c/Project/yeastbwaoutsort.bam samtools pileup -vcf /home/jill/bwa-0.5.8c/Project/yeast.nt.fai /home/jill/bwa-0.5.8c/Project/yeastbwaoutsort.bam | tee /home/jill/bwa-0.5.8c/Project/raw.txt | /home/jill/samtools-0.1.11/misc/samtools.pl varFilter -D100 > /home/jill/bwa-0.5.8c/Project/flt.txt awk '($3=="*"&&$6>=50)||($3!="*"&&$6>=20)' /home/jill/bwa-0.5.8c/Project/flt.txt > /home/jill/bwa-0.5.8c/Project/final.txt
Monday, April 4, 2011
Batch effect
Several methods have been proposed that can adjust for batch effects provided a large number of samples (> 25) are included in each batch[1,2] .
ComBat: an empirical Bayes method has been described [3] that adjusts for batch effects even when the number of samples in each batch is small (< 10)
The aforementioned methods can adjust for batch effects provided that samples from "each biological group" are represented in every batch.
"each biological group" means that in each batch, each biological phenotype must have at least one sample in each batch, for example, each batch must have both disease and control, or each batch must have cancer subtypes “invasive”, “proliferative”, and “metabolic”
Suppose "metabolic" subtype consist of 1/3 in all cancer patients.
e.g. 1 million in 3 million cancer patients.
Then for each batch, in general, you have at least 6 tumors, thus you have 0.91 probability to have at least one metabolic sample in it.
use lib "./";
use Stat;
my $p = Stat::phyper_enrichment(3E6, 1E6, 6, 1);
print $p;
------
6 -> 91%
7 -> 94%
8 -> 96%
Reference:
e.g. 1 million in 3 million cancer patients.
Then for each batch, in general, you have at least 6 tumors, thus you have 0.91 probability to have at least one metabolic sample in it.
use lib "./";
use Stat;
my $p = Stat::phyper_enrichment(3E6, 1E6, 6, 1);
print $p;
------
6 -> 91%
7 -> 94%
8 -> 96%
Reference:
Empirical Bayes accomodation of batch-effects in microarray data using identical replicate reference samples: application to RNA expression profiling of blood from Duchenne muscular dystrophy patients
[1] Alter O, Brown PO, Botstein D. Singular value decomposition for genome-wide expression data processing and modeling. Proc Natl Acad Sci USA. 2000;97:10101–10106. doi: 10.1073/pnas.97.18.10101. [PMC free article] [PubMed] [Cross Ref]
Saturday, April 2, 2011
Set up NGS aligner
1. MAQ
cd /home/leiz/NGS/downloads
wget http://sourceforge.net/projects/maq/files/maq/0.7.1/maq-0.7.1.tar.bz2/download
tar -jxvf maq-0.7.1.tar.bz2
cd maq-0.7.1
./configure --prefix=/home/leiz/NGS/maq-0.7.1
make
make install
2. BWA
source: http://bio-bwa.sourceforge.net/
cd /home/leiz/NGS
wget http://sourceforge.net/projects/bio-bwa/files/bwa-0.5.9.tar.bz2/download
tar -jxvf bwa-0.5.9.tar.bz2
cd bwa-0.5.9
make
3. bowtie
source: http://bowtie-bio.sourceforge.net/index.shtml
cd /home/leiz/NGS
wget http://sourceforge.net/projects/bowtie-bio/files/bowtie/0.12.7/bowtie-0.12.7-linux-x86_64.zip/download
unzip bowtie-0.12.7-linux-x86_64.zip
bowtie-0.12.7
./bowtie
#copy hg19.fa to bowtie index folder
cp ./hg19.fa ../bowtie-0.12.7/indexes/
nohup bowtie-build ./hg19.fa ../bowtie-0.12.7/indexes/hg19 >screen.txt &
bowtie-inspect --summary hg19
bowtie -c hg19 ATGACGGCGGCCGAGAAGTTGGATTTCTTGGT
nohup tophat -r 200 --num-threads=4 hg19 AGS_1_sequence.fastq AGS_2_sequence.fastq >screen.txt &
cd /home/leiz/NGS/downloads
wget http://sourceforge.net/projects/samtools/files/samtools/0.1.16/samtools-0.1.16.tar.bz2/download
tar -jxvf samtools-0.1.16.tar.bz2
cd samtools-0.1.16
mv samtools-0.1.16 ..
make
cd /home/leiz/NGS/downloads
wget http://sourceforge.net/projects/picard/files/picard-tools/1.44/picard-tools-1.44.zip/download
unzip picard-tools-1.44.zipmv picard-tools-1.44 ..
6. cufflinks
http://sourceforge.net/projects/boost/files/boost/1.46.1/boost_1_46_1.zip/download
wget http://cufflinks.cbcb.umd.edu/downloads/cufflinks-0.9.3.Linux_x86_64.tar.gz
http://cufflinks.cbcb.umd.edu/downloads/test_data.sam
cufflinks ./test_data.sam
#copy hg19.fa to bowtie index folder
cp ./hg19.fa ../bowtie-0.12.7/indexes/
nohup bowtie-build ./hg19.fa ../bowtie-0.12.7/indexes/hg19 >screen.txt &
cd ../bowtie-0.12.7/indexes/hg19
bowtie-inspect --summary hg19
vi $HOME/.bashrc
# User specific aliases and functions
export BOWTIE_INDEXES=/home/leiz/NGS/bowtie-0.12.7/indexes/
source $HOME/.bashrc
nohup tophat -r 200 --num-threads=4 hg19 AGS_1_sequence.fastq AGS_2_sequence.fastq >screen.txt &
4. samtools
cd /home/leiz/NGS/downloads
wget http://sourceforge.net/projects/samtools/files/samtools/0.1.16/samtools-0.1.16.tar.bz2/download
tar -jxvf samtools-0.1.16.tar.bz2
cd samtools-0.1.16
mv samtools-0.1.16 ..
make
5. picard
java 1.6 need: http://www.java.com/en/download/manual.jsp
cd /home/leiz/java
wget http://javadl.sun.com/webapps/download/AutoDL?BundleId=48338 -O jre-6u25-linux-x64.bin
add java path /home/leiz/java/jre1.6.0_25/bin to $PATH in .bash_profile
source .bash_profile
java -version
java 1.6 need: http://www.java.com/en/download/manual.jsp
cd /home/leiz/java
wget http://javadl.sun.com/webapps/download/AutoDL?BundleId=48338 -O jre-6u25-linux-x64.bin
chmod a+x jre-6u25-linux-x64.bin
./
jre-6u25-linux-x64.bin
add java path /home/leiz/java/jre1.6.0_25/bin to $PATH in .bash_profile
source .bash_profile
java -version
cd /home/leiz/NGS/downloads
wget http://sourceforge.net/projects/picard/files/picard-tools/1.44/picard-tools-1.44.zip/download
unzip picard-tools-1.44.zipmv picard-tools-1.44 ..
http://sourceforge.net/projects/boost/files/boost/1.46.1/boost_1_46_1.zip/download
wget http://cufflinks.cbcb.umd.edu/downloads/cufflinks-0.9.3.Linux_x86_64.tar.gz
http://cufflinks.cbcb.umd.edu/downloads/test_data.sam
cufflinks ./test_data.sam
Check batch effect in your affy gene expression data
###############################################3
## To check if there is batch effect in your affy gene expression data
## input: CEL files in wk.dir
## maximum 12 colors (12 different days)
## by Zhengdeng Lei (research fellow at Duke-NUS)
## April 2th 2011
## install the packages if you don't have them in R
# source("http://bioconductor.org/biocLite.R")
# biocLite("affy")
# install.packages("scatterplot3d")
rm(list=ls())
wk.dir <- "E:\\CEL\\GEMINI\\GEMINI37"
setwd(wk.dir)
read.from.txt = 1
genes <- read.table('GEMINI37.rma.txt', header=T)
# read.from.txt = 0
# library(affy)
# data <- ReadAffy()
# genes <- as.matrix(exprs(rma(data)))
# ## Or you can try mas5
# #genes <- log2(as.matrix(exprs(mas5(data))))
genes<-t(genes)
pcs<-prcomp(genes)
summary(pcs)
library(scatterplot3d)
PC1<-pcs$x[,1]
PC2<-pcs$x[,2]
PC3<-pcs$x[,3]
files.info <- data.frame()
cel.files <- list.files(wk.dir, pattern = ".CEL", ignore.case = T)
for (i in 1:length(cel.files))
{
cel.file <- paste(wk.dir, cel.files[i], sep="\\")
cel.header <- readLines(cel.file, 20)
datheader.line <- 0
for (line in 1:length(cel.header))
{
if ( substr(cel.header[line], 1, 9)=="DatHeader" )
{
datheader.line <- line
break
}
}
datheader.line
header.fields <- strsplit(cel.header[datheader.line], " ")
cel.filename <- strsplit(header.fields[[1]][3], ":")[[1]][1]
exp.date <- header.fields[[1]][18]
exp.time <- header.fields[[1]][19]
if (read.from.txt) {
cel.files[i] <- gsub("-", ".", cel.files[i])
cel.files[i] <- gsub("\\+", ".", cel.files[i])
cel.files[i] <- gsub("\\(", ".", cel.files[i])
cel.files[i] <- gsub("\\)", ".", cel.files[i])
if(length(grep("^[0-9]", cel.files[i]))) cel.files[i]<- paste("X", cel.files[i], sep="")
}
files.info[cel.files[i],1] <- cel.filename
files.info[cel.files[i],2] <- exp.date
files.info[cel.files[i],3] <- exp.time
files.info[cel.files[i],4] <- "#000000"
}
colnames(files.info) <- c("SAMPLE_ID", "EXP_DATE", "EXP_TIME", "COLOR")
files.info
#files.info[with(files.info, order(EXP_DATE, EXP_TIME)), ]
#files.info[order(chron(names(files.info)))]
date.factor <- as.factor(files.info$EXP_DATE)
date.levels <- levels(date.factor)
nlevels(date.factor)
###########################
# Generate a vector of colors that are as different from one another as possible
barvy<-rainbow(12)
vyber<-c(1,7,3,9,5,11,2,8,4,10,6,12)
color.list <- c("red", "yellow", "green", "cyan", "blue", "purple")
num.of.colors <- nlevels(date.factor)
for (c in 1:num.of.colors)
{
files.info$COLOR[which(files.info$EXP_DATE==date.levels[c])] <- color.list[c]
num.in.batch <- length(which(files.info$EXP_DATE==date.levels[c]))
date.levels[c] <- paste(date.levels[c], num.in.batch, sep=" n=")
}
files.info
#rownames(genes)
write.table(files.info, file="files.info.user.batch.txt",sep="\t")
group.colors <- rep("#000000", length(PC1))
group.colors <- files.info[rownames(genes), "COLOR"]
par(mfrow=c(2,2), mar=c(0,0,0,0))
scatterplot3d(PC1,PC2,PC3, main="PCA 1", color=group.colors, pch=16)
scatterplot3d(PC2,PC3,PC1, main="PCA 2", color=group.colors, pch=16)
scatterplot3d(PC3,PC2,PC1, main="PCA 3", color=group.colors, pch=16)
#plot(1, type="n", axes=F, xlab="", ylab="", mar=c(0,0,0,0))
plot.new()
if (num.of.colors < 6)
{
legend.txt <- date.levels[1:num.of.colors]
legend.col <- color.list[1:num.of.colors]
legend(0, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
} else {
legend.txt <- date.levels[1:12]
legend.col <- color.list[1:12]
legend(0, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
legend.txt <- date.levels[13:num.of.colors]
legend.col <- color.list[13:num.of.colors]
legend(0.5, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
}
text(0.75, 0.1, "By Zhengdeng Lei \nleizhengdeng@hotmail.com", cex=0.8)
pdf(file="Batch_Effect_Date.pdf", width=10, height=10)
par(mfrow=c(2,2), mar=c(0,0,0,0))
scatterplot3d(PC1,PC2,PC3, main="PCA 1", color=group.colors, pch=16)
scatterplot3d(PC2,PC3,PC1, main="PCA 2", color=group.colors, pch=16)
scatterplot3d(PC3,PC2,PC1, main="PCA 3", color=group.colors, pch=16)
#plot(1, type="n", axes=F, xlab="", ylab="", mar=c(0,0,0,0))
plot.new()
if (num.of.colors < 12)
{
legend.txt <- date.levels[1:num.of.colors]
legend.col <- color.list[1:num.of.colors]
legend(0, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
} else {
legend.txt <- date.levels[1:12]
legend.col <- color.list[1:12]
legend(0, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
legend.txt <- date.levels[13:num.of.colors]
legend.col <- color.list[13:num.of.colors]
legend(0.5, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
}
text(0.75, 0.1, "By Zhengdeng Lei \nleizhengdeng@hotmail.com", cex=0.8)
dev.off()
print(paste("The figure is saved as ", wk.dir, "Batch_Effect_Date.pdf ", sep="\\"))
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
###############################################3
## To check if there is batch effect in your affy gene expression data
## input: CEL files in wk.dir
## maximum 7 colors (red, yellow, green, cyn, blue, purple, black)
## by Zhengdeng Lei (research fellow at Duke-NUS)
## April 2th 2011
## install the packages if you don't have them in R
# source("http://bioconductor.org/biocLite.R")
# biocLite("affy")
# install.packages("scatterplot3d")
rm(list=ls())
wk.dir <- "E:\\CEL\\GEMINI\\GEMINI37"
setwd(wk.dir)
genes <- read.table('GEMINI37.rma.txt', header=T)
genes<-t(genes)
pcs<-prcomp(genes)
summary(pcs)
files.info <- data.frame()
files.info <- read.table("files.info.user.batch.txt", header=T, sep="\t")
rownames(files.info) <- files.info[,1]
head(files.info)
library(scatterplot3d)
PC1<-pcs$x[,1]
PC2<-pcs$x[,2]
PC3<-pcs$x[,3]
#rownames(genes)
group.colors <- rep("#000000", length(PC1))
group.colors <- files.info[rownames(genes), "COLOR"]
par(mfrow=c(2,2), mar=c(0,0,0,0))
scatterplot3d(PC1,PC2,PC3, main="PCA 1", color=group.colors, pch=16)
scatterplot3d(PC2,PC3,PC1, main="PCA 2", color=group.colors, pch=16)
scatterplot3d(PC3,PC2,PC1, main="PCA 3", color=group.colors, pch=16)
pdf(file="Batch_Effect_Xbatches.pdf", width=10, height=10)
par(mfrow=c(2,2), mar=c(0,0,0,0))
scatterplot3d(PC1,PC2,PC3, main="PCA 1", color=group.colors, pch=16)
scatterplot3d(PC2,PC3,PC1, main="PCA 2", color=group.colors, pch=16)
scatterplot3d(PC3,PC2,PC1, main="PCA 3", color=group.colors, pch=16)
dev.off()
## To check if there is batch effect in your affy gene expression data
## input: CEL files in wk.dir
## maximum 12 colors (12 different days)
## by Zhengdeng Lei (research fellow at Duke-NUS)
## April 2th 2011
## install the packages if you don't have them in R
# source("http://bioconductor.org/biocLite.R")
# biocLite("affy")
# install.packages("scatterplot3d")
rm(list=ls())
wk.dir <- "E:\\CEL\\GEMINI\\GEMINI37"
setwd(wk.dir)
read.from.txt = 1
genes <- read.table('GEMINI37.rma.txt', header=T)
# read.from.txt = 0
# library(affy)
# data <- ReadAffy()
# genes <- as.matrix(exprs(rma(data)))
# ## Or you can try mas5
# #genes <- log2(as.matrix(exprs(mas5(data))))
genes<-t(genes)
pcs<-prcomp(genes)
summary(pcs)
library(scatterplot3d)
PC1<-pcs$x[,1]
PC2<-pcs$x[,2]
PC3<-pcs$x[,3]
files.info <- data.frame()
cel.files <- list.files(wk.dir, pattern = ".CEL", ignore.case = T)
for (i in 1:length(cel.files))
{
cel.file <- paste(wk.dir, cel.files[i], sep="\\")
cel.header <- readLines(cel.file, 20)
datheader.line <- 0
for (line in 1:length(cel.header))
{
if ( substr(cel.header[line], 1, 9)=="DatHeader" )
{
datheader.line <- line
break
}
}
datheader.line
header.fields <- strsplit(cel.header[datheader.line], " ")
cel.filename <- strsplit(header.fields[[1]][3], ":")[[1]][1]
exp.date <- header.fields[[1]][18]
exp.time <- header.fields[[1]][19]
if (read.from.txt) {
cel.files[i] <- gsub("-", ".", cel.files[i])
cel.files[i] <- gsub("\\+", ".", cel.files[i])
cel.files[i] <- gsub("\\(", ".", cel.files[i])
cel.files[i] <- gsub("\\)", ".", cel.files[i])
if(length(grep("^[0-9]", cel.files[i]))) cel.files[i]<- paste("X", cel.files[i], sep="")
}
files.info[cel.files[i],1] <- cel.filename
files.info[cel.files[i],2] <- exp.date
files.info[cel.files[i],3] <- exp.time
files.info[cel.files[i],4] <- "#000000"
}
colnames(files.info) <- c("SAMPLE_ID", "EXP_DATE", "EXP_TIME", "COLOR")
files.info
#files.info[with(files.info, order(EXP_DATE, EXP_TIME)), ]
#files.info[order(chron(names(files.info)))]
date.factor <- as.factor(files.info$EXP_DATE)
date.levels <- levels(date.factor)
nlevels(date.factor)
###########################
# Generate a vector of colors that are as different from one another as possible
barvy<-rainbow(12)
vyber<-c(1,7,3,9,5,11,2,8,4,10,6,12)
color.list <- c("red", "yellow", "green", "cyan", "blue", "purple")
num.of.colors <- nlevels(date.factor)
for (c in 1:num.of.colors)
{
files.info$COLOR[which(files.info$EXP_DATE==date.levels[c])] <- color.list[c]
num.in.batch <- length(which(files.info$EXP_DATE==date.levels[c]))
date.levels[c] <- paste(date.levels[c], num.in.batch, sep=" n=")
}
files.info
#rownames(genes)
write.table(files.info, file="files.info.user.batch.txt",sep="\t")
group.colors <- rep("#000000", length(PC1))
group.colors <- files.info[rownames(genes), "COLOR"]
par(mfrow=c(2,2), mar=c(0,0,0,0))
scatterplot3d(PC1,PC2,PC3, main="PCA 1", color=group.colors, pch=16)
scatterplot3d(PC2,PC3,PC1, main="PCA 2", color=group.colors, pch=16)
scatterplot3d(PC3,PC2,PC1, main="PCA 3", color=group.colors, pch=16)
#plot(1, type="n", axes=F, xlab="", ylab="", mar=c(0,0,0,0))
plot.new()
if (num.of.colors < 6)
{
legend.txt <- date.levels[1:num.of.colors]
legend.col <- color.list[1:num.of.colors]
legend(0, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
} else {
legend.txt <- date.levels[1:12]
legend.col <- color.list[1:12]
legend(0, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
legend.txt <- date.levels[13:num.of.colors]
legend.col <- color.list[13:num.of.colors]
legend(0.5, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
}
text(0.75, 0.1, "By Zhengdeng Lei \nleizhengdeng@hotmail.com", cex=0.8)
pdf(file="Batch_Effect_Date.pdf", width=10, height=10)
par(mfrow=c(2,2), mar=c(0,0,0,0))
scatterplot3d(PC1,PC2,PC3, main="PCA 1", color=group.colors, pch=16)
scatterplot3d(PC2,PC3,PC1, main="PCA 2", color=group.colors, pch=16)
scatterplot3d(PC3,PC2,PC1, main="PCA 3", color=group.colors, pch=16)
#plot(1, type="n", axes=F, xlab="", ylab="", mar=c(0,0,0,0))
plot.new()
if (num.of.colors < 12)
{
legend.txt <- date.levels[1:num.of.colors]
legend.col <- color.list[1:num.of.colors]
legend(0, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
} else {
legend.txt <- date.levels[1:12]
legend.col <- color.list[1:12]
legend(0, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
legend.txt <- date.levels[13:num.of.colors]
legend.col <- color.list[13:num.of.colors]
legend(0.5, 1, legend.txt , bty="n", col=legend.col,cex=0.8, pch=16)
}
text(0.75, 0.1, "By Zhengdeng Lei \nleizhengdeng@hotmail.com", cex=0.8)
dev.off()
print(paste("The figure is saved as ", wk.dir, "Batch_Effect_Date.pdf ", sep="\\"))
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
###############################################3
## To check if there is batch effect in your affy gene expression data
## input: CEL files in wk.dir
## maximum 7 colors (red, yellow, green, cyn, blue, purple, black)
## by Zhengdeng Lei (research fellow at Duke-NUS)
## April 2th 2011
## install the packages if you don't have them in R
# source("http://bioconductor.org/biocLite.R")
# biocLite("affy")
# install.packages("scatterplot3d")
rm(list=ls())
wk.dir <- "E:\\CEL\\GEMINI\\GEMINI37"
setwd(wk.dir)
genes <- read.table('GEMINI37.rma.txt', header=T)
genes<-t(genes)
pcs<-prcomp(genes)
summary(pcs)
files.info <- data.frame()
files.info <- read.table("files.info.user.batch.txt", header=T, sep="\t")
rownames(files.info) <- files.info[,1]
head(files.info)
library(scatterplot3d)
PC1<-pcs$x[,1]
PC2<-pcs$x[,2]
PC3<-pcs$x[,3]
#rownames(genes)
group.colors <- rep("#000000", length(PC1))
group.colors <- files.info[rownames(genes), "COLOR"]
par(mfrow=c(2,2), mar=c(0,0,0,0))
scatterplot3d(PC1,PC2,PC3, main="PCA 1", color=group.colors, pch=16)
scatterplot3d(PC2,PC3,PC1, main="PCA 2", color=group.colors, pch=16)
scatterplot3d(PC3,PC2,PC1, main="PCA 3", color=group.colors, pch=16)
pdf(file="Batch_Effect_Xbatches.pdf", width=10, height=10)
par(mfrow=c(2,2), mar=c(0,0,0,0))
scatterplot3d(PC1,PC2,PC3, main="PCA 1", color=group.colors, pch=16)
scatterplot3d(PC2,PC3,PC1, main="PCA 2", color=group.colors, pch=16)
scatterplot3d(PC3,PC2,PC1, main="PCA 3", color=group.colors, pch=16)
dev.off()
Friday, April 1, 2011
Subscribe to:
Posts (Atom)