Sunday, October 28, 2007

Affymetrix example: estrogen

library(affy)
library(estrogen)

datadir <- system.file("extdata", package="estrogen")
dir(datadir)
setwd(datadir)

pd <- read.phenoData("estrogen.txt", header=T, row.names=1)
pData(pd)

raw <- ReadAffy(filenames=rownames(pData(pd)), phenoData=pd, verbose=T)
norm <- justRMA(filenames=rownames(pData(pd)))

### CEL file image
image(raw[,1])
image(ReadAffy("bad.cel"))

### Histogram
hist(intensity(raw[,4]), breaks=100, col="blue")
hist(log2(intensity(raw[,4])), breaks=100, col="blue")

### Boxplot
boxplot(raw, col="red")
boxplot(data.frame(exprs(norm)), col="blue")

### Scatterplot
plot(exprs(raw)[,1:2], log="xy", pch=".", main="all")
plot(pm(raw)[,1:2], log="xy", pch=".", main="pm")
plot(mm(raw)[,1:2], log="xy", pch=".", main="pm")
plot(exprs(norm)[,1:2], pch=".", main="normalized")

### Heatmap
library(genefilter)
rsd <- rowSds(exprs(norm)) # rsd <- apply(exprs(norm),1,sd)
sel <- order(rsd, decreasing=T)[1:50]
heatmap(exprs(norm)[sel,], col=gentlecol(256))

### ANOVA
lm.coef <- function(y) lm(y~estrogen*time.h, data=pData(pd))$coefficients
eff <- esApply(norm, 1, lm.coef)
affyids <- colnames(eff)

lowest <- sort(eff[2,], decreasing=F)[1:3]
mget(names(lowest), hgu95av2GENENAME)
highest <- sort(eff[2,], decreasing=T)[1:3]
mget(names(highest), hgu95av2GENENAME)

### MA plot
MAplot(raw, pairs=T)

### Probeset
gn <- geneNames(raw)
ps <- probeset(raw, gn[1:2])
ps[[1]]
cbind(pm(ps[[1]])[,1],mm(ps[[1]])[,1])

SNPassoc

library(SNPassoc)
data(SNPs)

### Bonferroni.sig
datSNP <- setupSNP(SNPs, 6:40, sep="")
ans <- WGassociation(protein~1, data=datSNP, model="all")
Bonferroni.sig(ans, model="codominant", alpha=0.05, include.all.SNPs=F)

### SNPs.info.pos
data(SNPs.info.pos)
SNPs.info.pos

### Table.N.Per
Table.N.Per(SNPs$snp10001, SNPs$casco)
Table.N.Per(SNPs$snp10001, SNPs$casco, SNPs$sex=="Male")
Table.N.Per(dominant(snp(SNPs$snp10001,sep="")), SNPs$casco)

### Table.mean.se
Table.mean.se(SNPs$snp10001, SNPs$protein)
Table.mean.se(SNPs$snp10001, SNPs$protein, SNPs$sex=="Male")
Table.mean.se(dominant(snp(SNPs$snp10001,sep="")), SNPs$protein, SNPs$sex=="Male")
#tapply(SNPs$protein, SNPs$snp10001, mean)

### model = c("codominant", "dominant", "recessive", "overdominant", "log-additive", "all")
ansCoAd <- WGassociation(protein~1, data=datSNP, model=c("codominant","log-add"))
plot(ansCoAd)
summary(ansCoAd)

phenoData & exprSet

library(Biobase)

fake.data <-matrix(rnorm(8*200), ncol=8)

sample.info<-data.frame(spl=paste('A',1:8,sep=''), stat=rep(c('healthy','cancer'),c(4,4)))

pheno<-new("phenoData", pData=sample.info, varLabels=list('Sample Name', 'Cancer Status'))

my.experiments <- new("exprSet", exprs=fake.data, phenoData=pheno)

CDF data packages, probe-specific access

library(affy)
library(affydata)

data(Dilution)
annotation(Dilution)
[1] "hgu95av2"
data(hgu95av2cdf)

# get gene names from a CDF file
gnames <- ls(env=hgu95av2cdf)

# get a data with the 1st gene name
get(gnames[1], env=hgu95av2cdf)

# Read a CEL file
celegans <- make.cdf.env("celegans.cdf")
gnames <- ls(env=celegans)

Cluster Analysis of Genomic Data

library(cluster)
data(agriculture)

### pam() - Partitioning Around Medoids
part <- pam(agriculture, k=2)
plot(part, which.plots=1, labels=3, col.clus=3, lwd=2, main="PAM")

### diana() - Divisive Hierarchical Algorithm
hier <- diana(agriculture)
plot(hier, which.plots=2, lwd=2, main="DIANA")

### agnes() - Agglomerative Hierarchical Algorithm

### Visualizing clustering result
heatmap(as.matrix(t(agriculture)), Rowv=NA, labRow=c("GNP", "% in Agriculture"), cexRow=1,xlab="Country")

Marray

library(marray)

datadir <- system.file("swirldata", package="marray")
dir(datadir)
swirlTargets <- read.marrayInfo(file.path(datadir, "SwirlSample.txt"))

# To read Spot files (.spot)
mraw <- read.Spot(targets=swirlTargets, path=datadir)

# To read GenePix files (.gpr)
data <- read.GenePix(targets=swirlTargets)

galinfo <- read.Galfile("fish.gal", path=datadir)
mraw@maLayout <- galinfo$layout
mraw@maGnames <- galinfo$gnames

# Array quality assessment
library(arrayQuality)
maQualityPlots(mraw)
image(mraw[,1])
boxplot(mraw)
plot(mraw[,2])

# Normalization
normdata <- maNorm(mraw)
summary(normdata)

# Write normalized data
write.marray(normdata)

# data analysis with limma
LMres <- lmFit(normdata, design=c(1,-1,-1,1), weights=NULL)
LMres <- eBayes(LMres)
restable <- toptable(LMres, number=50, genelist=maGeneTable(normdata), resort.by="M")

# output
table2html(restable, disp="file")

Limma

library(limma)

setwd("d:/agti/BioCat/swirl")

targets <- readTargets("SwirlSample.txt")

# source <- c( "genepix", "genepix.median", "imagene", "quantarray", "smd.old", "smd", "spot", "spot.close.open")

RG <- read.maimages(targets$FileName, source="spot")

RG$genes <- readGAL("fish.gal")

spottypes <- readSpotTypes("SpotTypes.txt") # information spot types

RG$genes$Status <- controlStatus(spottypes, RG)

RG$printer <- getLayout(RG$genes)


# Normalize Within
MAw <- normalizeWithinArrays(RG) # bacground correction method = "subtract"

# method <- c("none", "subtract", "half", "minimum", "movingmin", "edwards", "normexp")
RGb <- backgroundCorrect(RG, method="subtract")
MAw <- normalizeWithinArrays(RGb) # bacground correction method = "subtract"

# Normalize Between
MAwb <- normalizeBetweenArrays(MAw)

### Unnormalized Data
plotMA(RG[,2]) # Color-Coded M A Plot (for one slide) for the unnormalized data
plotPrintTipLoess(RG[,2]) # Print-Tip Group M A Plot (for one slide)
imageplot(log2(RG$R[,1]), RG$printer, low="white", high="red") # image array plot
boxplot(RG$R~col(RG$R))

### Normalized Data
plotMA(MAw[,2])
plotPrintTipLoess(MAw)
imageplot(MAw$M[,1], RG$printer, zlim=c(-3,3))
boxplot(MAw$M~col(MAw$M), names=colnames(MAw$M))
boxplot(MAwb$M~col(MAwb$M), names=colnames(MAwb$M))

### Linear Model
design <- c(-1, 1, -1, 1)
fit <- lmFit(MAwb, design)

# ordinary.t <- fit$coef / fit$stdev.unscaled / fit$sigma
fiteb <- eBayes(fit)

options(digits=3)

### Number of genes in table
ngene = c(10,30,50,100,nrow(fit2))
mgene = ngene[menu(ngene, graphics=F, title="Number of genes in table?")]

### Sort by
sortby = c("M","A","t","P","B")
msortby = sortby[menu(sortby, graphics=F, title="Sort by?")]

### Adjust method
adjust = c("bonferroni","holm","hochberg","hommel","fdr","none")
madjust = adjust[menu(adjust, graphics=F, title="Adjust method?")]
topTable(fiteb, coef=1, number=mgene, sort.by=msortby, adjust=madjust)

# MA Plot (using fitted M values)
plotMA(fiteb)
ord <- order(fiteb$lods, decreasing=T)
top <- ord[1:mgene]
text(fiteb$Amean[top], fiteb$coef[top], labels=fiteb$genes[top,"Name"], cex=0.8, col="blue")

# Log Odds Plot
volcanoplot(fiteb, highlight=8, names=fiteb$genes$Name)

# Quantile-Quantile t statistic plot
qqt(fiteb$t, df=fiteb$df.prior+fiteb$df.residual, pch=16, cex=0.2)
abline(0,1)

Affy

### Introduction
probe: oligonucleotides of 25 base pair length used to probe RNA targets
perfect match: probes intended to match perfectly the target sequence
mismatch: the probes having one base mismatch with the target sequence intended to account for non-specific binding
probe pair: a unit composed of a perfect match and its mismatch
affyID: an identification for a probe set
probe pair set: PMs and MMs related to a common affyID
CEL file contain measured intended intensities and locations for an array that has been hybridized
CDF file contain the information relating probe pair sets to locations on the array

### Quick Start
> library(affy)
> input <- c("day_7a_amp.CEL","day_7b_amp.CEL","day_7c_amp.CEL", "day_7a_HIV_amp.CEL", "day_7b_HIV_amp.CEL","day_7c_HIV_amp.CEL")
> data <- ReadAffy(filenames=input)
> hiv <- rma(data) # the following will do the 2 previous steps
> hiv <- justRMA(filenames=input) // read and normalize, very quickly > exprs2excel(hiv.e, file="hiv.csv")
> write.exprs(hiv.e, file="hiv.txt")

### Normalization with expresso
> hiv.raw <- ReadAffy(filenames=input) > bgcorrect.methods
[1] "mas" "none" "rma" "rma2"

# "rma" and "rma2" should be used with pmcorrect.method="pmonly"
> normalize.methods(affybatch.example)
[1] "constant" "contrasts" "invariantset" "loess"
[5] "qspline" "quantiles" "quantiles.robust"

> pmcorrect.methods
[1] "mas" "pmonly" "subtractmm"

> express.summary.stat.methods
[1] "avgdiff" "liwong" "mas" "medianpolish" "playerout"

# Expression value with "medianpolish" will be in log2 scale, so it should be used with pmcorrct.method="subtractmm", which can cause negative value

# expresso for rma
hiv <- expresso(hiv.raw, bgcorrect.method="rma", normalize.method="quantiles", pmcorrect.method="pmonly", summary.method="medianpolish")

# one more example
hiv <- expresso(hiv.raw, bgcorrect.method="mas", normalize.method="qspline", pmcorrect.method="subtractmm", summary.method="playerout")

### Save expressed values as a csv file
> write.csv(exprs(hiv), "hiv.csv")

### Read a saved expressed file
> hiv <- read.csv("hiv.csv", row.names=1)

### Quality Control
> MAplot(hiv.raw, pairs=T)
> hist(hiv.raw[,1:2])

> par(mfrow=c(3,2))
> image(hiv.raw)

> par(mfrow=c(1,1))
> boxplot(hiv.raw, col=1:6) // 6 colors

> phenoData(hiv.raw)

> pData(hiv.raw)

> pm(hiv.raw)
> mm(hiv.raw)
> probeNames(hiv.raw)
> geneNames(hiv.raw) // extracts the unique affyIDs
> sampleNames(hiv.raw)
> intensity(hiv.raw)

### Location to ProbeSet Mapping
> gn <- geneNames(hiv.raw)
> ps <- probeset(hiv.raw, gn[1:2])
> show(ps) // same as just typing ps
> pm(ps[[1]])

> index <- indexProbes(hiv.raw, which="pm", genenames=gn[1])[[1]]
// get indexs for the PM probes for the affyID "AB002365_at", which=c("pm", "mm", "both")

> intensity(hiv.raw)[index,]