# This file includes scripts for differential expression analysis and
# clustering using R packages applied to microarray and RNA-seq data.
# It is distributed with the paper:
#   "RNA-Seq vs dual- and single-channel microarray data: sensitivity analysis
#   for differential expression and clustering"
#   Alina Sirbu, Grainne Kerr, Martin Crane, Heather J. Ruskin
#
# NOTE(review): the sections below look like they are meant to be run one at a
# time from the appropriate working directory (the first section calls
# setwd() and later sections read files by relative path) - confirm before
# sourcing the file top to bottom.

###################################################################
# Shared helpers
###################################################################

# Collapse the rows of a numeric matrix to one row per gene.
#   values  - numeric matrix, one row per probe/spot, one column per array.
#   geneIds - vector of gene identifiers, parallel to the rows of `values`.
# Returns a matrix with one row per distinct gene (in order of first
# appearance, row names = gene ids) holding the column means of that gene's
# rows. A gene with a single row keeps that row's values.
.averageRowsByGene <- function(values, geneIds) {
  geneIds <- as.character(geneIds)
  # levels = unique(...) keeps first-appearance order instead of sorting
  rowsByGene <- split(seq_along(geneIds),
                      factor(geneIds, levels = unique(geneIds)))
  do.call(rbind, lapply(rowsByGene, function(rows) {
    colMeans(values[rows, , drop = FALSE])
  }))
}

# Read an expression table and return the scaled log2 matrix used for
# clustering, together with the gene ids of its rows.
#   file - path of the table; first column holds gene ids.
#   ...  - forwarded to read.table() (e.g. header = TRUE).
# Returns list(genes = <ids, one per matrix row>, expr = <scaled log2 matrix>).
.prepareLogExpression <- function(file, ...) {
  raw <- as.matrix(read.table(file, ...))
  # The first row is always discarded (as in the original code). Gene ids are
  # taken AFTER that drop so that genes[j] describes row j of the matrix; the
  # original took them before and ended up one entry longer and shifted.
  raw <- raw[-1, ]
  geneIds <- raw[, 1]
  expr <- raw[, -1]
  expr <- apply(expr, 2, as.numeric)
  # Replace zeros by the current mean of their row so that log2 is finite.
  # Deliberately left as a sequential loop: each replacement changes the row
  # mean seen by later columns, so a vectorised version would give different
  # numbers.
  for (i in seq_len(ncol(expr))) {
    for (j in seq_len(nrow(expr))) {
      if (expr[j, i] == 0) {
        expr[j, i] <- mean(expr[j, ])
      }
    }
  }
  expr <- apply(expr, 2, log2)
  expr <- scale(expr)
  list(genes = geneIds, expr = expr)
}

# Build the lookup vector from org.Dm.egFLYBASE that is indexed by FlyBase
# gene id in the BHI analyses. Requires org.Dm.eg.db to be attached.
# The statements are kept exactly as in the original analysis.
.flybaseToEntrez <- function() {
  Annot <- as.list(org.Dm.egFLYBASE)
  Annot <- as.matrix(Annot)
  Annot <- cbind(Annot, rownames(Annot))
  rownames(Annot) <- Annot[, 1]
  Annot <- Annot[, -1]
  unlist(Annot)
}

###################################################################
###################################################################
####################### DIFFERENTIAL EXPRESSION ANALYSIS

################## Affymetrix dataset
library(affy)
setwd('GSE5514_RAW') # path for raw data
data <- ReadAffy()
eset <- rma(data)
affy <- exprs(eset)

# file matching flybase annotations to affymetrix ids
affyAnnot <- read.table('affyAnnot.txt', header = TRUE)
affyAnnot <- as.matrix(affyAnnot)
affyAnnot <- na.omit(affyAnnot)
genes <- unique(affyAnnot[, 1])
# column 1: FlyBase id, column 2: probe-set id (taken from the row names).
# Assumes the annotation table has a single id column, as the original
# length(g) == 2 test did - TODO confirm against affyAnnot.txt.
affyAnnot <- cbind(affyAnnot, rownames(affyAnnot))

# one expression row per FlyBase gene: mean over all probe sets of the gene
FBaffy <- .averageRowsByGene(affy[affyAnnot[, 2], , drop = FALSE],
                             affyAnnot[, 1])

library(limma)
# factor levels sort numerically (1..12), matching the column names below
design <- model.matrix(~0 + factor(c(1,10,11,12,2,3,4,5,6,7,8,9,
                                     1,10,11,12,2,3,4,5,6,7,8,9,
                                     1,10,11,12,2,3,4,5,6,7,8,9)))
colnames(design) <- c('wt1','wt2','wt3','wt4','wt5','wt6',
                      'wt7','wt8','wt9','wt10','wt11','wt12')
scfit <- lmFit(FBaffy, design)
scfitt <- ebayes(scfit) # kept from the original; not used below
contrasts <- makeContrasts(wt1-wt2, wt1-wt3, wt1-wt4, wt1-wt5, wt1-wt6,
                           wt1-wt7, wt1-wt8, wt1-wt9, wt1-wt10,
                           levels = design)
fit1 <- contrasts.fit(scfit, contrasts)
fit2 <- eBayes(fit1)
# number = Inf lists every gene (the original hard-coded 20000)
de1 <- topTable(fit2, adjust.method = 'BH', number = Inf, coef = 3)
de2 <- topTable(fit2, adjust.method = 'BH', number = Inf, coef = 5)
de3 <- topTable(fit2, adjust.method = 'BH', number = Inf, coef = 7)
de4 <- topTable(fit2, adjust.method = 'BH', number = Inf, coef = 9)
# each table is sorted differently, so align all of them on de1's row order
deAll <- cbind(de1, de2[rownames(de1), ], de3[rownames(de1), ],
               de4[rownames(de1), ])
# this file now contains adjusted p-values for all genes in the dataset -
# can be used now for selecting genes of interest, comparing to other
# datasets and performing clustering
write.table(deAll, 'affyData.txt')

##################### dual channel dataset
library(limma)
# the directory has to have Targets.txt and raw data in directory 'raw'.
# See limma manual for further details
targets <- readTargets()
RG <- read.maimages(targets$FileName, source = "genepix", path = "raw")
RGb <- backgroundCorrect(RG, method = "normexp", offset = 50)
# Normalise the background-corrected intensities. The original passed RG
# here, which silently discarded the normexp correction computed above.
MA <- normalizeWithinArrays(RGb, method = "loess")
MA.q <- normalizeBetweenArrays(MA, method = "Aquantile")

# find flybase ids and average the log-ratios of spots sharing an id
genes <- unique(MA.q$genes[, 'ID'])
FB <- .averageRowsByGene(MA.q$M, MA.q$genes[, 'ID'])
write.table(FB, "FBExprs.txt")

design <- modelMatrix(targets, ref = "Ref")
fit <- lmFit(FB, design)
fitt <- ebayes(fit) # kept from the original; not used below
contrasts <- makeContrasts(wt2h-wt3h, wt2h-wt6h, wt2h-wt7h, wt2h-wt8h,
                           wt2h-wt9h, wt2h-wt10h, levels = design)
fit1 <- contrasts.fit(fit, contrasts)
fit2 <- eBayes(fit1)
de1 <- topTable(fit2, adjust.method = 'BH', number = Inf, coef = 1)
de2 <- topTable(fit2, adjust.method = 'BH', number = Inf, coef = 2)
de3 <- topTable(fit2, adjust.method = 'BH', number = Inf, coef = 4)
de4 <- topTable(fit2, adjust.method = 'BH', number = Inf, coef = 6)
deAll <- cbind(de1, de2[rownames(de1), ], de3[rownames(de1), ],
               de4[rownames(de1), ])
# this file now contains adjusted p-values for all genes in the dataset -
# can be used now for selecting genes of interest, comparing to other
# datasets and performing clustering
write.table(deAll, 'dualChannelData.txt')

############## RNA-seq data
library("DESeq")
# condition label of each of the 43 count columns
cond <- c(1,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,5,6,6,6,7,7,7,7,8,8,8,8,
          9,9,9,9,10,10,10,10,10,11,11,11,12,12,12)
# file with counts data
counts <- read.delim("EmbryoDevel_0_24HrOver0T.counts", header = TRUE,
                     stringsAsFactors = TRUE)
rownames(counts) <- counts$GeneId
counts <- counts[, -1]
counts <- counts[, 1:43]
cds <- newCountDataSet(counts, cond)
cds <- estimateSizeFactors(cds)
sizeFactors(cds)
cds <- estimateVarianceFunctions(cds)
# test condition 1 against conditions 2..5, one result table per comparison
res <- lapply(2:5, function(i) nbinomTest(cds, 1, i))
# this file now contains fold changes p-values and adjusted p-values for all
# genes in the dataset - can be used for selecting genes of interest,
# comparing to other datasets and performing clustering
write.table(res, 'ngsData.txt')

####################################################################
########## CLUSTERING
####################################################################

############## K-Means with euclidean distance on NGS dataset
library(flexclust)

# Cluster the scaled log2 expression values in 'data.txt' with K-Means for a
# range of cluster numbers and save the gene ids and the clusterings to
# 'savedClusters4000KmeansNGSLog' (objects genesNGSLog, ngsLogEClust).
euclid <- function() {
  prepared <- .prepareLogExpression('data.txt') # RPKM/expression values
  genesNGSLog <- prepared$genes
  ngs <- prepared$expr
  # apply K-Means with various cluster number
  cls <- c(seq(5, 20, by = 1), seq(25, 200, by = 5))
  ngsLogEClust <- stepFlexclust(ngs, cls, nrep = 2, save.data = TRUE,
                                drop = FALSE, verbose = TRUE)
  save(genesNGSLog, ngsLogEClust, file = "savedClusters4000KmeansNGSLog")
}
euclid()

################ Analyse K-Means clusters (NGS example) - uses clusters saved
################ in file above (savedClusters4000KmeansNGSLog) - computes BHI

# Compute the BHI of every cluster of every saved K-Means clustering for the
# GO categories BP, MF and CC, and save the three result lists to 'KMbhiNGS'.
# Requires flexclust and org.Dm.eg.db to be attached.
clusterEBHI <- function() {
  # provides genesNGSLog and ngsLogEClust (the names euclid() saved; the
  # original read the undefined genesNGS / ngsEClust)
  load('savedClusters4000KmeansNGSLog')
  ccngsbhi <- list()
  mfngsbhi <- list()
  bpngsbhi <- list()
  # position g of Annot now describes gene g of the clustered matrix
  Annot <- .flybaseToEntrez()
  Annot <- Annot[as.character(genesNGSLog)]
  for (i in seq_along(ngsLogEClust@k)) {
    clusts <- clusters(ngsLogEClust[[i]])
    # name every assignment with its row position so it can be mapped back
    names(clusts) <- seq_along(clusts)
    # clValid is attached only around the BHI calls and detached afterwards,
    # presumably so it does not mask flexclust's clusters() - TODO confirm
    library(clValid)
    bpbhi_ <- c()
    mfbhi_ <- c()
    ccbhi_ <- c()
    for (j in seq_len(ngsLogEClust@k[i])) {
      clust <- clusts[clusts == j]
      GoNames <- Annot[as.numeric(names(clust))]
      GoNames[is.na(GoNames)] <- 0 # genes without an annotation entry
      bhi <- BHI(clust, names = GoNames, annotation = 'org.Dm.eg.db',
                 category = 'BP')
      bpbhi_ <- c(bpbhi_, bhi)
      bhi <- BHI(clust, names = GoNames, annotation = 'org.Dm.eg.db',
                 category = 'MF')
      mfbhi_ <- c(mfbhi_, bhi)
      bhi <- BHI(clust, names = GoNames, annotation = 'org.Dm.eg.db',
                 category = 'CC')
      ccbhi_ <- c(ccbhi_, bhi)
    }
    bpngsbhi[[i]] <- bpbhi_
    ccngsbhi[[i]] <- ccbhi_
    mfngsbhi[[i]] <- mfbhi_
    detach(package:clValid)
  }
  save(ccngsbhi, mfngsbhi, bpngsbhi, file = 'KMbhiNGS')
}
library(flexclust)
library(org.Dm.eg.db)
clusterEBHI()

####################### compute Davies-Bouldin index for K-Means clusters
library(clusterSim)
# euclid() keeps its data local, so restore what this section needs: the
# saved clusterings and the matrix they were computed on.
load('savedClusters4000KmeansNGSLog')
ngs <- .prepareLogExpression('data.txt')$expr
ngsDB <- c()
ngsSize <- c()
for (i in seq_along(ngsLogEClust@k)) {
  DBi <- index.DB(ngs, clusters(ngsLogEClust[[i]]))
  ngsDB <- c(ngsDB, DBi$DB)
  ngsSize[[i]] <- ngsLogEClust[[i]]@clusinfo$size
}

######################## compute ARI index

# Compare the K-Means partitions of the three platforms pairwise with
# comparing.Partitions(type = 'crand') and save the three lists to
# 'adjustedRandKMMatrixLog'. Entry [[i]] of each list starts with a 0
# followed by one value per cluster number j (kept from the original layout).
# Defined here but not called by this script.
partitionsMatrix <- function() {
  load('savedClusters4000KmeansNGSLog') # ngs clusters
  load('savedClusters4000KmeansDC')     # dual channel clusters
  load('savedClusters4000KmeansSC')     # single channel clusters
  scvsdc <- list()
  scvsngs <- list()
  ngsvsdc <- list()
  for (i in seq_along(dcEClust@k)) {
    scvsdc[[i]] <- c(0)
    scvsngs[[i]] <- c(0)
    ngsvsdc[[i]] <- c(0)
    sc <- clusters(scEClust[[i]])
    ngsI <- clusters(ngsLogEClust[[i]])
    for (j in seq_along(dcEClust@k)) {
      ngsJ <- clusters(ngsLogEClust[[j]])
      dc <- clusters(dcEClust[[j]])
      scvsdc[[i]] <- c(scvsdc[[i]],
                       comparing.Partitions(sc, dc, type = 'crand'))
      scvsngs[[i]] <- c(scvsngs[[i]],
                        comparing.Partitions(sc, ngsJ, type = 'crand'))
      ngsvsdc[[i]] <- c(ngsvsdc[[i]],
                        comparing.Partitions(ngsI, dc, type = 'crand'))
    }
  }
  save(scvsdc, scvsngs, ngsvsdc, file = 'adjustedRandKMMatrixLog')
}

#####################################################################################
#####################################################################################
############## Biclustering (NGS example)
library(biclust)

# Run Plaid biclustering ten times on the scaled log2 expression values in
# 'data.txt' and save the gene ids and the ten results to 'biclustersNGSLog'
# (objects ngsLogBiclust, genesNGSLog).
bc <- function() {
  # name of file containing expression values
  # NOTE(review): the header is consumed by header = TRUE and the first data
  # row is then dropped as well (original behaviour) - confirm that row is
  # not a real gene.
  prepared <- .prepareLogExpression('data.txt', header = TRUE)
  genesNGSLog <- prepared$genes
  ngs <- prepared$expr
  # The original assigned each run to a different variable and then saved
  # an empty object; keep every run instead.
  ngsLogBiclust <- vector("list", 10)
  for (i in seq_len(10)) {
    ngsLogBiclust[[i]] <- biclust(ngs, method = BCPlaid(),
                                  background = FALSE, max.layers = 100,
                                  iter.layer = 300, iter.startup = 10,
                                  row.release = 0.7, col.release = 0.7)
  }
  save(ngsLogBiclust, genesNGSLog, file = "biclustersNGSLog")
}
bc()

################ Analyse biclusters (NGS example) - uses biclusters saved in
################ file above (biclustersNGSLog) - computes BHI index
library(biclust)
library(clValid)

# Compute the BHI of every bicluster of every saved Plaid run for the GO
# categories CC, BP and MF, and save the three result vectors (all runs
# concatenated) to 'ngsBhiBiclust'. Requires org.Dm.eg.db to be attached.
analyse <- function() {
  # provides ngsLogBiclust and genesNGSLog, i.e. what bc() saved (the
  # original loaded 'biclustersNGS' and read ngsBiclust / genesSC)
  load('biclustersNGSLog')
  ccngsbhi <- c()
  mfngsbhi <- c()
  bpngsbhi <- c()
  GoNames <- .flybaseToEntrez()[as.character(genesNGSLog)]
  GoNames[is.na(GoNames)] <- 0 # genes without an annotation entry
  for (i in seq_along(ngsLogBiclust)) {
    ccbhi <- c()
    bpbhi <- c()
    mfbhi <- c()
    # seq_len(): a run that found no bicluster is skipped instead of failing
    for (k in seq_len(ngsLogBiclust[[i]]@Number)) {
      # rows (genes) that belong to bicluster k
      members <- which(ngsLogBiclust[[i]]@RowxNumber[, k])
      clust <- rep(k, length(members))
      genes <- GoNames[members]
      bhi <- BHI(clust, names = genes, annotation = 'org.Dm.eg.db',
                 category = 'CC')
      ccbhi <- c(ccbhi, bhi)
      bhi <- BHI(clust, names = genes, annotation = 'org.Dm.eg.db',
                 category = 'BP')
      bpbhi <- c(bpbhi, bhi)
      bhi <- BHI(clust, names = genes, annotation = 'org.Dm.eg.db',
                 category = 'MF')
      mfbhi <- c(mfbhi, bhi)
    }
    ccngsbhi <- c(ccngsbhi, ccbhi)
    mfngsbhi <- c(mfngsbhi, mfbhi)
    bpngsbhi <- c(bpngsbhi, bpbhi)
  }
  save(ccngsbhi, mfngsbhi, bpngsbhi, file = 'ngsBhiBiclust')
}
library(org.Dm.eg.db)
analyse()