This file includes scripts for differential expression analysis and clustering
 using R packages applied to microarray and RNA-seq data. It is distributed 
 with the paper: 
RNA-Seq vs dual- and single-channel microarray data:
sensitivity analysis for differential expression and clustering
Alina Sirbu1, Grainne Kerr, Martin Crane, Heather J. Ruskin

###################################################################
###################################################################
####################### DIFFERENTIAL EXPRESSION ANALYSIS

##################Affymetrix dataset
library(affy)
setwd('GSE5514_RAW') # path for raw data

# Load the raw arrays from the working directory and summarise them with RMA;
# `affy` is the resulting expression matrix (rows are indexed by probe-set id below).
data <- ReadAffy()
eset <- rma(data)
affy <- exprs(eset)

# file matching flybase annotations to affymetrix ids; rows containing NA are dropped
affyAnnot <- na.omit(as.matrix(read.table('affyAnnot.txt', header = TRUE)))

genes <- unique(affyAnnot[, 1])

# Append the row names as column 2: they are the keys used to pick rows of `affy`.
# The code assumes the annotation table itself has exactly one column.
affyAnnot <- cbind(affyAnnot, rownames(affyAnnot))

# One row per gene: the column-wise mean over all probe sets annotated to that gene
# (for a gene with a single probe set this is simply that probe set's row).
perGene <- lapply(genes, function(geneId) {
  probeIds <- affyAnnot[affyAnnot[, 1] == geneId, 2]
  colMeans(affy[probeIds, , drop = FALSE])
})
FBaffy <- do.call(rbind, perGene)
rownames(FBaffy) <- genes

library(limma)

# The 36 arrays are three repeats of the order 1,10,11,12,2,...,9; the factor levels
# sort numerically, so the design columns correspond to time points 1..12 in order.
timePoint <- factor(rep(c(1, 10, 11, 12, 2:9), times = 3))
design <- model.matrix(~ 0 + timePoint)
colnames(design) <- paste0('wt', 1:12)

scfit <- lmFit(FBaffy, design)
scfitt <- ebayes(scfit)

# every contrast compares a later time point against wt1
contrasts <- makeContrasts(wt1-wt2, wt1-wt3, wt1-wt4, wt1-wt5, wt1-wt6,
                           wt1-wt7, wt1-wt8, wt1-wt9, wt1-wt10,
                           levels = design)
fit1 <- contrasts.fit(scfit, contrasts)
fit2 <- eBayes(fit1)

# coef selects a column of `contrasts`: 3, 5, 7, 9 are wt1-wt4, wt1-wt6, wt1-wt8, wt1-wt10
de1 <- topTable(fit2, coef = 3, number = 20000, adjust.method = 'BH')
de2 <- topTable(fit2, coef = 5, number = 20000, adjust.method = 'BH')
de3 <- topTable(fit2, coef = 7, number = 20000, adjust.method = 'BH')
de4 <- topTable(fit2, coef = 9, number = 20000, adjust.method = 'BH')

# put the four tables side by side, all in the gene order of the first one
geneOrder <- rownames(de1)
deAll <- cbind(de1, de2[geneOrder, ], de3[geneOrder, ], de4[geneOrder, ])
write.table(deAll,'affyData.txt') # this file now contains adjusted p-values for all genes in the dataset - can be used now for selecting genes of interest, comparing to other datasets and performing clustering


#####################dual channel dataset
library(limma)
targets=readTargets() #the directory has to have Targets.txt and raw data in directory 'raw'. See limma manual for further details
RG=read.maimages(targets$FileName,source="genepix", path="raw")

# normexp background correction, followed by within- and between-array normalisation.
# The background-corrected object RGb is the one that has to be normalised; passing
# the uncorrected RG would leave the normexp step without any effect.
RGb <- backgroundCorrect(RG, method="normexp", offset=50)

MA <- normalizeWithinArrays(RGb, method="loess")
MA.q <- normalizeBetweenArrays(MA, method="Aquantile")

#find flybase ids
# One row per ID: the column-wise mean of M over all spots carrying that ID.
# which() is used so that spots with a missing ID are ignored instead of turning
# every per-gene mean into NA through NA logical indexing.
spotIds=MA.q$genes[,'ID']
genes=unique(spotIds[!is.na(spotIds)])
FB=do.call(rbind,lapply(genes,function(id)
{
	colMeans(MA.q$M[which(spotIds==id),,drop=FALSE])
}))
rownames(FB)=genes

write.table(FB,"FBExprs.txt")

design=modelMatrix(targets,ref="Ref")
fit=lmFit(FB,design)
fitt=ebayes(fit) # not used further in this script
contrasts=makeContrasts(wt2h-wt3h,wt2h-wt6h,wt2h-wt7h,wt2h-wt8h,wt2h-wt9h,wt2h-wt10h, levels=design)

fit1=contrasts.fit(fit,contrasts)
fit2=eBayes(fit1)

# coef selects a column of `contrasts`: 1, 2, 4, 6 are wt2h-wt3h, wt2h-wt6h, wt2h-wt8h, wt2h-wt10h
de1=topTable(fit2,adjust.method='BH',number=20000, coef=1)
de2=topTable(fit2,adjust.method='BH',number=20000, coef=2)
de3=topTable(fit2,adjust.method='BH',number=20000, coef=4)
de4=topTable(fit2,adjust.method='BH',number=20000, coef=6)

# tables are placed side by side in the gene order of de1
deAll=cbind(de1,de2[rownames(de1),],de3[rownames(de1),],de4[rownames(de1),])
write.table(deAll,'dualChannelData.txt') # this file now contains adjusted p-values for all genes in the dataset - can be used now for selecting genes of interest, comparing to other datasets and performing clustering
  
############## RNA-seq data

	library("DESeq")

	# condition label of each of the 43 count columns: conditions 1..12, with 3 to 5 columns each
	cond=rep(as.numeric(1:12),times=c(4,3,3,3,4,3,4,4,4,5,3,3))

	counts=read.delim("EmbryoDevel_0_24HrOver0T.counts",header=TRUE,stringsAsFactors=TRUE) #file with counts data
	rownames(counts)=counts$GeneId
	# drop the GeneId column and keep the 43 columns that follow it
	counts=counts[,2:44]

	cds=newCountDataSet(counts,cond)
	cds=estimateSizeFactors(cds)
	sizeFactors(cds)
	cds=estimateVarianceFunctions(cds)

	# test condition 1 against each of conditions 2 to 5; one result table per comparison
	res=lapply(2:5,function(other) nbinomTest(cds,1,other))

	write.table(res,'ngsData.txt') # this file now contains fold changes p-values and adjusted p-values for all genes in the dataset - can be used  for selecting genes of interest, comparing to other datasets and performing clustering


####################################################################
########## CLUSTERING
####################################################################
##############  K-Means with euclidean distance on NGS dataset
library(flexclust) 

  
# K-Means (flexclust) on the log2-transformed, column-scaled NGS expression table.
# Reads 'data.txt', clusters the genes for a range of cluster numbers and saves the
# gene labels (genesNGSLog) together with the clustering results (ngsLogEClust) in
# 'savedClusters4000KmeansNGSLog'. Takes no arguments; its result is the saved file.
euclid=function()
{
	ngs=read.table('data.txt') # file with RPKM/expression values
	ngs=as.matrix(ngs)

	# The first row is discarded (presumably the header line, as read.table is called
	# without header=TRUE - confirm against the data file). The gene labels are taken
	# AFTER this step so that genesNGSLog[r] is the label of row r of the matrix that
	# is clustered; taking them before left the labels shifted by one position.
	ngs=ngs[-1,]
	genesNGSLog=ngs[,1]
	ngs=ngs[,-1]
	ngs=apply(ngs,2,as.numeric)

	# a missing value cannot be tested for zero below, so stop with a clear message
	stopifnot(!anyNA(ngs))

	#Log transform
	# Zeros cannot be log-transformed: each zero is replaced, from left to right, by the
	# current mean of its row (which already includes earlier replacements in that row).
	# A row consisting only of zeros keeps its zeros and becomes -Inf after log2.
	for(r in seq_len(nrow(ngs)))
	{
		for(z in which(ngs[r,]==0))
		{
			ngs[r,z]=mean(ngs[r,])
		}
	}
	ngs=log2(ngs)
	ngs=scale(ngs)

	#apply K-Means with various cluster number
	# k = 5..20 in steps of 1, then 25..200 in steps of 5
	cls=c(seq(5,20,by=1),seq(25,200,by=5))
	ngsLogEClust=stepFlexclust(ngs,cls,nrep=2,save.data=TRUE,drop=FALSE,verbose=TRUE)

	save(genesNGSLog,ngsLogEClust,file="savedClusters4000KmeansNGSLog")

}

euclid()	
	 

################ Analyse K-Means clusters (NGS example) - uses clusters saved in file above (savedClusters4000KmeansNGSLog) - computes BHI index

# Computes the BHI index (clValid) of every K-Means cluster stored in
# 'savedClusters4000KmeansNGSLog', separately for the GO categories BP, MF and CC.
# The results are three lists with one element per tested cluster number, each element
# holding one BHI value per cluster; they are saved in 'KMbhiNGS'.
# Requires flexclust and org.Dm.eg.db to be attached by the caller.
clusterEBHI=function()
{
	# provides genesNGSLog and ngsLogEClust - the object names euclid() saves in this file
	load('savedClusters4000KmeansNGSLog')

	# Turn the org.Dm.egFLYBASE map around: the result is a vector whose names are the
	# mapped values (FlyBase ids) and whose elements are the original keys of the map.
	Annot= as.list(org.Dm.egFLYBASE)
	Annot=as.matrix(Annot)
	Annot=cbind(Annot,rownames(Annot))
	rownames(Annot)=Annot[,1]
	Annot=Annot[,-1]
	Annot=unlist(Annot)
	# reorder so that element r belongs to the r-th clustered gene
	Annot=Annot[as.character(genesNGSLog)]

	nK=length(ngsLogEClust@k)
	ccngsbhi=vector("list",nK)
	mfngsbhi=vector("list",nK)
	bpngsbhi=vector("list",nK)

	for(i in seq_len(nK))
	{
		clusts=clusters(ngsLogEClust[[i]])
		# label every gene with its position, used below to look it up in Annot
		names(clusts)=c(1:length(clusts))

		# clValid is attached only after clusters() has been called and is detached again
		# at the end of the iteration - presumably so that it does not mask flexclust's
		# clusters(); keep this order.
		library(clValid)
		bpbhi_=c()
		mfbhi_=c()
		ccbhi_=c()
		for(j in seq_len(ngsLogEClust@k[i]))
		{
			clust=clusts[clusts==j]
			Names=names(clust)
			GoNames=Annot[as.numeric(Names)]
			GoNames[is.na(GoNames)]=0 # genes without annotation get the dummy id 0

			bhi=BHI(clust, names=GoNames, annotation='org.Dm.eg.db',category='BP')
			bpbhi_=c(bpbhi_,bhi)
			bhi=BHI(clust, names=GoNames, annotation='org.Dm.eg.db',category='MF')
			mfbhi_=c(mfbhi_,bhi)
			bhi=BHI(clust, names=GoNames, annotation='org.Dm.eg.db',category='CC')
			ccbhi_=c(ccbhi_,bhi)
		}
		bpngsbhi[[i]]=bpbhi_
		ccngsbhi[[i]]=ccbhi_
		mfngsbhi[[i]]=mfbhi_
		detach(package:clValid)

	}

	save(ccngsbhi,mfngsbhi,bpngsbhi,file='KMbhiNGS')

}

library(flexclust)
library(org.Dm.eg.db)
 clusterEBHI()

####################### compute Davies-Bouldin index for K-Means clusters
   library(clusterSim)

	# NOTE(review): this section reads `ngs` (the clustered data matrix) and
	# `ngsLogEClust` from the workspace; both have to exist before it is run.
	# ngsDB collects one Davies-Bouldin value per tested cluster number,
	# ngsSize the corresponding cluster sizes.
	ngsDB <- c()
	ngsSize <- c()

	for (i in 1:length(ngsLogEClust@k)) {
		model <- ngsLogEClust[[i]]
		DBi <- index.DB(ngs, clusters(model))
		ngsDB <- append(ngsDB, DBi$DB)
		ngsSize[[i]] <- model@clusinfo$size
	}
######################## compute ARI index

# Pairwise comparison (comparing.Partitions, type 'crand') of the K-Means partitions of
# the three platforms. For every index a, three vectors are built that compare
#   single channel [a] vs dual channel [b]   (scvsdc)
#   single channel [a] vs NGS [b]            (scvsngs)
#   NGS [a]            vs dual channel [b]   (ngsvsdc)
# for all b. Each vector starts with a placeholder 0, so position b+1 holds the value
# for index b. The three lists are saved in 'adjustedRandKMMatrixLog'.
partitionsMatrix=function()
{
	load('savedClusters4000KmeansNGSLog') # ngs clusters
	load('savedClusters4000KmeansDC') #dual channel clusters
	load('savedClusters4000KmeansSC') #single channel clusters

	# the number of dual channel clusterings drives both loops
	nK=length(dcEClust@k)

	scvsdc=list()
	scvsngs=list()
	ngsvsdc=list()

	for(a in 1:nK)
	{
		scA=clusters(scEClust[[a]])
		ngsA=clusters(ngsLogEClust[[a]])

		rowScDc=c(0)
		rowScNgs=c(0)
		rowNgsDc=c(0)

		for(b in 1:nK)
		{
			dcB=clusters(dcEClust[[b]])
			ngsB=clusters(ngsLogEClust[[b]])

			rowScDc=c(rowScDc,comparing.Partitions(scA,dcB,type='crand'))
			rowScNgs=c(rowScNgs,comparing.Partitions(scA,ngsB,type='crand'))
			rowNgsDc=c(rowNgsDc,comparing.Partitions(ngsA,dcB,type='crand'))
		}

		scvsdc[[a]]=rowScDc
		scvsngs[[a]]=rowScNgs
		ngsvsdc[[a]]=rowNgsDc
	}

	save(scvsdc,scvsngs,ngsvsdc,file='adjustedRandKMMatrixLog')

}

#####################################################################################
#####################################################################################
##############  Biclustering (NGS example)

library(biclust)
# Plaid-model biclustering (biclust, BCPlaid) of the log2-transformed, column-scaled
# NGS expression table. The algorithm is run 10 times; the list of the 10 results
# (ngsLogBiclust) and the gene labels (genesNGS) are saved in 'biclustersNGSLog'.
# Takes no arguments; its result is the saved file.
bc=function()
{

	ngs= read.table('data.txt', header=T) # name of file containing expression values

	ngs=as.matrix(ngs)
	# NOTE(review): the first row is discarded although header=T already consumed the
	# header line - confirm that the data file really has a second non-data line.
	# The gene labels are taken AFTER this step, so that genesNGS[r] labels row r of the
	# matrix that is biclustered.
	ngs=ngs[-1,]
	genesNGS=ngs[,1]
	ngs=ngs[,-1]
	ngs=apply(ngs,2,as.numeric)

	#Log transform
	# Zeros cannot be log-transformed: each zero is replaced by the current mean of its row.
	for(i in 1:ncol(ngs))
	{
		for (j in 1:nrow(ngs))
		{
			if(ngs[j,i]==0)
			{
				ngs[j,i]=mean(ngs[j,]); 
			}
		}
	}
	ngs=apply(ngs,2,log2)
	ngs=scale(ngs)

	# Every run is stored in the list that is saved below. Previously the loop assigned
	# to a different variable, so the saved object stayed empty, and save() referred to
	# a gene vector that was never defined.
	ngsLogBiclust=vector("list",10)

	for(i in 1:10)
	{
		ngsLogBiclust[[i]]=biclust(ngs, method=BCPlaid(), background=FALSE,max.layers=100,iter.layer=300,iter.startup=10,row.release=0.7,col.release=0.7)
	}

	save(ngsLogBiclust,genesNGS,file="biclustersNGSLog")

}


bc()
 
 
################ Analyse biclusters (NGS example) - uses biclusters saved in file above (biclustersNGS) - computes BHI index

library(biclust)
library(clValid)

# Computes the BHI index (clValid) of every bicluster of every biclustering run stored
# in 'biclustersNGS', for the GO categories CC, BP and MF. The values of all runs are
# concatenated into the vectors ccngsbhi, mfngsbhi and bpngsbhi, which are saved in
# 'ngsBhiBiclust'. Requires biclust, clValid and org.Dm.eg.db to be attached.
#
# The loaded file has to provide `ngsBiclust` (list of biclust results) and `genesNGS`
# (gene labels in the row order of the biclustered matrix).
# NOTE(review): bc() in this script writes 'biclustersNGSLog' with the list named
# ngsLogBiclust - adapt the file/object names here to analyse that output.
analyse=function()
{
	load('biclustersNGS')

	ccngsbhi=c()
	mfngsbhi=c()
	bpngsbhi=c()

	# Turn the org.Dm.egFLYBASE map around: the result is a vector whose names are the
	# mapped values (FlyBase ids) and whose elements are the original keys of the map.
	Annot= as.list(org.Dm.egFLYBASE)
	Annot=as.matrix(Annot)
	Annot=cbind(Annot,rownames(Annot))
	rownames(Annot)=Annot[,1]
	Annot=Annot[,-1]
	Annot=unlist(Annot)
	# element r belongs to the r-th gene of the NGS data (this is the NGS example, so the
	# NGS gene vector is used; `genesSC` is not defined by anything loaded here)
	GoNames=Annot[as.character(genesNGS)]
	GoNames[is.na(GoNames)]=0 # genes without annotation get the dummy id 0

	for(i in seq_along(ngsBiclust))
	{
		run=ngsBiclust[[i]]
		ccbhi=c()
		bpbhi=c()
		mfbhi=c()

		# seq_len: a run that found no bicluster (Number == 0) is skipped instead of
		# being indexed with k = 1 and k = 0
		for(k in seq_len(run@Number))
		{
			# rows (genes) that belong to bicluster k
			members=which(run@RowxNumber[,k])
			clust=rep(k,length(members))
			genes=GoNames[members]

			bhi=BHI(clust, names=genes, annotation='org.Dm.eg.db',category='CC')
			ccbhi=c(ccbhi,bhi)
			bhi=BHI(clust, names=genes, annotation='org.Dm.eg.db',category='BP')
			bpbhi=c(bpbhi,bhi)
			bhi=BHI(clust, names=genes, annotation='org.Dm.eg.db',category='MF')
			mfbhi=c(mfbhi,bhi)

		}
		ccngsbhi=c(ccngsbhi,ccbhi)
		mfngsbhi=c(mfngsbhi,mfbhi)
		bpngsbhi=c(bpngsbhi,bpbhi)
	}

	save(ccngsbhi,mfngsbhi,bpngsbhi,file='ngsBhiBiclust')


}

library(org.Dm.eg.db)
 analyse()