# SOFTWARE COPYRIGHT NOTICE AGREEMENT
# This software is copyright 2011 by the Eli Lilly and Company and Indiana Unversity.
# All rights are reserved.
#
# This software is supplied without any warranty or guaranteed support
# whatsoever. Neither Eli Lilly and Company nor Indiana University can be responsible for
# its use, misuse, or functionality.


# P P E A -- Predictive Power Estimation Algorithm 

###########
PPEA<-function(M, class.labels, epv=2, ratio.tt=0.7, threshold.error=0.2, num.iteration=1000000, cor.stop=0.998, 				num.eva=20000, classifier="svm", dir.path="", file.stem="your_file_name"){

# PPEA first applies two-way bootstrapping to manage the number of genes inversely equal 
# to or less than the number of samples in each splitting subset used for machine learning, 
# and then assesses the merit of each individual feature by evaluating its strength of 
# class predictability under this new low dimensional sample-feature space.	
#
#
# Inputs:
#	M: 		Let M(p×n) be the expression data matrix for p genes and n samples.
#	class.labels: 	A vector of binary labels having the 1's and the 0's. 
#			The positive class must be labeled as 1s. The names and order of 
#			class.labels must matched with names and order of sample in M
#	epv:		Events per variable. Use this parameter to control the ratio of sample vs. gene
#	ratio.tt	The ratio sets for splitting the traing and testing set
#	threshold.error	The error cut-off sets for model evaluation as failure or success 
#	num.iteration	The number of iteration for two-way sampling
#	cor.stop	The threshold of Spearman correlation sets for stopping the iteration
#	num.eva		An interval of iteration used for process evaluation
#	classifier	Specify which classifier will be used for model evaluation. PAM or SVM;  SVM will 
#			run faster
#	dir.path	The directory for saving the ppea matrix information
#	file.stem	The stem of file name
#
# Outputs:
#	ppea.list	The PPEA function will return a list that contains a PPEA matrix and a list of corresponding Spearman correlation at 
#			different interval ofiteration. Also, the PPEA matrix will be stored at each interval in 
#			your specified directory ("dir.path"). A part of a ppea matrix is shown below as an example
#
#				    TOT.1840000	SUC.1840000	RATIO.1840000	RANK.1840000
#			RPL26	829		    389		    0.469240048	    1
#			ANXA1	894		    364		    0.407158837	    2
#			PRKX	906		    247		    0.272626932	    3
#			UNC45A	824		    215		    0.26092233	    4
#			DONSON	875		    215		    0.245714286	    5
#
#			"TOT"- the total number of times of a gene that has been used for model construction 
#			"SUC" - the number of times of a gene that contributed to a success model
#			"RATIO" - "SUC"/"TOT"
#			"RANK"-  the rank of a gene is assigned based on the "RATIO"
#
#  
#
#
# This software is copyright 2011 by the Eli Lilly and Company and Indiana University.
# All rights are reserved.
#
# This software is supplied without any warranty or guaranteed support
# whatsoever. Neither Eli Lilly and Company nor Indiana University can be responsible for
# its use, misuse, or functionality.


	#ptm <-proc.time() 
	cls.lev<-unique(class.labels)
	cls.p<-class.labels[class.labels==1] # positive class should be "1"
	cls.n<-class.labels[setdiff(names(class.labels), names(cls.p))]

	err.lst<-NULL
	gene.lst<-NULL
	ppea.mat<-NULL
	cor.lst<-NULL

	gene.wset<-as.vector(row.names(M))
	num.print=1

	library(mda)

	for (i in 1:num.iteration){
		## sampling the sample for training and testing 
		clsp.tr<-sample(cls.p, round(length(cls.p)*ratio.tt))
		clsp.tt<-cls.p[(setdiff(names(cls.p), names(clsp.tr)))]
 
		clsn.tr<-sample(cls.n, round(length(cls.n)*ratio.tt))
		clsn.tt<-cls.n[(setdiff(names(cls.n), names(clsn.tr)))]

		class.tr<-c(clsp.tr,clsn.tr)
 		class.te<-c(clsp.tt,clsn.tt)

		## sampling the probe
		prbs.r<-NULL
		prbs.r<-as.vector(sample(row.names(M), round(length(class.tr)*(1/epv))))

		dta.tr<-M[prbs.r,names(class.tr)]
		dta.te<-M[prbs.r,names(class.te)]

		weight.train<-100/table(as.vector(class.tr))
		###
		if (classifier=="PAM"|classifier=="pam"){ ## use PAM for model evaluation   
			library(pamr)
			library(mda)
			train.dat <-NULL
 			train.dat <- list(x = as.matrix(dta.tr), y = as.factor(class.tr), genenames = row.names(dta.tr), 
						geneid = row.names(dta.tr), sampleid = colnames(dta.tr))
            	cv.out<-NULL
            	cv.out<-capture.output({ # To disable the output message from 'pamr.train' and 'pamr.cv'
                mod.pam <- pamr.train(train.dat, threshold.scale=weight.train)
			    mod.cv <- pamr.cv(mod.pam, train.dat)
		     })

		Delta=0
 		# to find the optimized threshold
		min0.pos<-which(mod.cv$error==min(mod.cv$error))
		min.pos<-min(min0.pos)
		if (mod.cv$size[min.pos]==1){
			min.pos=min(min0.pos)
			if (mod.cv$size[min.pos]==1){
				min.pos=1
			}
		}
		Delta=mod.cv$threshold[min.pos]

 			
           	g.lst<-NULL
            	g.out<-NULL
            	g.out<-capture.output({ # To disable the output message from 'pamr.listgenes' function
			    g.lst<-pamr.listgenes(mod.pam, train.dat, Delta, genenames = FALSE)
            	})
			g.lst<-list(as.vector(g.lst[,"id"]))
			names(g.lst)<-"GENE"
 			res.pam<-pamr.predict(mod.pam, dta.te, Delta)
 			res.te<-confusion(res.pam, class.te)

			err.s<-NULL
			err.s<-attr(res.te, "error")
			err.lst<-as.vector(c(err.lst, err.s))

			gene.lst[i]<-list(g.lst)
		}

		if (classifier=="SVM"|classifier=="svm"){ ## using SVM for model evaluation 
			library(e1071)
			dta.tr<-t(dta.tr)
			dta.te<-t(dta.te)
			mod.cv <- svm(dta.tr, as.factor(class.tr), kernel="linear", cross = 10,na.action = na.omit)
			res.svm <- predict(mod.cv, dta.te)
			res.te<-confusion(res.svm, as.factor(class.te))

			err.s<-NULL
			err.s<-attr(res.te, "error")
			err.lst<-as.vector(c(err.lst, err.s))

			g.lst<-NULL
			g.lst<-list(as.vector(prbs.r))
			gene.lst[i]<-list(g.lst)
		}

		## Construct PPEA Matrix ##
   	 	if (i==num.print*num.eva){
           	print (paste("i=", i, sep=""))
           	tb.tot<-NULL
           	tb.tot<-table(unlist(gene.lst))                        
			iter.pass<-NULL
			iter.pass<-err.lst<=threshold.error
			tb.suc<-NULL
           	if (is.na(table(iter.pass)["TRUE"])){
		        print (paste("No Success Subset has been found at this iteration interval " , i, sep=""))
             	tb.suc<-as.vector(rep(0, length(unlist(gene.lst))))
                names(tb.suc)<-names(tb.tot)
            }else{
				tb.suc<-table(unlist(gene.lst[iter.pass]))
            }
			match.tot<-as.vector(rep(0, length(gene.wset)))
			match.tot<-replace(match.tot, gene.wset%in%names(tb.tot), tb.tot)
	
			match.suc<-as.vector(rep(0, length(gene.wset)))
			match.suc<-replace(match.suc, gene.wset%in%names(tb.suc), tb.suc)

			nm.tot<-paste("TOT.", i, sep="")
			nm.suc<-paste("SUC.", i, sep="")
			nm.ratio<-paste("RATIO.", i, sep="")
			nm.rank<-paste("RANK.", i, sep="")

			ratio.ts<-NULL
			ratio.ts<-match.suc/match.tot

			rank.s<-as.vector(rep(0, length(gene.wset)))
			names(rank.s)<-gene.wset
			rank.s<-replace(ratio.ts, is.nan(ratio.ts),0)
			rank.s<-replace(rank.s, rev(order(rank.s)), 1:length(gene.wset))

			ppea.s<-NULL
			ppea.s<-cbind(match.tot, match.suc, ratio.ts, rank.s)
			colnames(ppea.s)<-as.vector(c(nm.tot, nm.suc, nm.ratio, nm.rank))
			row.names(ppea.s)<-gene.wset
			ppea.mat<-cbind(ppea.mat, ppea.s)

			if (!is.na(dir.path)){
				file.o<-paste(dir.path,file.stem, "_ppea_matrix.csv", sep="")
				write.table(ppea.mat, file=file.o, sep=",", row.names=T)
			}

			# Calculate the Spearman correlation
			if (num.print>1){
				cor.s<-NULL
				cor.s<-cor(as.vector(ppea.mat[,ncol(ppea.mat)]),
					      as.vector(ppea.mat[,(ncol(ppea.mat)-4)]), method = c( "spearman"))
				names(cor.s)<-colnames(ppea.mat)[ncol(ppea.mat)]
				cor.lst<-c(cor.lst, cor.s)

				print(paste("The Spearman correlation is ", cor.s, sep=""))

				if(cor.s>cor.stop){ 
					stop("The number of iterations has reached the stable stage")
				}
            }         	
			num.print<-num.print+1
		}
	}
	ppea.list<-list(PPEA=ppea.mat, Correlation=cor.lst)
	return(ppea.list)
}