# SOFTWARE COPYRIGHT NOTICE AGREEMENT # This software is copyright 2011 by the Eli Lilly and Company and Indiana Unversity. # All rights are reserved. # # This software is supplied without any warranty or guaranteed support # whatsoever. Neither Eli Lilly and Company nor Indiana University can be responsible for # its use, misuse, or functionality. # P P E A -- Predictive Power Estimation Algorithm ########### PPEA<-function(M, class.labels, epv=2, ratio.tt=0.7, threshold.error=0.2, num.iteration=1000000, cor.stop=0.998, num.eva=20000, classifier="svm", dir.path="", file.stem="your_file_name"){ # PPEA first applies two-way bootstrapping to manage the number of genes inversely equal # to or less than the number of samples in each splitting subset used for machine learning, # and then assesses the merit of each individual feature by evaluating its strength of # class predictability under this new low dimensional sample-feature space. # # # Inputs: # M: Let M(p×n) be the expression data matrix for p genes and n samples. # class.labels: A vector of binary labels having the 1's and the 0's. # The positive class must be labeled as 1s. The names and order of # class.labels must matched with names and order of sample in M # epv: Events per variable. Use this parameter to control the ratio of sample vs. gene # ratio.tt The ratio sets for splitting the traing and testing set # threshold.error The error cut-off sets for model evaluation as failure or success # num.iteration The number of iteration for two-way sampling # cor.stop The threshold of Spearman correlation sets for stopping the iteration # num.eva An interval of iteration used for process evaluation # classifier Specify which classifier will be used for model evaluation. PAM or SVM; SVM will # run faster # dir.path The directory for saving the ppea matrix information # file.stem The stem of file name # # Outputs: # ppea.list The PPEA function will return a list that contains a PPEA matrix and a list of corresponding Spearman correlation at # different interval ofiteration. Also, the PPEA matrix will be stored at each interval in # your specified directory ("dir.path"). A part of a ppea matrix is shown below as an example # # TOT.1840000 SUC.1840000 RATIO.1840000 RANK.1840000 # RPL26 829 389 0.469240048 1 # ANXA1 894 364 0.407158837 2 # PRKX 906 247 0.272626932 3 # UNC45A 824 215 0.26092233 4 # DONSON 875 215 0.245714286 5 # # "TOT"- the total number of times of a gene that has been used for model construction # "SUC" - the number of times of a gene that contributed to a success model # "RATIO" - "SUC"/"TOT" # "RANK"- the rank of a gene is assigned based on the "RATIO" # # # # # This software is copyright 2011 by the Eli Lilly and Company and Indiana University. # All rights are reserved. # # This software is supplied without any warranty or guaranteed support # whatsoever. Neither Eli Lilly and Company nor Indiana University can be responsible for # its use, misuse, or functionality. #ptm <-proc.time() cls.lev<-unique(class.labels) cls.p<-class.labels[class.labels==1] # positive class should be "1" cls.n<-class.labels[setdiff(names(class.labels), names(cls.p))] err.lst<-NULL gene.lst<-NULL ppea.mat<-NULL cor.lst<-NULL gene.wset<-as.vector(row.names(M)) num.print=1 library(mda) for (i in 1:num.iteration){ ## sampling the sample for training and testing clsp.tr<-sample(cls.p, round(length(cls.p)*ratio.tt)) clsp.tt<-cls.p[(setdiff(names(cls.p), names(clsp.tr)))] clsn.tr<-sample(cls.n, round(length(cls.n)*ratio.tt)) clsn.tt<-cls.n[(setdiff(names(cls.n), names(clsn.tr)))] class.tr<-c(clsp.tr,clsn.tr) class.te<-c(clsp.tt,clsn.tt) ## sampling the probe prbs.r<-NULL prbs.r<-as.vector(sample(row.names(M), round(length(class.tr)*(1/epv)))) dta.tr<-M[prbs.r,names(class.tr)] dta.te<-M[prbs.r,names(class.te)] weight.train<-100/table(as.vector(class.tr)) ### if (classifier=="PAM"|classifier=="pam"){ ## use PAM for model evaluation library(pamr) library(mda) train.dat <-NULL train.dat <- list(x = as.matrix(dta.tr), y = as.factor(class.tr), genenames = row.names(dta.tr), geneid = row.names(dta.tr), sampleid = colnames(dta.tr)) cv.out<-NULL cv.out<-capture.output({ # To disable the output message from 'pamr.train' and 'pamr.cv' mod.pam <- pamr.train(train.dat, threshold.scale=weight.train) mod.cv <- pamr.cv(mod.pam, train.dat) }) Delta=0 # to find the optimized threshold min0.pos<-which(mod.cv$error==min(mod.cv$error)) min.pos<-min(min0.pos) if (mod.cv$size[min.pos]==1){ min.pos=min(min0.pos) if (mod.cv$size[min.pos]==1){ min.pos=1 } } Delta=mod.cv$threshold[min.pos] g.lst<-NULL g.out<-NULL g.out<-capture.output({ # To disable the output message from 'pamr.listgenes' function g.lst<-pamr.listgenes(mod.pam, train.dat, Delta, genenames = FALSE) }) g.lst<-list(as.vector(g.lst[,"id"])) names(g.lst)<-"GENE" res.pam<-pamr.predict(mod.pam, dta.te, Delta) res.te<-confusion(res.pam, class.te) err.s<-NULL err.s<-attr(res.te, "error") err.lst<-as.vector(c(err.lst, err.s)) gene.lst[i]<-list(g.lst) } if (classifier=="SVM"|classifier=="svm"){ ## using SVM for model evaluation library(e1071) dta.tr<-t(dta.tr) dta.te<-t(dta.te) mod.cv <- svm(dta.tr, as.factor(class.tr), kernel="linear", cross = 10,na.action = na.omit) res.svm <- predict(mod.cv, dta.te) res.te<-confusion(res.svm, as.factor(class.te)) err.s<-NULL err.s<-attr(res.te, "error") err.lst<-as.vector(c(err.lst, err.s)) g.lst<-NULL g.lst<-list(as.vector(prbs.r)) gene.lst[i]<-list(g.lst) } ## Construct PPEA Matrix ## if (i==num.print*num.eva){ print (paste("i=", i, sep="")) tb.tot<-NULL tb.tot<-table(unlist(gene.lst)) iter.pass<-NULL iter.pass<-err.lst<=threshold.error tb.suc<-NULL if (is.na(table(iter.pass)["TRUE"])){ print (paste("No Success Subset has been found at this iteration interval " , i, sep="")) tb.suc<-as.vector(rep(0, length(unlist(gene.lst)))) names(tb.suc)<-names(tb.tot) }else{ tb.suc<-table(unlist(gene.lst[iter.pass])) } match.tot<-as.vector(rep(0, length(gene.wset))) match.tot<-replace(match.tot, gene.wset%in%names(tb.tot), tb.tot) match.suc<-as.vector(rep(0, length(gene.wset))) match.suc<-replace(match.suc, gene.wset%in%names(tb.suc), tb.suc) nm.tot<-paste("TOT.", i, sep="") nm.suc<-paste("SUC.", i, sep="") nm.ratio<-paste("RATIO.", i, sep="") nm.rank<-paste("RANK.", i, sep="") ratio.ts<-NULL ratio.ts<-match.suc/match.tot rank.s<-as.vector(rep(0, length(gene.wset))) names(rank.s)<-gene.wset rank.s<-replace(ratio.ts, is.nan(ratio.ts),0) rank.s<-replace(rank.s, rev(order(rank.s)), 1:length(gene.wset)) ppea.s<-NULL ppea.s<-cbind(match.tot, match.suc, ratio.ts, rank.s) colnames(ppea.s)<-as.vector(c(nm.tot, nm.suc, nm.ratio, nm.rank)) row.names(ppea.s)<-gene.wset ppea.mat<-cbind(ppea.mat, ppea.s) if (!is.na(dir.path)){ file.o<-paste(dir.path,file.stem, "_ppea_matrix.csv", sep="") write.table(ppea.mat, file=file.o, sep=",", row.names=T) } # Calculate the Spearman correlation if (num.print>1){ cor.s<-NULL cor.s<-cor(as.vector(ppea.mat[,ncol(ppea.mat)]), as.vector(ppea.mat[,(ncol(ppea.mat)-4)]), method = c( "spearman")) names(cor.s)<-colnames(ppea.mat)[ncol(ppea.mat)] cor.lst<-c(cor.lst, cor.s) print(paste("The Spearman correlation is ", cor.s, sep="")) if(cor.s>cor.stop){ stop("The number of iterations has reached the stable stage") } } num.print<-num.print+1 } } ppea.list<-list(PPEA=ppea.mat, Correlation=cor.lst) return(ppea.list) }