### load libraries and initialize memory library(RColorBrewer) library(R2WinBUGS) memory.limit(4000) ### Set WinBUGs path WBPath<-"C:/program files/WinBUGS14/" ### read in data pathdata<-read.table("pathologists.txt",header=T) caseID<-as.numeric(as.factor(pathdata$CASE)) caseIDkey<-cbind(sort(unique(pathdata$CASE)),1:52) pathID<-as.numeric(as.factor(pathdata$PATHIDNUMBER)) pathIDkey<-cbind(sort(unique(pathdata$PATHIDNUMBER)),1:732) grade<-pathdata$GRADE ##### define data to feed into model data<-list("pathID","caseID","grade") ##### generate initial values for chains pathprop<-matrix(0,nrow=732,ncol=3) for(i in 1:732){ sumprop<-0 for(j in 1:3){ pathprop[i,j]<-sum((pathID==i)&(grade==j)) sumprop<-sumprop+pathprop[i,j] } pathprop[i,]<-pathprop[i,]/sumprop} tumprop<-matrix(0,nrow=52,ncol=3) for(i in 1:52){ sumprop<-0 for(j in 1:3){ tumprop[i,j]<-sum((pathID==i)&(grade==j)) sumprop<-sumprop+tumprop[i,j] } tumprop[i,]<-tumprop[i,]/sumprop} inits<-function(){ pathXl<-rep(-1,732)+2*(pathprop[,1]-0.3)+runif(732,-0.1,0.1) pathYl<-rep(1,732)-2*(pathprop[,3]-0.3)+runif(732,-0.1,0.1) tummeanl<-2*(tumprop[,3]-tumprop[,1])+runif(52,-0.2,0.2) tumvarl<-runif(52,0.5,1) pathcentl<-(pathYl+pathXl)/2 pathspreadl<-abs(pathYl-pathXl) list(pathspread=pathspreadl,pathcent=pathcentl,tummean=tummeanl,tumvar=tumvarl,centtau=0.1,spreadtau=0.5,tummtau=0.5,tumvtau=2)} ####### Fit main model gc() model<-bugs(data,inits,model.file="main.bug",parameters=c("pathX","pathY","tummean","tumvar","nohg1","nohg2","nohg3"),n.chains=3,n.iter=20000,n.burnin=17500,n.thin=2,bugs.directory=WBPath) save.image() ####### Fit model again, but record simulated complete data (note that this is so large we have to split it into 4) gc() modelq1<-bugs(data,inits,model.file="quart1.bug",parameters=c("prop"),n.chains=1,n.iter=20000,n.burnin=17500,n.thin=2,bugs.directory=WBPath) save.image() gc() modelq2<-bugs(data,inits,model.file="quart2.bug",parameters=c("prop"),n.chains=1,n.iter=20000,n.burnin=17500,n.thin=2,bugs.directory=WBPath) save.image() gc() modelq3<-bugs(data,inits,model.file="quart3.bug",parameters=c("prop"),n.chains=1,n.iter=20000,n.burnin=17500,n.thin=2,bugs.directory=WBPath) save.image() gc() modelq4<-bugs(data,inits,model.file="quart4.bug",parameters=c("prop"),n.chains=1,n.iter=20000,n.burnin=17500,n.thin=2,bugs.directory=WBPath) save.image() gc() ####### As we didn't have the memory for multiple chains fit the model again on a shorter run just to check convergence modelq12<-bugs(data,inits,model.file="quart1.bug",parameters=c("prop"),n.chains=1,n.iter=10000,n.burnin=9000,n.thin=2,bugs.directory=WBPath) save.image() gc() modelq22<-bugs(data,inits,model.file="quart2.bug",parameters=c("prop"),n.chains=1,n.iter=10000,n.burnin=9000,n.thin=2,bugs.directory=WBPath) save.image() gc() modelq32<-bugs(data,inits,model.file="quart3.bug",parameters=c("prop"),n.chains=1,n.iter=10000,n.burnin=9000,n.thin=2,bugs.directory=WBPath) save.image() gc() modelq42<-bugs(data,inits,model.file="quart4.bug",parameters=c("prop"),n.chains=1,n.iter=10000,n.burnin=9000,n.thin=2,bugs.directory=WBPath) save.image() gc() ####### Obtain marginal proportions of agreement modelmarg<-bugs(data,inits,model.file="margin.bug",parameters=c("mprop"),n.chains=1,n.iter=20000,n.burnin=17500,n.thin=2,bugs.directory=WBPath) save.image() modelcheck<-bugs(data,inits,model.file="check.bug",parameters=c("hgs"),n.chains=1,n.iter=5000,n.burnin=4000,n.thin=2,bugs.directory=WBPath) save.image() ###### in the dataset 8700 of 24178 gradings (36%) were grade 2. ###### in our simulated data, for those 24178 pathologist/tumour pairings we see ###### median 8688, IQR 8635 to 8732 so no cause for alarm there. ####### Let's save a lot of the results to external files for future access sink("results1output.txt") print(model) sink() sink("results2output.txt") print(modelq1) sink() sink("results3output.txt") print(modelq2) sink() sink("results4output.txt") print(modelq3) sink() sink("results5output.txt") print(modelq4) sink() sink("results6output.txt") print(modelmarg) sink() write(margproppath,file="results7output.txt",ncolumns=1) write(pathIDkey,file="results8output.txt",ncolumns=1) write(caseIDkey,file="results9output.txt",ncolumns=1) write(rank(tomscores),file="results10output.txt",ncolumns=1) write(apply(rankdiff1,2,quantile,p=c(0.025,0.25,0.5,0.75,0.975)),file="results11output.txt",ncolumns=5) write(apply(rankdiff1,2,mean),file="results12output.txt",ncolumns=1) write(apply(rankdiff1,2,sd),file="results13output.txt",ncolumns=1) write(apply(tempgss2,2,quantile,p=c(0.025,0.25,0.5,0.75,0.975)),file="results14output.txt",ncolumns=5) write(apply(tempgss2,2,mean),file="results15output.txt",ncolumns=1) write(apply(tempgss2,2,sd),file="results16output.txt",ncolumns=1) write(apply(rankdiff2,2,quantile,p=c(0.025,0.25,0.5,0.75,0.975)),file="results17output.txt",ncolumns=5) write(apply(rankdiff2,2,mean),file="results18output.txt",ncolumns=1) write(apply(rankdiff2,2,sd),file="results19output.txt",ncolumns=1) ########## Just perform a quick check to see if the two sets of chains converged to roughly the same thing checkconv<-rep(NA,1723) for(i in 1:1723){ checkconv[i]<-(kruskal.test(c(model$sims.array[,1,i],model$sims.array[,2,i],model$sims.array[,3,i])~rep(1:3,each=1250)))$p.value } hist(checkconv) checkconv2<-rep(NA,9517) for(i in 1:9517){ checkconv2[i]<-(wilcox.test(modelq1$sims.array[,1,i],modelq12$sims.array[,1,i]))$p.value } hist(checkconv2) checkconv3<-rep(NA,9517) for(i in 1:9517){ checkconv3[i]<-(wilcox.test(modelq2$sims.array[,1,i],modelq22$sims.array[,1,i]))$p.value } hist(checkconv3) checkconv4<-rep(NA,9517) for(i in 1:9517){ checkconv4[i]<-(wilcox.test(modelq3$sims.array[,1,i],modelq32$sims.array[,1,i]))$p.value } hist(checkconv4) checkconv5<-rep(NA,9517) for(i in 1:9517){ checkconv5[i]<-(wilcox.test(modelq4$sims.array[,1,i],modelq42$sims.array[,1,i]))$p.value } hist(checkconv5) ########## Figure 1 prop<-cbind(modelq1$mean$prop,modelq2$mean$prop,modelq3$mean$prop,modelq4$mean$prop) margproppath<-apply(prop,2,mean) margproptum<-apply(prop,1,mean) #bltr layout(matrix(c(1,2,3,4,5,6),nrow=2,ncol=3,byrow=T),widths=c(0.05,0.75,0.2),heights=c(0.8,0.2)) # figure 1: print key to figure 2 par(mar=c(2,3,3,0)) image(1,1:10,t(matrix((0:9)/10+0.05,ncol=1)),col=(brewer.pal(10,"RdYlGn")),ylim=c(0.5,10.5),axes=F,ylab="",xlab="") axis(2,at=c(0:10+0.5),labels=c("0","0.1","0.2","0.3","0.4","0.5","0.6","0.7","0.8","0.9",1)) # figure 2 show heatmap of pathologist/tumour agreement par(mar=c(2,1,3,3)) image(1:732, 1:52, t(prop[order(model$mean$tummean),order(margproppath)]), col=(brewer.pal(10, "RdYlGn")), axes = FALSE, xlab = "", ylab = "") axis(4,at=c(1:52),labels=caseIDkey[order(model$mean$tummean)],las = 2, line = -0.5,tick=F) mtext("Heatmap showing probabilities that pathologists will agree with majority grading for 52 tumour samples",3,1,cex=0.8) mtext("732 pathologists ordered by marginal probability of agreeing with the majority",1,0.5,cex=0.8) # figure 3 show distributions of grades par(mar=c(2,1,3,3)) plot(c(0,732),c(-4.9,5.5),type="n",ylab="tumours ordered by latent severity",axes=F,xlim=c(0,750),ylim=c(-4.85,5.5),yaxs="i") for(i in 1:52){ rect( 0, -5.1+0.2*i,model$mean$nohg1[order(model$mean$tummean)[i]],-4.9+0.2*i,col="yellow2",border=FALSE) rect(model$mean$nohg1[order(model$mean$tummean)[i]],-5.1+0.2*i,model$mean$nohg1[order(model$mean$tummean)[i]]+model$mean$nohg2[order(model$mean$tummean)[i]],-4.9+0.2*i,col="darkgoldenrod2",border=FALSE) rect(model$mean$nohg1[order(model$mean$tummean)[i]]+model$mean$nohg2[order(model$mean$tummean)[i]],-5.1+0.2*i,model$mean$nohg1[order(model$mean$tummean)[i]]+model$mean$nohg2[order(model$mean$tummean)[i]]+model$mean$nohg3[order(model$mean$tummean)[i]],-4.9+0.2*i,col="darkgoldenrod4",border=FALSE) } for(i in 1:51){lines(c(0,730),c((-4.9+0.2*i),(-4.9+0.2*i)))} axis(1,at=c(0,732)) mtext("expected distribution of assigned grades",3,1,cex=0.8) mtext("52 tumours ordered by estimated latent severity",4,0,cex=0.8) # figure 4 dummy space filler plot(c(0,732),c(-4.9,5.5),type="n",axes=F,xlim=c(0,750),ylim=c(-4.9,5.5)) # figure 5 marginal pathologist performance par(mar=c(0.5,1,0.5,3)) plot(c(0,732),c(0.4,1),type="n",axes=F,xlim=c(0,732),xaxs="i") box() axis(2,at=c(0.4,0.5,0.6,0.7,0.8,0.9,1)) lines(c(0,43),c(0.75,0.75),lty=2,col="grey50") lines(c(0,197),c(0.8,0.8),lty=2,col="grey50") lines(c(0,10),c(0.7,0.7),lty=2,col="grey50") lines(c(0,732),c(0.9,0.9),lty=2,col="grey50") lines(c(10,10),c(0.45,0.7),lty=2,col="grey50") lines(c(43,43),c(0.45,0.75),lty=2,col="grey50") lines(c(197,197),c(0.45,0.8),lty=2,col="grey50") points(1:732,margproppath[order(margproppath)],xpd=T,col="dodgerblue4") text(10,0.45,"10th",col="palevioletred4",adj=c(0.5,1),xpd=T) text(43,0.45,"43rd",col="palevioletred4",adj=c(0.5,1),xpd=T) text(197,0.45,"197th",col="palevioletred4",adj=c(0.5,1),xpd=T) mtext("marginal probability\nof agreeing",2,3,xpd=T,cex=0.8) # figure 6 key for figure 3 par(mar=c(0.5,1,0.5,3)) plot(c(0,10),c(0,5),type="n",axes=F) rect(1,4,3,5,col="yellow2") rect(4,4,6,5,col="darkgoldenrod2") rect(7,4,9,5,col="darkgoldenrod4") text(2,3.5, "Grade 1") text(5,3.5, "Grade 2") text(8,3.5, "Grade 3") ####################### Figure 2 comparing the two approaches pathrank<-apply(t(apply(modelmarg$sims.matrix[,1:732],1,rank,ties.method="r")),2,quantile,p=c(0.1,0.5,0.9)) # calculate the agreement scores tomscores<-rep(0,732) trfnp<-rep(0,732) for(i in 1:732){ trfnp[i]<-sum(pathID==i) tempscores<-rep(0,trfnp[i]) cases<-caseID[pathID==i] tempgrades<-grade[pathID==i] for(j in 1:trfnp[i]){ tempscores[j]<-(sum(grade[caseID==cases[j]]==tempgrades[j])-1)/(sum(caseID==cases[j])-1) } tomscores[i]<-mean(tempscores) } # dubious function for uncertainty of agreement scores gensampscores2<-function(){ sampscores<-rep(0,732) for(i in 1:732){ pgrade=rep(0,52) pgrade[caseID[pathID==i]]<-grade[pathID==i] sampscores[i]<-mean(rbeta(trfnp[i],(tapply(grade==pgrade[caseID],caseID,sum))[caseID[pathID==i]],(table(caseID)-(tapply(grade==pgrade[caseID],caseID,sum)))[caseID[pathID==i]]+1)) } rank(sampscores) } calculate rank differences rankdiff1<-t(apply(modelmarg$sims.matrix[1:625,1:732],1,rank,ties.method="r")) rankdiff2<-t(apply(modelmarg$sims.matrix[626:1250,1:732],1,rank,ties.method="r")) for(i in 1:625){ rankdiff1[i,]<-rankdiff1[i,]-rank(tomscores) rankdiff2[i,]<-rankdiff2[i,]-gensampscores2() } mrd<-apply(rankdiff1,2,mean) pathdiff2<-rep(0,732) for(i in 1:732){ pathdiff2[i]<-mean(margproptum[caseID[pathID==i]]) } pathdiff2q<-(pathdiff2>quantile(pathdiff2,0.9))+(pathdiff2>quantile(pathdiff2,0.8))+(pathdiff2>quantile(pathdiff2,0.7))+(pathdiff2>quantile(pathdiff2,0.6))+(pathdiff2>quantile(pathdiff2,0.5))+(pathdiff2>quantile(pathdiff2,0.4))+(pathdiff2>quantile(pathdiff2,0.3))+(pathdiff2>quantile(pathdiff2,0.2))+(pathdiff2>quantile(pathdiff2,0.1))+1 boxplot(mrd~pathdiff2q,axes=F,xlab="pathologists by quantile of difficulty of observed samples",ylab="mean difference in ranks",col="grey50") axis(2,at=c(-400,-200,0,200,400),labels=c("does best via\nagreement score\n-400","-200","0","200","does best via\nlatent trait\n400")) axis(side=1,labels=c("saw difficult samples","middling","saw easy samples"),at=c(2,5.5,9),tick=F) box() abline(h=0,lty=2) #################### Figure 3 the scatterplot of grade boundaries win.graph() layout(matrix(c(1,2,3),nrow=1,ncol=3,byrow=T),widths=c(0.1,0.15,0.75)) par(mar=c(5.1,3,3,0)) image(1,1:10,t(matrix((0:9)/10+0.05,ncol=1)),col=(brewer.pal(10,"RdYlGn")),ylim=c(0.5,10.5),axes=F,ylab="",xlab="") axis(2,at=c(0:10+0.5),labels=c("0","0.1","0.2","0.3","0.4","0.5","0.6","0.7","0.8","0.9",1)) mtext("key",1,padj=1) par(mar=c(5.1,1,3,3)) image(1:4, 1:52, t(prop[order(model$mean$tummean),c(156,275,143,247)]), col=(brewer.pal(10, "RdYlGn")), axes = FALSE, xlab = "", ylab = "") #axis(4,at=c(0:53),labels=c("high grade",(caseIDkey[order(model$mean$tummean)]),"low grade"),las = 2, line = -0.5,tick=F) #axis(4,at=c(1,8,15.5,23,34.5,45,52),labels=c("least aggressive","grade 1","grade\nboundary","grade 2","grade\nboundary","grade3","most aggressive"),las = 2, line = -0.5,tick=F) axis(4,at=c(1,15.5,34.5,52),labels=c("least aggressive","grade\nboundary","grade\nboundary","most aggressive"),las = 2, line = -0.5,tick=F) axis(4,at=c(8,25,44),labels=c("grade 1","grade 2","grade3"),las = 0, line = -0.5,tick=F) mtext("probability of individuals\nagreeing (by tumour)",1,padj=1) mtext("A",3,at=1,line=1,cex=0.8) mtext("B",3,at=2,line=1,cex=0.8) mtext("C",3,at=3,line=1,cex=0.8) mtext("D",3,at=4,line=1,cex=0.8) par(mar=c(5.1,6.1,4.1,1.1)) plot(model$median$pathX,model$median$pathY,pch=16,xlab="pathologist's boundary for grades 1 and 2",ylab="pathologist's boundary for grades 2 and 3",cex=1) points(model$median$pathX[c(156,275,143,247)],model$median$pathY[c(156,275,143,247)],col="red",pch=16) abline(v=mean(model$median$pathX),lty=2,col="red") abline(h=mean(model$median$pathY),lty=2,col="red") temp1<-(model$median$pathX+model$median$pathY)/2 temp2<-(model$median$pathX-model$median$pathY) pathdist<-((temp2-mean(temp2))/sd(temp2))^2+((temp1-mean(temp1))/(sd(temp1)))^2 rm(temp1,temp2) text(model$median$pathX[156],model$median$pathY[156]," A",adj=0) text(model$median$pathX[275],model$median$pathY[275]," B",adj=0) text(model$median$pathX[143],model$median$pathY[143]," C",adj=0) text(model$median$pathX[247],model$median$pathY[247]," D",adj=0) ##################### Figure 4 variability of measure pathranks<-t(apply(modelmarg$sims.matrix[,1:732],1,rank,ties.method="r")) findtightest<-function(data){ data<-sort(data) ld<-length(data) ul<-floor(ld/20) temp<-rep(NA,ul) for(i in 1:ul){ temp[i]<-data[i+ceiling(19*ld/20)]-data[i] } index<-which(temp==min(temp,na.rm=T)) if(length(index)>1){ index<-index[(index-ul)^2==min((index-ul)^2)] } if(length(index)>1){ index<-index[1] } return(c(data[index],median(data),data[index+ceiling(19*ld/20)])) } pathrank2<-matrix(0,nrow=3,ncol=732) for(i in 1:732){ pathrank2[,i]<-findtightest(pathranks[,i]) } plot(pathrank2[2,order(pathrank2[2,])],pch=".",ylim=c(1,810),xlab="pathologists ordered by median rank",ylab="rank of pathologist") axis(4) axis(1,at=c(100,700),labels=c("low agreement","high agreement"),las = 0, line = -0.5,tick=F,col="blue") axis(2,at=c(100,700),labels=c("low\nagreement","high\nagreement"),las = 0, line = -0.5,tick=F,col="blue") lines(1:732,pathrank2[1,order(pathrank2[2,])],col="blue") lines(1:732,pathrank2[3,order(pathrank2[2,])],col="blue") points(1:20,rep(800,20),pch=".") lines(c(1,20),c(770,770),col="blue") text(25,800,adj=0,"median rank from latent variable analysis") text(25,770,adj=0,"credible range from latent variable analysis")