#GTI.R #This algorithm is used for differential gene expression analysis #Author: Mpindi John Patrick #Date: 17/07/2009 #Requires an outlier cut-off value for each gene dataset #The outlier cut-off can be determined as the statistical outlier or as a quantile #Create a data frame for each gene with two columns #1. Column with xpression value named xpression #2. Column with the category codes reference e.g normal refered to as 1 and the target as 2. #Returns an index value for the gene whose data was supplied. #The index value can be positive or negative depending on which group the gene is expressed #so high #****** GTI Function begins here************ GTI <- function(outlier_cutoff,x){ colnames(x) <- c("xpression","exp_type") Total_count <- nrow(x) Count_aboveN <- sum(x$xpression > outlier_cutoff & x$exp_type == 1) Count_aboveC <- sum(x$xpression > outlier_cutoff & x$exp_type == 2) count_N <- sum(x$exp_type == 1) count_C <- sum(x$exp_type == 2) Avg_aboveN <- mean(x$xpression[x$xpression > outlier_cutoff & x$exp_type == 1]) Avg_aboveC <- mean(x$xpression[x$xpression > outlier_cutoff & x$exp_type == 2]) if (!is.na(Avg_aboveN)){ index_N <- (Count_aboveN/count_N)*((Avg_aboveN - outlier_cutoff)/Avg_aboveN) }else { index_N <- 0 } if (!is.na(Avg_aboveC)) { index_C <- (Count_aboveC/count_C)*((Avg_aboveC - outlier_cutoff)/Avg_aboveC) }else{ index_C = 0 } #Up-regulated genes get a positive sign while down-regulated genes get a negative sign. Tindex <- function(x,y){ if(x>y){ T_index <- round(index_C*100 - index_N*100, digits=3) }else{ T_index <- -(round(index_N*100 - index_C*100, digits=3)) } } T_index <- Tindex(index_C,index_N) } #****** GTI Function ends here************ #Function that calls GTI function generateGeneTissueIndex <- function(x, threshold=5) { if((length(x$xpression[x$exp_type==1]) >= 2) & (length(x$xpression[x$exp_type==2]) >= 1)){ allxpr <- sort(x$xpression[x$exp_type==1]) # unname to avoid some values having names that would be propagated in the final results returned from the function for GT_indexn qq <- unname(quantile(allxpr, c(0.25, 0.5, 0.75, 0.95, 0.9))) q25 <- qq[1]; q50 <- qq[2]; q75 <- qq[3]; q95 <- qq[4] #upper outlier may depend on the user defined one or set this below as the default upper_outlier <- qq[5] iqr <- q75 - q25 lower_outlier <- q25 - iqr All_data_xpr <- data.frame(x[order(x$xpression),]) #Outlier index calculation index1 <- log2(2^iqr + 2^q75) #GTI Function taking in data frame and index type. Out_index1 <- GTI(index1, All_data_xpr[,c("xpression","exp_type")]) count_all <- nrow(All_data_xpr) count_normals <- sum(All_data_xpr$exp_type == 1) count_cancers <- sum(All_data_xpr$exp_type == 2) median_all <- median(All_data_xpr$xpression) m_normals <- median(All_data_xpr$xpression[All_data_xpr$exp_type == 1]) m_cancers <- median(All_data_xpr$xpression[All_data_xpr$exp_type == 2]) mean_normals <- mean(All_data_xpr$xpression[All_data_xpr$exp_type == 1],na.rm=TRUE) mean_cancers <- mean(All_data_xpr$xpression[All_data_xpr$exp_type == 2],na.rm=TRUE) c(GT_index1 = Out_index1, count_All = count_all, count_N = count_normals, count_C = count_cancers,median_NC = median_all, median_N = m_normals,median_C = m_cancers) }else{ median_all <- median(x$xpression) m_normals <- median(x$xpression[x$exp_type == 1]) m_cancers <- median(x$xpression[x$exp_type == 2]) c(GT_index1 = NA, count_All = nrow(x), count_N = sum(x$exp_type == 1), count_C = sum(x$exp_type == 2),median_NC = median_all, median_N = m_normals,median_C = m_cancers) } } #Extract gene information from gene data one gene at a time generate_results <- function(gene_name_ensg_id, xpr_table){ tissues <- dimnames(table(xpr_table$group_id))[[1]] key_table_list <- lapply(tissues, function(tissue) data.frame(ensg_id = gene_name_ensg_id, tissue=tissue)) key_table <- do.call("rbind", key_table_list) # compute the different statistics for each tissue tissue_data_list <- lapply(tissues, function(tissue){ xpr_table <- xpr_table[as.character(xpr_table$group_id)==tissue,] xpr_table }) gene_tissue_result <- do.call("rbind", lapply(tissue_data_list, generateGeneTissueIndex)) result_table <- data.frame(key_table, gene_tissue_result) result_table } #Function to calculate GTI indices for every gene in different tissues #Groupid may contain more than one tissue e. prostate, breast etc. generateGTI <- function(genes,dataSet) { start <- proc.time() pd <- pData(dataSet) #Check if the genes exist in the expressionset. if(!is.null(genes)){ xpr <- dataSet[intersect(featureNames(dataSet), genes),] }else{ xpr <- dataSet } xprs <- log2(exprs(xpr)) genes <- unique(rownames(xprs)) gen_dat_extract <- lapply(genes, function(gene_id){ gene_data <- xpr[gene_id,] if (length(gene_data)!=0) { xpr <- as.vector(xprs[gene_id,]) exp_types <- as.numeric(pd$type) exp_types[exp_types>1] <- 2 xpr <- data.frame(exp_type = exp_types, xpression = xpr, group_id = rep("test_tissue",length(exp_types)),stringsAsFactors=FALSE) xpr <- xpr[!is.na(xpr$xpression), ] tryCatch(generate_results(gene_id,xpr), error = function(e) { print(x); NULL}) } else NULL }) gen_dat_extract print("combining data") print(proc.time() - start) # filter out entries for genes with no data and bind inner lists to a table #result: a list of tables Summary_table <- do.call("rbind", lapply(gen_dat_extract, function(x)x)) Summary_table } #Load Biobase package dataset. library(Biobase) data(sample.ExpressionSet) genes <- featureNames(sample.ExpressionSet) results_table <- generateGTI(genes,sample.ExpressionSet)