# makeData file for SOLiD reads - run before analysisWT.R
#
# Reads every recalibration table found in `d`, tags each table with
# metadata parsed from its file name (columns "flow", "rna", "method"),
# stacks the tables into `dataDf`, and aggregates them with biasAgg().
# Objects left in the workspace for later scripts: d, f, nf, metaInf,
# dataDf, aggList, dinucDf, cycleDf.
#
# Author: samarov
###############################################################################

# NOTE(review): machine-specific absolute path; biasAgg() is defined there.
source("/Users/jzook/Documents/workspace/test1/biasAgg.R")

d <- "data/WT"
f <- dir(d)
f
# [1] "101214_WT1_FC1_PE_Quad_1_BLM1_plus_bioscope_mapall_ERCCs_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [2] "101214_WT1_FC1_PE_Quad_1_BLM1_plus_bioscope_mapall_hg18_dbsnp_132.hg18_reord_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [3] "101214_WT1_FC1_PE_Quad_1_BLM2_plus_bioscope_mapall_ERCCs_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [4] "101214_WT1_FC1_PE_Quad_1_BLM2_plus_bioscope_mapall_hg18_dbsnp_132.hg18_reord_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [5] "101214_WT1_FC1_PE_Quad_1_F3_78A_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [6] "101214_WT1_FC1_PE_Quad_1_F3_78B_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [7] "101214_WT1_FC1_PE_Quad_1_F3_EP3_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [8] "101214_WT1_FC1_PE_Quad_2_F3_EP1_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [9] "101214_WT1_FC1_PE_Quad_2_F3_EP2_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [10] "101214_WT1_FC1_PE_Quad_2_F3_EP2A_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [11] "101214_WT1_FC1_PE_Quad_3_F3_EP5_chemical_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [12] "101214_WT1_FC1_PE_Quad_3_F3_PEP_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [13] "101214_WT1_FC1_PE_Quad_4_BLM1_plus_bioscope_mapall_ERCCs_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [14] "101214_WT1_FC1_PE_Quad_4_BLM1_plus_bioscope_mapall_hg18_dbsnp_132.hg18_reord_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [15] "101214_WT1_FC1_PE_Quad_4_BLM2_plus_bioscope_mapall_ERCCs_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [16] "101214_WT1_FC1_PE_Quad_4_BLM2_plus_bioscope_mapall_hg18_dbsnp_132.hg18_reord_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [17] "101214_WT1_FC1_PE_Quad_4_F3_EP4_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [18] "101214_WT1_FC2_PE_Quad_1_BLM1_plus_bioscope_mapall_ERCCs_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [19] "101214_WT1_FC2_PE_Quad_1_BLM1_plus_bioscope_mapall_hg18_dbsnp_132.hg18_reord_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [20] "101214_WT1_FC2_PE_Quad_1_BLM2_plus_bioscope_mapall_ERCCs_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [21] "101214_WT1_FC2_PE_Quad_1_BLM2_plus_bioscope_mapall_hg18_dbsnp_132.hg18_reord_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [22] "101214_WT1_FC2_PE_Quad_1_F3_78A_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [23] "101214_WT1_FC2_PE_Quad_1_F3_78B_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [24] "101214_WT1_FC2_PE_Quad_1_F3_EP3_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [25] "101214_WT1_FC2_PE_Quad_2_F3_EP1_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [26] "101214_WT1_FC2_PE_Quad_2_F3_EP2_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [27] "101214_WT1_FC2_PE_Quad_2_F3_EP2A_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [28] "101214_WT1_FC2_PE_Quad_3_F3_EP5_chemical_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [29] "101214_WT1_FC2_PE_Quad_3_F3_PEP_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [30] "101214_WT1_FC2_PE_Quad_4_BLM1_plus_bioscope_mapall_ERCCs_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [31] "101214_WT1_FC2_PE_Quad_4_BLM1_plus_bioscope_mapall_hg18_dbsnp_132.hg18_reord_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [32] "101214_WT1_FC2_PE_Quad_4_BLM2_plus_bioscope_mapall_ERCCs_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [33] "101214_WT1_FC2_PE_Quad_4_BLM2_plus_bioscope_mapall_hg18_dbsnp_132.hg18_reord_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [34] "101214_WT1_FC2_PE_Quad_4_F3_EP4_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [35] "WT1rep_Quad1_78A_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [36] "WT1rep_Quad1_78B_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [37] "WT1rep_Quad1_BLM1plus_erccs_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [38] "WT1rep_Quad1_BLM1plus_hg18_dbsnp_132.hg18_reord_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [39] "WT1rep_Quad1_BLM2plus_erccs_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [40] "WT1rep_Quad1_BLM2plus_hg18_dbsnp_132.hg18_reord_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [41] "WT1rep_Quad1_EP3_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [42] "WT1rep_Quad2_EP1_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [43] "WT1rep_Quad2_EP2_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [44] "WT1rep_Quad2_EP2A_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [45] "WT1rep_Quad3_EP5_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [46] "WT1rep_Quad3_PEP_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [47] "WT1rep_Quad4_BLM1plus_erccs_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [48] "WT1rep_Quad4_BLM1plus_hg18_dbsnp_132.hg18_reord_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [49] "WT1rep_Quad4_BLM2plus_erccs_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [50] "WT1rep_Quad4_BLM2plus_hg18_dbsnp_132.hg18_reord_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"
# [51] "WT1rep_Quad4b_EP4_ercclowpur99skip50basepolyA_cov0_GATKbias_Dinuc_RemRefBias.table.recal_data.csv"

# Optional filters (currently disabled): keep only BLM files / drop WT1rep.
#f <- f[grep("BLM", f)]
#f <- f[-grep("WT1rep", f)]

nf <- length(f)
if (nf == 0L) {
  # Fail early with a clear message instead of trying to read a file named NA.
  stop("No input files found in '", d, "'", call. = FALSE)
}

## Collecting meta information on the next gen data
# Each file name is split on "_" and its tokens are matched EXACTLY.
# NOTE(review): any name lacking the token falls through to the second label,
# so files without "FC1" become "FC2", files without "BLM1" become "BLM2"
# (including the WT1rep "BLM1plus" files and all non-BLM files), and files
# without "ERCCs" become "hg18" (including the lower-case "erccs" WT1rep
# files). Confirm this is intended while the filters above are disabled.
metaInf <- lapply(strsplit(f, "_"), function(x) {
  fc <- if ("FC1" %in% x) "FC1" else "FC2"
  blm <- if ("BLM1" %in% x) "BLM1" else "BLM2"
  std <- if ("ERCCs" %in% x) "ERCCs" else "hg18"
  c(fc, blm, std)
})

# Read every table into a preallocated list and bind once at the end, rather
# than growing the data frame with rbind() on each iteration.
datList <- vector("list", nf)
for (i in seq_len(nf)) {
  dat <- read.csv(file.path(d, f[i]), header = TRUE,
                  stringsAsFactors = FALSE)
  met <- metaInf[[i]]
  # Append the three metadata values as constant columns.
  dat <- cbind(dat, flow = met[1], rna = met[2], method = met[3])
  datList[[i]] <- dat
}
dataDf <- do.call(rbind, datList)

dim(dataDf)
# [1] 466305 10
names(dataDf)
# [1] "ReadGroup" "QualityScore" "Cycle" "Dinuc"
# [5] "nObservations" "nMismatches" "Qempirical" "flow"
# [9] "rna" "method"

## Aggregating
aggList <- biasAgg(dataDf)

## Getting the data frame
dinucDf <- aggList$dinucDf
cycleDf <- aggList$cycleDf