# 01 Feb 2011 # G. Robertson # Testing edgeR on miRNA-seq consensus clusters: pairwise # k=6 consensus clusters for 173 LAML samples, q80 miRs # # Robinson MD, McCarthy DJ, Smyth GK. # edgeR: a Bioconductor package for differential expression analysis of digital gene expression data. # Bioinformatics. 2010 Jan 1;26(1):139-40. # source("/projects/remc_bigdata/TCGA/KIRC/20110728/chromophobe.edgeR.xhost09.20110728.R") ################################################### ### chunk number 1: poisson ################################################### library(edgeR) set.seed(101) cat("\nread expression matrix\n") mydata <- read.table(file="/projects/remc_bigdata/TCGA/KIRC/20110728/expn_matrix_passed_TCGA.txt", header = TRUE, row.names = 1, sep="\t", as.is=TRUE) #mydata <- read.table(file="/projects/remc_bigdata/TCGA/LAML/differential_enrichment/20110211/COMBINED_expn_matrix_norm_passed_TCGA.txt", header = TRUE, row.names = 1, sep="\t") dim(mydata) nbr.cols <- ncol(mydata) cat("calculate library.sizes\n") # Need a library.sizes list. mydata.lib.sizes <- colSums(mydata) # lib.sizes[[1]] is the sum of the first data column of 'mydata' cat("read groups list\n") #cat("read list of cluster assignments across the sample dendrogram\n") # Need to assign consensus cluster numbers to each column of mydata # From ConsensusClusterPlus, have cluster assignments. Use 'header=F' to avoid skipping the first row in the file. # "TCGA.AB.2803",1 # "TCGA.AB.2807",2 # "TCGA.AB.2963",3 mydata.groups <- read.csv( file="/projects/remc_bigdata/TCGA/KIRC/20110728/groups.csv", header=F ) #mydata.groups <- read.table( file="/projects/remc_bigdata/TCGA/KIRC/20110728/groups.csv", header=F, sep=",", as.is=TRUE) mydata.groups.matrix <- as.matrix(mydata.groups) #consensusClass <- read.csv( file="/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/LAML/data/miRNA-Seq/20110126_173-samples/consensus_clustering/no-star_km_spearman_q80Var_reps1000/no-star_km_spearman_q80Var_reps1000.k=6.consensusClass.csv", header=F) #dim(consensusClass) # 173, 2 #consensusClassClusterNbrs <- consensusClass[[2]] #cat("set the cluster of interest\n") ######################### #clusterOfInterest <- 1 ######################### # Use the next line to do the analysis on the cluster of interest vs. all other clusters #mydata.groups <- as.integer( consensusClassClusterNbrs==clusterOfInterest ) # Use the next line to do the analysis on the other clusters vs. the cluster-of-interest #mydata.groups <- as.integer( consensusClassClusterNbrs!=clusterOfInterest ) ######################### length(mydata.groups.matrix) #pairwiseLabelsFn <- function(k, lis) { as.integer(lis==k) } cat("load the DGEList data structure\n") # This finishes almost instantly, because it's just setting up a data structure d <- DGEList( counts=mydata, group=mydata.groups.matrix, lib.size=mydata.lib.sizes ) #dP$samples # group lib.size norm.factors #TCGA.AB.2803 1 993556.1 1 #TCGA.AB.2807 0 991049.3 1 #... cat("calculate normalization factor(s)\n") # This takes only a second to run d <- calcNormFactors(d) #dP$samples # group lib.size norm.factors #TCGA.AB.2803 1 993556.1 0.5241402 #TCGA.AB.2807 0 991049.3 1.5994537 #TCGA.AB.2963 0 995427.2 0.8469909 #TCGA.AB.2826 0 993389.5 0.8838622 #... #cat("write MA plot\n") #par( mfrow = c(1, 2) ) #maPlot( d$counts[, 1], d$counts[, 2], normalize = TRUE, pch = 19, cex = 0.4, ylim = c(-8, 8) ) #grid( col = "blue" ) #abline( h = log2(d$samples$norm.factors[2]/d$samples$norm.factors[1]), col = "red", lwd = 4 ) #eff.libsize <- d$samples$lib.size * d$samples$norm.factors #maPlot( d$counts[, 1]/eff.libsize[1], d$counts[, 2]/eff.libsize[2], normalize = FALSE, pch = 19, cex = 0.4, ylim = c(-8, 8) ) #grid( col = "blue" ) cat("estimate common dispersion\n") # common dispersion d <- estimateCommonDisp(d) names(d) # [1] "samples" "common.dispersion" "counts" "pseudo.alt" "genes" "all.zeros" "conc" # [8] "common.lib.size" d$samples$lib.size d$common.lib.size # 993484.2 d$common.dispersion # [1] 1.010957 cat("run exactTest(d)\n") # Takes ~5 min to run # Using common dispersion? ######################## de.com <- exactTest(d) ######################## names(de.com) # "table" "comparison" "genes" names(de.com$table) options(digits = 4) #topTags(de.com) #Comparison of groups: 1-0 # logConc logFC PValue FDR #hsa-mir-1269 -13.776 6.870 1.132e-140 1.596e-138 #hsa-mir-136 -13.415 6.633 1.095e-132 7.718e-131 #hsa-mir-431 -14.516 6.517 7.859e-125 3.694e-123 #hsa-mir-127 -9.553 6.346 4.816e-123 1.697e-121 #hsa-mir-381 -12.398 6.236 8.383e-118 2.364e-116 #hsa-mir-493 -13.745 6.246 1.782e-116 4.187e-115 #hsa-mir-654 -12.480 6.019 4.821e-109 9.710e-108 #hsa-mir-337 -13.968 6.016 3.126e-107 5.510e-106 #hsa-mir-379 -9.855 5.919 6.223e-106 9.749e-105 #hsa-mir-409 -13.555 5.865 4.194e-102 5.914e-101 #cat("add BH-corrected p-val to de.com$table\n") #xx <- cbind(anno[k,-5], counts[k,], id=rownames(de$table), de$table, adjp=p.adjust(de$table$p.value, method="BH") ) #de.com$table <- cbind( de.com$table, adjp=p.adjust( de.com$table$p.value, method="BH") ) #topTags(de.com, sort.by = "logFC") #tt <- topTags(de.com, sort.by = "pvalue") #tt <- topTags(object, n=10, adjust.method="BH", sort.by="p.value") #> d$counts[detags.com, ] #Error in d$counts : $ operator is invalid for atomic vectors sum(de.com$table$p.value < 0.01) # 43 topN <- topTags( de.com, n = 1000, adjust.method="BH", sort.by="p.value" ) #names(topN) # "table" "adjust.method" "comparison" cat("write out the pval-sorted results table\n") write.table( topN$table, "/projects/remc_bigdata/TCGA/KIRC/20110728/KIRC-chromophobe.all.20110728_12h15.txt",sep="\t", row.names=T, quote=FALSE ) #nbrPosLogFC <- sum(topN$table$logFC > 0) # 44 #nbrMegLogFC <- sum(topN$table$logFC < 0) # 97 #sum( p.adjust(de.com$table$p.value, method = "BH") < 0.05 ) # 45 (38 have p.adj < 0.01) #mean( p.adjust(de.com$table$p.value, method = "BH") < 0.05 ) * 100 # 13.91 #detagsN <- rownames( topTags(de.com, n = 141)$table ) #cat("generate smear plot") #plotSmear(d, de.tags = detagsN, main = "FC plot, common dispersion") #abline(h = c(-2, 2), col = "dodgerblue") sessionInfo() cat("done") #> top141 #Comparison of groups: 1-0 # logConc logFC PValue FDR #hsa-mir-1269 -13.776 6.86965 1.132e-140 1.596e-138 #hsa-mir-136 -13.415 6.63290 1.095e-132 7.718e-131 #hsa-mir-431 -14.516 6.51740 7.859e-125 3.694e-123 #hsa-mir-127 -9.553 6.34624 4.816e-123 1.697e-121 #hsa-mir-381 -12.398 6.23639 8.383e-118 2.364e-116