# 13 Feb 2011 # G. Robertson # Testing edgeR on miRNA-seq consensus clusters: pairwise # # Robinson MD, McCarthy DJ, Smyth GK. # edgeR: a Bioconductor package for differential expression analysis of digital gene expression data. # Bioinformatics. 2010 Jan 1;26(1):139-40. # source("/projects/remc_bigdata/TCGA/CORE/differential_abundance/CORE.edgeR.consensus-clustering.xhost09.20110213.R") ################################################### ### chunk number 1: poisson ################################################### library(edgeR) set.seed(101) cat("\nread expression matrix\n") # compare all COAD and READ to all other samples mydata <- read.table( file="/projects/remc_bigdata/TCGA/CORE/differential_abundance/expn_matrix_norm_passed.star.CORE.q80_variance.142-miRs-no-stars.forClustering.20110121.txt", header = TRUE, row.names = 1, sep="\t" ) #mydata <- read.table( file="/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/miRNA-seq/20110211/expn_matrix_norm_passed_TCGA.combined.star.q80_141-miRs.20110211.txt", header = TRUE, row.names = 1, sep="\t" ) # compare cluster 2 to cluster 1 #mydata <- read.table(file="/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/consensus_clustering/20110212/no-star_hc_spearman_q80Var_reps1000/differential_abundance/expression-matrix.clusters-2-and-1.k7.20110213.txt", header = TRUE, row.names = 1, sep="\t") # compare READ to COAD in cluster 1 #mydata <- read.table(file="/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/consensus_clustering/20110212/no-star_hc_spearman_q80Var_reps1000/differential_abundance/expression-matrix.cluster-1.k7.20110213.txt", header = TRUE, row.names = 1, sep="\t") # compare READ to COAD in cluster 2 #mydata <- read.table(file="/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/consensus_clustering/20110212/no-star_hc_spearman_q80Var_reps1000/differential_abundance/expression-matrix.cluster-2.k7.20110213.txt", header = TRUE, row.names = 1, sep="\t") mydata.matrix <- as.matrix(mydata) dim(mydata.matrix) nbr.cols <- ncol(mydata) cat("calculate library.sizes\n") mydata.lib.sizes <- colSums(mydata.matrix) dim(mydata.lib.sizes) cat("read list of cluster assignments across the sample dendrogram\n") # compare all COAD and READ to all other samples groups <- read.csv( file="/projects/remc_bigdata/TCGA/CORE/differential_abundance/group-list_all-COADandREAD-vs-other.k7.20110213.txt", header=F ) # compare cluster 2 to cluster 1 #groups <- read.csv( file="/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/consensus_clustering/20110212/no-star_hc_spearman_q80Var_reps1000/differential_abundance/group-list_cluster-2-vs-cluster-1.k7.20110213.txt", header=F ) # compare READ to COAD in cluster 1 #groups <- read.csv( file="/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/consensus_clustering/20110212/no-star_hc_spearman_q80Var_reps1000/differential_abundance/group-list_cluster-1_RE=1-vs-CO=0.k7.20110213.txt", header=F ) # compare READ to COAD in cluster 2 #groups <- read.csv( file="/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/consensus_clustering/20110212/no-star_hc_spearman_q80Var_reps1000/differential_abundance/group-list_cluster-2_RE=1-vs-CO=0.k7.20110213.txt", header=F ) mydata.groups<-as.matrix(groups) dim(mydata.groups) length(mydata.groups) cat("load the DGEList data structure\n") # This finishes almost instantly, because it's just setting up a data structure d <- DGEList( counts=mydata, group=mydata.groups, lib.size=mydata.lib.sizes ) #dP$samples # group lib.size norm.factors #TCGA.AB.2803 1 993556.1 1 #TCGA.AB.2807 0 991049.3 1 #... cat("calculate normalization factor(s)\n") # This takes only a second to run d <- calcNormFactors(d) #dP$samples # group lib.size norm.factors #TCGA.AB.2803 1 993556.1 0.5241402 #TCGA.AB.2807 0 991049.3 1.5994537 #TCGA.AB.2963 0 995427.2 0.8469909 #TCGA.AB.2826 0 993389.5 0.8838622 #... cat("write MA plot\n") par( mfrow = c(1, 2) ) maPlot( d$counts[, 1], d$counts[, 2], normalize = TRUE, pch = 19, cex = 0.4, ylim = c(-8, 8) ) grid( col = "blue" ) abline( h = log2(d$samples$norm.factors[2]/d$samples$norm.factors[1]), col = "red", lwd = 4 ) eff.libsize <- d$samples$lib.size * d$samples$norm.factors maPlot( d$counts[, 1]/eff.libsize[1], d$counts[, 2]/eff.libsize[2], normalize = FALSE, pch = 19, cex = 0.4, ylim = c(-8, 8) ) grid( col = "blue" ) cat("estimate common dispersion\n") # common dispersion d <- estimateCommonDisp(d) names(d) # [1] "samples" "common.dispersion" "counts" "pseudo.alt" "genes" "all.zeros" "conc" # [8] "common.lib.size" d$samples$lib.size d$common.lib.size # 993484.2 d$common.dispersion # [1] 1.010957 cat("run exactTest(d)\n") # Takes ~5 min to run # Using common dispersion? ######################## de.com <- exactTest(d) ######################## names(de.com) # "table" "comparison" "genes" names(de.com$table) options(digits = 4) #cat("add BH-corrected p-val to de.com$table\n") #xx <- cbind(anno[k,-5], counts[k,], id=rownames(de$table), de$table, adjp=p.adjust(de$table$p.value, method="BH") ) #de.com$table <- cbind( de.com$table, adjp=p.adjust( de.com$table$p.value, method="BH") ) sum(de.com$table$p.value < 0.01) # 43 ############################################################################## cat("calculate BH-corrected pvals for topTags\n") topN <- topTags(de.com, n=141, adjust.method="BH", sort.by="p.value") #names(topN) # "table" "adjust.method" "comparison" ############################################################################## cat("write out the pval-sorted results table\n") # compare all COAD and READ to all other samples write.table( topN$table, "/projects/remc_bigdata/TCGA/CORE/differential_abundance/diff-abundance.COADandREAD-vs-other.k7.20110213.txt",sep="\t", row.names=T, quote=FALSE ) #write.table( topN$table, "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/consensus_clustering/20110212/no-star_hc_spearman_q80Var_reps1000/differential_abundance/diff-abundance.COADandREAD-vs-other.k7.20110213.txt",sep="\t", row.names=T, quote=FALSE ) # compare cluster 2 to cluster 1 #write.table( topN$table, "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/consensus_clustering/20110212/no-star_hc_spearman_q80Var_reps1000/differential_abundance/diff-abundance.k7.cluster-2-vs-cluster-1.edgeR.20110213.txt",sep="\t", row.names=T, quote=FALSE ) # compare READ to COAD in cluster 1 #write.table( topN$table, "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/consensus_clustering/20110212/no-star_hc_spearman_q80Var_reps1000/differential_abundance/diff-abundance.k7.cluster-1_READ-vs-COAD.edgeR.20110213.txt",sep="\t", row.names=T, quote=FALSE ) # compare READ to COAD in cluster 2 #write.table( topN$table, "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/consensus_clustering/20110212/no-star_hc_spearman_q80Var_reps1000/differential_abundance/diff-abundance.k7.cluster-2_READ-vs-COAD.edgeR.20110213.txt",sep="\t", row.names=T, quote=FALSE ) ############################################################################## nbrPosLogFC <- sum(topN$table$logFC > 0) cat(nbrPosLogFC,"\n") # 44 nbrNegLogFC <- sum(topN$table$logFC < 0) cat(nbrNegLogFC,"\n") # 97 sum( p.adjust(de.com$table$p.value, method = "BH") < 0.05 ) # 45 (38 have p.adj < 0.01) mean( p.adjust(de.com$table$p.value, method = "BH") < 0.05 ) * 100 # 13.91 detagsN <- rownames( topTags(de.com, n = 141)$table ) cat("generate smear plot") plotSmear(d, de.tags = detagsN, main = "FC plot using common dispersion") abline(h = c(-2, 2), col = "dodgerblue") sessionInfo() cat("done") #> top141 #Comparison of groups: 1-0 # logConc logFC PValue FDR #hsa-mir-1269 -13.776 6.86965 1.132e-140 1.596e-138 #hsa-mir-136 -13.415 6.63290 1.095e-132 7.718e-131 #hsa-mir-431 -14.516 6.51740 7.859e-125 3.694e-123 #hsa-mir-127 -9.553 6.34624 4.816e-123 1.697e-121 #hsa-mir-381 -12.398 6.23639 8.383e-118 2.364e-116