# 01 Feb 2011
# G. Robertson
# Testing edgeR on miRNA-seq consensus clusters: pairwise
# k=6 consensus clusters for 173 LAML samples, q80 miRs
#
# Robinson MD, McCarthy DJ, Smyth GK. 
# edgeR: a Bioconductor package for differential expression analysis of digital gene expression data.
# Bioinformatics. 2010 Jan 1;26(1):139-40. 

# source("/projects/remc_bigdata/TCGA/KIRC/20110728/chromophobe.edgeR.xhost09.20110728.R")

###################################################
### chunk number 1: poisson
###################################################
library(edgeR)
set.seed(101)

cat("\nread expression matrix\n")
mydata <- read.table(file="/projects/remc_bigdata/TCGA/KIRC/20110728/expn_matrix_passed_TCGA.txt", header = TRUE, row.names = 1, sep="\t", as.is=TRUE)
#mydata <- read.table(file="/projects/remc_bigdata/TCGA/LAML/differential_enrichment/20110211/COMBINED_expn_matrix_norm_passed_TCGA.txt", header = TRUE, row.names = 1, sep="\t")
dim(mydata)
nbr.cols <- ncol(mydata)

cat("calculate library.sizes\n")
# Need a library.sizes list. 
mydata.lib.sizes <- colSums(mydata)
# lib.sizes[[1]] is the sum of the first data column of 'mydata'

cat("read groups list\n")
#cat("read list of cluster assignments across the sample dendrogram\n")
# Need to assign consensus cluster numbers to each column of mydata
# From ConsensusClusterPlus, have cluster assignments. Use 'header=F' to avoid skipping the first row in the file.
# "TCGA.AB.2803",1
# "TCGA.AB.2807",2
# "TCGA.AB.2963",3
mydata.groups <- read.csv( file="/projects/remc_bigdata/TCGA/KIRC/20110728/groups.csv", header=F )
#mydata.groups <- read.table( file="/projects/remc_bigdata/TCGA/KIRC/20110728/groups.csv", header=F, sep=",", as.is=TRUE)
mydata.groups.matrix <- as.matrix(mydata.groups)
#consensusClass <- read.csv( file="/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/LAML/data/miRNA-Seq/20110126_173-samples/consensus_clustering/no-star_km_spearman_q80Var_reps1000/no-star_km_spearman_q80Var_reps1000.k=6.consensusClass.csv", header=F)
#dim(consensusClass)  # 173, 2
#consensusClassClusterNbrs <- consensusClass[[2]]

#cat("set the cluster of interest\n")
#########################
#clusterOfInterest <- 1
#########################
# Use the next line to do the analysis on the cluster of interest vs. all other clusters
#mydata.groups <- as.integer( consensusClassClusterNbrs==clusterOfInterest )
# Use the next line to do the analysis on the other clusters vs. the cluster-of-interest
#mydata.groups <- as.integer( consensusClassClusterNbrs!=clusterOfInterest )
#########################

length(mydata.groups.matrix)
#pairwiseLabelsFn <- function(k, lis) { as.integer(lis==k) }

cat("load the DGEList data structure\n")
# This finishes almost instantly, because it's just setting up a data structure
d <- DGEList( counts=mydata, group=mydata.groups.matrix, lib.size=mydata.lib.sizes )
#dP$samples
#             group lib.size norm.factors
#TCGA.AB.2803     1 993556.1            1
#TCGA.AB.2807     0 991049.3            1
#...

cat("calculate normalization factor(s)\n")
# This takes only a second to run
d <- calcNormFactors(d)
#dP$samples
#             group lib.size norm.factors
#TCGA.AB.2803     1 993556.1    0.5241402
#TCGA.AB.2807     0 991049.3    1.5994537
#TCGA.AB.2963     0 995427.2    0.8469909
#TCGA.AB.2826     0 993389.5    0.8838622
#...


#cat("write MA plot\n")
#par( mfrow = c(1, 2) )
#maPlot( d$counts[, 1], d$counts[, 2], normalize = TRUE, pch = 19, cex = 0.4, ylim = c(-8, 8) )
#grid( col = "blue" )
#abline( h = log2(d$samples$norm.factors[2]/d$samples$norm.factors[1]), col = "red", lwd = 4 )
#eff.libsize <- d$samples$lib.size * d$samples$norm.factors
#maPlot( d$counts[, 1]/eff.libsize[1], d$counts[, 2]/eff.libsize[2], normalize = FALSE, pch = 19, cex = 0.4, ylim = c(-8, 8) )
#grid( col = "blue" )


cat("estimate common dispersion\n")
# common dispersion
d <- estimateCommonDisp(d)
names(d)
# [1] "samples"           "common.dispersion" "counts"            "pseudo.alt"        "genes"             "all.zeros"         "conc"             
# [8] "common.lib.size"
d$samples$lib.size
d$common.lib.size
# 993484.2
d$common.dispersion
# [1] 1.010957


cat("run exactTest(d)\n")
# Takes ~5 min to run
# Using common dispersion?
########################
de.com <- exactTest(d)
########################
names(de.com)
# "table" "comparison" "genes"
names(de.com$table)

options(digits = 4)
#topTags(de.com)
#Comparison of groups: 1-0 
#             logConc logFC     PValue        FDR
#hsa-mir-1269 -13.776 6.870 1.132e-140 1.596e-138
#hsa-mir-136  -13.415 6.633 1.095e-132 7.718e-131
#hsa-mir-431  -14.516 6.517 7.859e-125 3.694e-123
#hsa-mir-127   -9.553 6.346 4.816e-123 1.697e-121
#hsa-mir-381  -12.398 6.236 8.383e-118 2.364e-116
#hsa-mir-493  -13.745 6.246 1.782e-116 4.187e-115
#hsa-mir-654  -12.480 6.019 4.821e-109 9.710e-108
#hsa-mir-337  -13.968 6.016 3.126e-107 5.510e-106
#hsa-mir-379   -9.855 5.919 6.223e-106 9.749e-105
#hsa-mir-409  -13.555 5.865 4.194e-102 5.914e-101


#cat("add BH-corrected p-val to de.com$table\n")
#xx <- cbind(anno[k,-5], counts[k,], id=rownames(de$table), de$table, adjp=p.adjust(de$table$p.value, method="BH") )
#de.com$table <- cbind( de.com$table, adjp=p.adjust( de.com$table$p.value, method="BH") )

#topTags(de.com, sort.by = "logFC")
#tt <- topTags(de.com, sort.by = "pvalue")
#tt <- topTags(object, n=10, adjust.method="BH", sort.by="p.value")

#> d$counts[detags.com, ]
#Error in d$counts : $ operator is invalid for atomic vectors

sum(de.com$table$p.value < 0.01)
# 43
topN <- topTags( de.com, n = 1000, adjust.method="BH", sort.by="p.value" )
#names(topN)
# "table"         "adjust.method" "comparison"

cat("write out the pval-sorted results table\n")
write.table( topN$table, "/projects/remc_bigdata/TCGA/KIRC/20110728/KIRC-chromophobe.all.20110728_12h15.txt",sep="\t", row.names=T, quote=FALSE )

#nbrPosLogFC <- sum(topN$table$logFC > 0)
# 44
#nbrMegLogFC <- sum(topN$table$logFC < 0)
# 97
#sum( p.adjust(de.com$table$p.value, method = "BH") < 0.05 )

# 45 (38 have p.adj < 0.01)
#mean( p.adjust(de.com$table$p.value, method = "BH") < 0.05 ) * 100
# 13.91

#detagsN <- rownames( topTags(de.com, n = 141)$table )

#cat("generate smear plot")
#plotSmear(d, de.tags = detagsN, main = "FC plot, common dispersion")
#abline(h = c(-2, 2), col = "dodgerblue")


sessionInfo()

cat("done")
#> top141
#Comparison of groups: 1-0 
#               logConc    logFC     PValue        FDR
#hsa-mir-1269   -13.776  6.86965 1.132e-140 1.596e-138
#hsa-mir-136    -13.415  6.63290 1.095e-132 7.718e-131
#hsa-mir-431    -14.516  6.51740 7.859e-125 3.694e-123
#hsa-mir-127     -9.553  6.34624 4.816e-123 1.697e-121
#hsa-mir-381    -12.398  6.23639 8.383e-118 2.364e-116