# 13 Feb 2011
# G. Robertson
# Testing edgeR on miRNA-seq consensus clusters: pairwise
#
# Robinson MD, McCarthy DJ, Smyth GK. 
# edgeR: a Bioconductor package for differential expression analysis of digital gene expression data.
# Bioinformatics. 2010 Jan 1;26(1):139-40. 

# source("/projects/remc_bigdata/TCGA/CORE/differential_abundance/CORE.edgeR.consensus-clustering.xhost09.20110213.R")

###################################################
### chunk number 1: poisson
###################################################
library(edgeR)
set.seed(101)

cat("\nread expression matrix\n")
# compare all COAD and READ to all other samples
mydata <- read.table( file="/projects/remc_bigdata/TCGA/CORE/differential_abundance/expn_matrix_norm_passed.star.CORE.q80_variance.142-miRs-no-stars.forClustering.20110121.txt", header = TRUE, row.names = 1, sep="\t" )
#mydata <- read.table( file="/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/miRNA-seq/20110211/expn_matrix_norm_passed_TCGA.combined.star.q80_141-miRs.20110211.txt", header = TRUE, row.names = 1, sep="\t" )

# compare cluster 2 to cluster 1
#mydata <- read.table(file="/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/consensus_clustering/20110212/no-star_hc_spearman_q80Var_reps1000/differential_abundance/expression-matrix.clusters-2-and-1.k7.20110213.txt", header = TRUE, row.names = 1, sep="\t")
# compare READ to COAD in cluster 1
#mydata <- read.table(file="/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/consensus_clustering/20110212/no-star_hc_spearman_q80Var_reps1000/differential_abundance/expression-matrix.cluster-1.k7.20110213.txt", header = TRUE, row.names = 1, sep="\t")
# compare READ to COAD in cluster 2
#mydata <- read.table(file="/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/consensus_clustering/20110212/no-star_hc_spearman_q80Var_reps1000/differential_abundance/expression-matrix.cluster-2.k7.20110213.txt", header = TRUE, row.names = 1, sep="\t")


mydata.matrix <- as.matrix(mydata)
dim(mydata.matrix)
nbr.cols <- ncol(mydata)

cat("calculate library.sizes\n")
mydata.lib.sizes <- colSums(mydata.matrix)
dim(mydata.lib.sizes)

cat("read list of cluster assignments across the sample dendrogram\n")
# compare all COAD and READ to all other samples
groups <- read.csv( file="/projects/remc_bigdata/TCGA/CORE/differential_abundance/group-list_all-COADandREAD-vs-other.k7.20110213.txt", header=F )

# compare cluster 2 to cluster 1
#groups <- read.csv( file="/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/consensus_clustering/20110212/no-star_hc_spearman_q80Var_reps1000/differential_abundance/group-list_cluster-2-vs-cluster-1.k7.20110213.txt", header=F )
# compare READ to COAD in cluster 1
#groups <- read.csv( file="/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/consensus_clustering/20110212/no-star_hc_spearman_q80Var_reps1000/differential_abundance/group-list_cluster-1_RE=1-vs-CO=0.k7.20110213.txt", header=F )
# compare READ to COAD in cluster 2
#groups <- read.csv( file="/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/consensus_clustering/20110212/no-star_hc_spearman_q80Var_reps1000/differential_abundance/group-list_cluster-2_RE=1-vs-CO=0.k7.20110213.txt", header=F )

mydata.groups<-as.matrix(groups)
dim(mydata.groups)

length(mydata.groups)

cat("load the DGEList data structure\n")
# This finishes almost instantly, because it's just setting up a data structure
d <- DGEList( counts=mydata, group=mydata.groups, lib.size=mydata.lib.sizes )
#dP$samples
#             group lib.size norm.factors
#TCGA.AB.2803     1 993556.1            1
#TCGA.AB.2807     0 991049.3            1
#...

cat("calculate normalization factor(s)\n")
# This takes only a second to run
d <- calcNormFactors(d)
#dP$samples
#             group lib.size norm.factors
#TCGA.AB.2803     1 993556.1    0.5241402
#TCGA.AB.2807     0 991049.3    1.5994537
#TCGA.AB.2963     0 995427.2    0.8469909
#TCGA.AB.2826     0 993389.5    0.8838622
#...

cat("write MA plot\n")
par( mfrow = c(1, 2) )
maPlot( d$counts[, 1], d$counts[, 2], normalize = TRUE, pch = 19, cex = 0.4, ylim = c(-8, 8) )
grid( col = "blue" )
abline( h = log2(d$samples$norm.factors[2]/d$samples$norm.factors[1]), col = "red", lwd = 4 )
eff.libsize <- d$samples$lib.size * d$samples$norm.factors
maPlot( d$counts[, 1]/eff.libsize[1], d$counts[, 2]/eff.libsize[2], normalize = FALSE, pch = 19, cex = 0.4, ylim = c(-8, 8) )
grid( col = "blue" )


cat("estimate common dispersion\n")
# common dispersion
d <- estimateCommonDisp(d)
names(d)
# [1] "samples"           "common.dispersion" "counts"            "pseudo.alt"        "genes"             "all.zeros"         "conc"             
# [8] "common.lib.size"
d$samples$lib.size
d$common.lib.size
# 993484.2
d$common.dispersion
# [1] 1.010957


cat("run exactTest(d)\n")
# Takes ~5 min to run
# Using common dispersion?
########################
de.com <- exactTest(d)
########################
names(de.com)
# "table" "comparison" "genes"
names(de.com$table)

options(digits = 4)
#cat("add BH-corrected p-val to de.com$table\n")
#xx <- cbind(anno[k,-5], counts[k,], id=rownames(de$table), de$table, adjp=p.adjust(de$table$p.value, method="BH") )
#de.com$table <- cbind( de.com$table, adjp=p.adjust( de.com$table$p.value, method="BH") )

sum(de.com$table$p.value < 0.01)
# 43

##############################################################################
cat("calculate BH-corrected pvals for topTags\n")
topN <- topTags(de.com, n=141, adjust.method="BH", sort.by="p.value")
#names(topN)
# "table"         "adjust.method" "comparison"
##############################################################################
cat("write out the pval-sorted results table\n")
# compare all COAD and READ to all other samples
write.table( topN$table, "/projects/remc_bigdata/TCGA/CORE/differential_abundance/diff-abundance.COADandREAD-vs-other.k7.20110213.txt",sep="\t", row.names=T, quote=FALSE )
#write.table( topN$table, "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/consensus_clustering/20110212/no-star_hc_spearman_q80Var_reps1000/differential_abundance/diff-abundance.COADandREAD-vs-other.k7.20110213.txt",sep="\t", row.names=T, quote=FALSE )

# compare cluster 2 to cluster 1
#write.table( topN$table, "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/consensus_clustering/20110212/no-star_hc_spearman_q80Var_reps1000/differential_abundance/diff-abundance.k7.cluster-2-vs-cluster-1.edgeR.20110213.txt",sep="\t", row.names=T, quote=FALSE )
# compare READ to COAD in cluster 1
#write.table( topN$table, "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/consensus_clustering/20110212/no-star_hc_spearman_q80Var_reps1000/differential_abundance/diff-abundance.k7.cluster-1_READ-vs-COAD.edgeR.20110213.txt",sep="\t", row.names=T, quote=FALSE )
# compare READ to COAD in cluster 2
#write.table( topN$table, "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/Integrated_tumors/consensus_clustering/20110212/no-star_hc_spearman_q80Var_reps1000/differential_abundance/diff-abundance.k7.cluster-2_READ-vs-COAD.edgeR.20110213.txt",sep="\t", row.names=T, quote=FALSE )


##############################################################################

nbrPosLogFC <- sum(topN$table$logFC > 0)
cat(nbrPosLogFC,"\n")
# 44
nbrNegLogFC <- sum(topN$table$logFC < 0)
cat(nbrNegLogFC,"\n")
# 97
sum( p.adjust(de.com$table$p.value, method = "BH") < 0.05 )

# 45 (38 have p.adj < 0.01)
mean( p.adjust(de.com$table$p.value, method = "BH") < 0.05 ) * 100
# 13.91

detagsN <- rownames( topTags(de.com, n = 141)$table )

cat("generate smear plot")
plotSmear(d, de.tags = detagsN, main = "FC plot using common dispersion")
abline(h = c(-2, 2), col = "dodgerblue")


sessionInfo()

cat("done")
#> top141
#Comparison of groups: 1-0 
#               logConc    logFC     PValue        FDR
#hsa-mir-1269   -13.776  6.86965 1.132e-140 1.596e-138
#hsa-mir-136    -13.415  6.63290 1.095e-132 7.718e-131
#hsa-mir-431    -14.516  6.51740 7.859e-125 3.694e-123
#hsa-mir-127     -9.553  6.34624 4.816e-123 1.697e-121
#hsa-mir-381    -12.398  6.23639 8.383e-118 2.364e-116