# 17 Dec 2010
# From Mark Robinson, 18 October 2010
# Ongoing help from Mark...

# Run this script by e.g.
# source(file="/projects/remc_bigdata/Karsan/R/20101218/bammap2counts_DE-H4ac.20101218.R")

# source("http://bioconductor.org/biocLite.R")
# old.packages(repos=biocinstallRepos())
# update.packages(repos=biocinstallRepos(), ask=FALSE)

#install.packages("Repitools",repos="http://r-forge.r-project.org",type="source")
# biocLite("Rsamtools") 1.2.0

# biocLite("BSgenome.Hsapiens.UCSC.hg18") 834 M as a tar.gz
# biocLite("BSgenome.Mmusculus.UCSC.mm8")  747MB
# R CMD REMOVE BSgenome.Hsapiens.UCSC.hg18
# R CMD INSTALL /projects/remc_bigdata/Data/BSgenome.Hsapiens.UCSC.hg18_1.3.16.tar.gz

# load packages
#library(IRanges)
#library(Biostrings)
#library(GenomicRanges)
#library(IRanges)

library(Rsamtools)
library(Repitools)
library(edgeR)
library(BSgenome.Hsapiens.UCSC.hg18)


cat("some parameter settings\n")
# need to add a failed-chastity filter and a MAPQ filter
#p <- ScanBamParam(what=c("rname", "strand", "pos"), flag=scanBamFlag(isUnmappedQuery=FALSE,isDuplicate=FALSE))
p <- ScanBamParam(what=c( "rname", "strand", "pos", "mapq" ), flag=scanBamFlag(isUnmappedQuery=FALSE,isDuplicate=FALSE))

fragSize <- 200

# 1000, 750, 500 bp windows
#windowSize <- 2500
windowSize <- 1000
#windowSize <- 750
#windowSize <- 500

cat("fixed bins:",windowSize,"bp; fragSize:",fragSize,"bp\n")
# human
chrNames <- paste("chr", c(1:22, "X", "Y"), sep = "")
#print(paste("w=",windowSize,sep="")
#print(paste("fragSize=",fragSize,sep=""))

# file names (here, BAM format) --> list of GRanges
# HS1238 is 50 bp, HS1235 is 75 bp, both are SE reads
# Compare HS1238 then HS1235
##############################################################################
#readLen <- c(50,75)  # vector to match filenames
#filenames <- c("/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam",
#               "/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam")

##############################################################################
# 16 Dec 2010: 4 lanes of data
#readLen <- c(50,75,50,50)  # vector to match filenames
#filenames <- c("/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam",
#               "/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam",
#               "/projects/analysis/analysis5/HS1238/62P7NAAXX_2/bwa/62P7NAAXX_2.bam",
#               "/projects/analysis/analysis5/HS1235/62P7NAAXX_1/bwa/62P7NAAXX_1.bam" )
##############################################################################
# 18 Dec 2010 - only lane-2 data
readLen <- c(50,50)  # vector to match filenames
filenames <- c("/projects/analysis/analysis5/HS1238/62P7NAAXX_2/bwa/62P7NAAXX_2.bam",
               "/projects/analysis/analysis5/HS1235/62P7NAAXX_1/bwa/62P7NAAXX_1.bam")
##############################################################################

# A multivariate version of sapply. mapply applies FUN to the first elements of each ... argument, 
# the second elements, the third elements, and so on. Arguments are recycled if necessary.

cat("read in .bam files, filter reads\n")
##############################################################################
#gr <- mapply(FUN=function(u,v) {
#  sb <- scanBam(u, param=p)[[1]]
#  GRanges(seqnames=paste("chr",sb$rname,sep=""), ranges=IRanges(start=sb$pos,width=readLen), strand=sb$strand)
#},filenames,readLen)

##############################################################################
# 16 Dec, Mark Robinson: add MAPQ filter here
gr <- mapply(FUN=function(u,v) {
  sb <- scanBam(u, param=p)[[1]]
  w <- sb$mapq > 10  # SET CUTOFF HERE
  cat(u, ": Read", length(sb$rname), "mapped non-dup. reads,", round(mean(w)*100,2), "percent with MAPQ>10\n")
  GRanges( seqnames=paste("chr",sb$rname[w],sep=""), ranges=IRanges(start=sb$pos[w],width=v), strand=sb$strand[w] )
  },filenames,readLen)
##############################################################################
#names(gr) <- c("HS1238_kd","HS1235_ctl","HS1238_kd_2","HS1235_ctl_2")
#names(gr) <- c("HS1238_kd","HS1235_ctl")  # lane 1
names(gr) <- c("HS1238_kd_2","HS1235_ctl_2")  # lane 2
grl <- GRangesList(gr)

cat("create table of windows along the genome\n")
windows <- genomeBlocks(Hsapiens, chrNames, windowSize)

head(windows)
#RangedData with 6 rows and 0 value columns across 24 spaces


cat("create count table after extending reads to be length fragSize\n")
counts <- annotationBlocksCounts(grl, windows, fragSize, verbose=TRUE)
# MR: If colSums are 0, we'll have to fix the chromosome names problem.
colSums(counts)
# HS1238_kd HS1235_ctl 
#  19563732   26082144

cat("filter on the total count for a bin (10)\n")
k <- rowSums(counts) > 10  

cat("Get normalizing factors for edgeR\n")
nf <- calcNormFactors(counts[k,])

### Original way #############################################################
# 18 Dec 2010
cat("estimate common dispersion as 'd'\n")
d <- DGEList(counts=counts[k,], 
	         group=colnames(counts), 
             lib.size=colSums(counts[k,])*nf,
             genes=as.data.frame(windows)[k,1:3])
d <- estimateCommonDisp(d)  # in my example, no bio replicates, will get warning
cat(windowSize,"bp bins: d$common.dispersion=",d$common.dispersion,"\n")


##############################################################################
# Mark Robinson, 08 December 2010 - estimating a dispersion when you have no replicates
#print("estimate common dispersion as 'dhack'")
#dhack <- d <- DGEList(counts=counts[k,], 
#	         group=colnames(counts), 
#             lib.size=colSums(counts[k,])*f,
#             genes=as.data.frame(windows)[k,1:3])
#dhack$samples$group <- factor(c("A","A"))
#dhack <- estimateCommonDisp(dhack)
#d <- estimateCommonDisp(d)  # in my example, no bio replicates, will get warning
#d$common.dispersion <- dhack$common.dispersion
#cat(windowSize,"bp bins: dhack$common.dispersion=",dhack$common.dispersion,"\n")


##############################################################################
### 4 lanes, as replicates, 16 Dec 2010  #####################################
#cat("\n","process as replicates","\n")
#d.rep <- DGEList(counts=counts[k,], 
#            group=c("HS1238_kd","HS1235_ctl","HS1238_kd","HS1235_ctl"), 
#            lib.size=colSums(counts[k,])*f,
#            genes=as.data.frame(windows)[k,1:3])
#            #genes=anno[k,1:3])
#d.rep <- estimateCommonDisp(d.rep)
#cat("d.rep$common.dispersion=",d.rep$common.dispersion,"\n")  # this can be estimated now, but should be v. small - d.rep$common.dispersion= 0.06875074
##############################################################################


cat("exactTest()\n")
de <- exactTest(d, pair=c("HS1235_ctl","HS1238_kd"))
topTags(de)

xx <- cbind(as.data.frame(windows)[k,-5], counts[k,], de$table,adjp=p.adjust(de$table$p.value, method="BH") )
#xx <- cbind(anno[k,-5], counts[k,], id=rownames(de.rep$table), de.rep$table,adjp=p.adjust(de.rep$table$p.value, method="BH") )

cat("write.table, all records, regardless of p-val\n")
write.table(xx, "/projects/remc_bigdata/Karsan/R/20101220/HS1238_vs_1235.lane-2-only.mapq10-noDups.w1000.edgeR.all.20101220.txt", sep="\t", row.names=FALSE, quote=FALSE)

#cat("save.image()\n")
#save.image("Robjects_w1000_2010-12-18-Vancouver.Rdata") # save all objects in a single file


### Saturation analysis ######################################################
# Mark Robinson, 20 Dec 2010

downSamp <- function(d, fr=.5) {
  g <- rownames(d)
  newd <- matrix(0,nr=nrow(d), nc=ncol(d), dimnames=dimnames(d))
  for(j in 1:ncol(d)) {
    cat(".")
    s <- sample( rep(g,d[,j]), round(fr*sum(d[,j])) )
    #s <- sample(g, f*sum(d[,j]), prob=d[,j], replace=TRUE)
    ts <- table(s)
    newd[names(ts),j] <- ts
  }
  cat("\n")
  newd
}

print("do subsampling analysis (downsampling counts).")
nSamps <- 10              # choose number of samples to take at each proportion
subs <- seq( 0.3, 1, by=0.05 )  # choose resolution here
names(subs) <- subs

dir <- nDE <- lapply(subs, function(u) matrix(0,nrow=nrow(d$counts),ncol=nSamps))

print(date())
for(i in 1:length(subs)) {
  for(j in 1:nSamps) {
    DD <- downSamp(d$counts, subs[i])
    nfSub <- calcNormFactors(DD)
    dSub <- DGEList(counts=DD, group=colnames(counts),
                    lib.size=colSums(DD)*nfSub)
    dSub <- estimateCommonDisp(dSub)
    ### set this to select lane-1 or lane-2
    #deSub <- exactTest(dSub,pair=c("HS1235_ctl","HS1238_kd"))
    deSub <- exactTest(dSub,pair=c("HS1235_ctl_2","HS1238_kd_2"))
    ###
    nDE[[i]][,j] <- deSub$table$p.value
    dir[[i]][,j] <- sign(deSub$table$logFC)
  }
  cat("------ Downsampling at", subs[i],"finished ----------------------\n")
}
print(date())


##############################################################################
print("generate saturation boxplot for FDR<0.05")
fdr <- 0.05;
z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir)
z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir)

pdf("/projects/remc_bigdata/Karsan/R/20101220/lane2.downSampling-counts.w1000.adjP-LE-0p05.20101220.pdf",w=6,h=6);
boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02),
        col=rep(c("blue","green"),each=length(subs)),boxwex=.04,
        xlab="Percent of dataset",
        ylab="Number of TSS regions with adjP<0.05",
        names=NULL, xaxt="n",
        main="-2000/+1000bp mrgd UCSC gene TSSs (L2)",
        ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) )
axis(1)
grid(lty="dashed")
legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green"))
dev.off()